changeset 22:b6b36f65664f

Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet; removed the .field attribute access from LookupList (because of setattr problems); removed fieldNames() from DataSet (it remains in FiniteWidthDataSet, where it makes sense) and added hasFields() instead. Fixed problems in asarray, and tested the previous functionality in _test_dataset.py, but not yet the new functionality.
author bengioy@esprit.iro.umontreal.ca
date Mon, 07 Apr 2008 20:44:37 -0400
parents fdf0abc490f7
children 526e192b0699
files _test_dataset.py dataset.py lookup_list.py
diffstat 3 files changed, 177 insertions(+), 173 deletions(-)
--- a/_test_dataset.py	Mon Apr 07 19:32:52 2008 -0400
+++ b/_test_dataset.py	Mon Apr 07 20:44:37 2008 -0400
@@ -50,10 +50,13 @@
 
     def test_asarray(self):
         arr = numpy.random.rand(3,4)
+        a=ArrayDataSet(data=arr,fields={"x":slice(2),"y":slice(2,4)})
+        a_arr = numpy.asarray(a)
+        self.failUnless(a_arr.shape[1] == 2 + 2)
+        self.failUnless(numpy.sum(numpy.square(a_arr-a.data))==0)
         a=ArrayDataSet(data=arr,fields={"x":slice(2),"y":slice(1,4)})
         a_arr = numpy.asarray(a)
         self.failUnless(a_arr.shape[1] == 2 + 3)
-        self.failUnless(a_arr == arr)
 
     def test_minibatch_wraparound_even(self):
         arr = numpy.random.rand(10,4)
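
A usage sketch of the asarray behaviour the updated test pins down (assuming the ArrayDataSet semantics from dataset.py below; the variable names are illustrative only): when the field slices tile distinct columns, numpy.asarray can return the underlying columns directly, but overlapping fields force a contiguous copy whose width is the sum of the field widths.

    import numpy
    from dataset import ArrayDataSet

    arr = numpy.random.rand(3, 4)
    # non-overlapping fields tile columns 0-1 and 2-3: width 2 + 2 = 4
    a = ArrayDataSet(data=arr, fields={"x": slice(2), "y": slice(2, 4)})
    assert numpy.asarray(a).shape == (3, 4)
    # "y" re-uses column 1, so asarray makes a copy of width 2 + 3 = 5
    b = ArrayDataSet(data=arr, fields={"x": slice(2), "y": slice(1, 4)})
    assert numpy.asarray(b).shape == (3, 5)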
--- a/dataset.py	Mon Apr 07 19:32:52 2008 -0400
+++ b/dataset.py	Mon Apr 07 20:44:37 2008 -0400
@@ -10,7 +10,7 @@
     A DataSet is a generator of iterators; these iterators can run through the
     examples in a variety of ways.  A DataSet need not necessarily have a finite
     or known length, so this class can be used to interface to a 'stream' which
-    feeds on-line learning.
+    feeds on-line learning. 
 
     To iterate over examples, there are several possibilities:
     - for example in dataset.zip([field1, field2,field3, ...])
@@ -19,29 +19,49 @@
     - for example in dataset
     Each of these is documented below.
 
-    Note: For a dataset of fixed and known length, which can implement item
-    random-access efficiently (e.g. indexing and slicing), and which can profit
-    from the FiniteDataSetIterator, consider using base class FiniteDataSet.
-
     Note: Fields are not mutually exclusive, i.e. two fields can overlap in their actual content.
 
     Note: The content of a field can be of any type.
 
+    Note: A dataset can recognize a potentially infinite number of field names (i.e. the field
+    values can be computed on-demand, when particular field names are used in one of the
+    iterators).
+
+    Datasets of finite length should be sub-classes of FiniteLengthDataSet.
+
+    Datasets whose elements can be indexed and sub-datasets of consecutive
+    examples (i.e. slices) can be extracted from should be sub-classes of
+    SliceableDataSet.
+
+    Datasets with a finite number of fields should be sub-classes of
+    FiniteWidthDataSet.
     """
 
     def __init__(self):
         pass
     
+    class Iter(LookupList):
+        def __init__(self, ll):
+            LookupList.__init__(self, ll.keys(), ll.values())
+            self.ll = ll
+        def __iter__(self): #makes for loop work
+            return self
+        def next(self):
+            self.ll.next()
+            self._values = [v[0] for v in self.ll._values]
+            return self
+
     def __iter__(self):
         """Supports the syntax "for i in dataset: ..."
 
         Using this syntax, "i" will be an Example instance (or equivalent) with
         all the fields of DataSet self.  Every field of "i" will give access to
         a field of a single example.  Fields should be accessible via
-        i["fielname"] or i[3] (in the fieldNames() order), but the derived class is free
+        i["fielname"] or i[3] (in the order defined by the elements of the
+        Example returned by this iterator), but the derived class is free
         to accept any type of identifier, and add extra functionality to the iterator.
         """
-        return self.zip(*self.fieldNames())
+        return DataSet.Iter(self.minibatches(None, minibatch_size = 1))
 
     def zip(self, *fieldnames):
         """
@@ -61,17 +81,7 @@
         The derived class may accept fieldname arguments of any type.
 
         """
-        class Iter(LookupList):
-            def __init__(self, ll):
-                LookupList.__init__(self, ll.keys(), ll.values())
-                self.ll = ll
-            def __iter__(self): #makes for loop work
-                return self
-            def next(self):
-                self.ll.next()
-                self._values = [v[0] for v in self.ll._values]
-                return self
-        return Iter(self.minibatches(fieldnames, minibatch_size = 1))
+        return DataSet.Iter(self.minibatches(fieldnames, minibatch_size = 1))
 
     minibatches_fieldnames = None
     minibatches_minibatch_size = 1
@@ -81,18 +91,25 @@
             minibatch_size = minibatches_minibatch_size,
             n_batches = minibatches_n_batches):
         """
-        Supports two forms of syntax:
+        Supports three forms of syntax:
+
+            for i in dataset.minibatches(None,**kwargs): ...
 
             for i in dataset.minibatches([f1, f2, f3],**kwargs): ...
 
             for i1, i2, i3 in dataset.minibatches([f1, f2, f3],**kwargs): ...
 
-        Using the first syntax, "i" will be an indexable object, such as a list,
-        tuple, or Example instance, such that on every iteration, i[0] is a
+        Using the first two syntaxes, "i" will be an indexable object, such as a list,
+        tuple, or Example instance. In both cases, i[k] is a list-like container
+        holding one field of the current batch of examples. In the second case, i[0] is a
         list-like container of the f1 field of a batch of current examples, i[1] is
         a list-like container of the f2 field, etc.
 
-        Using the second syntax, i1, i2, i3 will be list-like containers of the
+        Using the first syntax, all the fields will be returned in "i".
+        Beware that some datasets may not support this syntax, if the number
+        of fields is infinite (i.e. field values may be computed "on demand").
+
+        Using the third syntax, i1, i2, i3 will be list-like containers of the
         f1, f2, and f3 fields of a batch of examples on each loop iteration.
 
         PARAMETERS
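
A sketch of the three call forms on a hypothetical dataset d with fields "x" and "y", whose length is assumed divisible by the minibatch size:

    # first form: all fields (only valid when the field set is finite)
    for mb in d.minibatches(None, minibatch_size=2):
        print mb["x"], mb["y"]
    # second form: named fields, indexed positionally
    for mb in d.minibatches(["x", "y"], minibatch_size=2):
        print mb[0], mb[1]           # batch of "x", batch of "y"
    # third form: named fields, unpacked in the for statement
    for x, y in d.minibatches(["x", "y"], minibatch_size=2):
        print x, y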
@@ -115,35 +132,15 @@
 
         """
         raise AbstractFunction()
-    
-    def fieldNames(self):
-        #Yoshua- 
-        # This list may not be finite; what would make sense in the use you have
-        # in mind?
-        # -JB
-        #James-
-        # You are right. I had put this to be able to iterate over the fields
-        # but maybe an iterator mechanism (over fields rather than examples)
-        # would be more appropriate. Fieldnames are needed in general
-        # by the iterators over examples or minibatches, to construct
-        # examples or minibatches with the corresponding names as attributes.
-        # -YB
+
+    def hasFields(self,*fieldnames):
         """
-        Return an iterator (an object with an __iter__ method) that
-        iterates over the names of the fields. As a special cases,
-        a list or a tuple of field names can be returned.
-        """"
-        # Note that some datasets
-        # may have virtual fields and support a virtually infinite number
-        # of possible field names. In that case, fieldNames() should
-        # either raise an error or iterate over a particular set of
-        # names as appropriate. Another option would be to iterate
-        # over the sub-datasets comprising a single field at a time.
-        # I am not sure yet what is most appropriate.
-        #  -YB
+        Return true if the given field name (or field names, if multiple arguments are
+        given) is recognized by the DataSet (i.e. can be used as a field name in one
+        of the iterators).
         """
         raise AbstractFunction()
-
+        
     def rename(*new_field_specifications):
         #Yoshua- 
         # Do you mean for this to be a virtual method?
@@ -165,7 +162,7 @@
         raise AbstractFunction()
 
 
-    def apply_function(function, input_fields, output_fields, copy_inputs=True, accept_minibatches=True, cache=True):
+    def applyFunction(self,function, input_fields, output_fields, copy_inputs=True, accept_minibatches=True, cache=True):
         """
         Return a dataset that contains as fields the results of applying
         the given function (example-wise) to the specified input_fields. The
@@ -202,85 +199,43 @@
         new_fieldnames = [dct.get(f, f) for f in fieldnames]
         return self.src.minibatches(new_fieldnames, minibatches_size, n_batches)
 
-    def fieldNames(self):
-        return [dct.get(f, f) for f in self.src.fieldNames()]
-
-
-class FiniteDataSet(DataSet):
+class FiniteLengthDataSet(DataSet):
     """
-    Virtual interface, a subclass of DataSet for datasets which have a finite, known length.
-    Examples are indexed by an integer between 0 and self.length()-1,
-    and a subdataset can be obtained by slicing. This may not be appropriate in general
-    but only for datasets which can be thought of like ones that access rows AND fields
-    in an efficient random access way. Users are encouraged to expect only the generic dataset
-    interface in general. A FiniteDataSet is mainly useful when one has to obtain
-    a subset of examples (e.g. for splitting a dataset into training and test sets).
+    Virtual interface for datasets that have a finite length (number of examples),
+    and thus recognize a len(dataset) call.
     """
+    def __init__(self):
+        DataSet.__init__(self)
 
-    class FiniteDataSetIterator(object):
-        """
-        If the fieldnames list is empty, it means that we want to see ALL the fields.
-        """
-        def __init__(self,dataset,minibatch_size=1,fieldnames=[]):
-            self.dataset=dataset
-            self.minibatch_size=minibatch_size
-            assert minibatch_size>=1 and minibatch_size<=len(dataset)
-            self.current = -self.minibatch_size
-            self.fieldnames = fieldnames
-            if len(dataset) % minibatch_size:
-                raise NotImplementedError()
-
-        def __iter__(self):
-            return self
+    def __len__(self):
+        """len(dataset) returns the number of examples in the dataset."""
+        raise AbstractFunction()
+    
+                 
+class SliceableDataSet(DataSet):
+    """
+    Virtual interface, a subclass of DataSet for datasets which are sliceable
+    and whose individual elements can be accessed, generally respecting the
+    python semantics for [spec], where spec is either a non-negative integer
+    (for selecting one example), or a python slice (for selecting a sub-dataset
+    comprising the specified examples). This is useful for obtaining
+    sub-datasets, e.g. for splitting a dataset into training and test sets.
+    """
+    def __init__(self):
+        DataSet.__init__(self)
         
-        def next(self):
-            self.current+=self.minibatch_size
-            if self.current>=len(self.dataset):
-                self.current=-self.minibatch_size
-                raise StopIteration
-            if self.minibatch_size==1:
-                complete_example=self.dataset[self.current]
-            else:
-                complete_example=self.dataset[self.current:self.current+self.minibatch_size]
-            if self.fieldnames:
-                return Example(self.fieldnames,list(complete_example))
-            else:
-                return complete_example
-
-    def __init__(self):
-        pass
-
     def minibatches(self,
             fieldnames = DataSet.minibatches_fieldnames,
             minibatch_size = DataSet.minibatches_minibatch_size,
             n_batches = DataSet.minibatches_n_batches):
         """
-        If the fieldnames list is empty, it means that we want to see ALL the fields.
-
         If n_batches is None, we want to see all the examples possible
-        for the give minibatch_size.
+        for the given minibatch_size (possibly missing a few at the end of the dataset).
         """
         # substitute the defaults:
-        if fieldnames is None: fieldnames = self.fieldNames()
         if n_batches is None: n_batches = len(self) / minibatch_size
         return DataSet.Iterator(self, fieldnames, minibatch_size, n_batches)
 
-    def __getattr__(self,fieldname):
-        """Return an that can iterate over the values of the field in this dataset."""
-        return self(fieldname)
-
-    def __call__(self,*fieldnames):
-        """Return a sub-dataset containing only the given fieldnames as fields.
-        
-        The return value's default iterator will iterate only over the given
-        fields.
-        """
-        raise AbstractFunction()
-
-    def __len__(self):
-        """len(dataset) returns the number of examples in the dataset."""
-        raise AbstractFunction()
-    
     def __getitem__(self,i):
         """dataset[i] returns the (i+1)-th example of the dataset."""
         raise AbstractFunction()
@@ -289,6 +244,28 @@
         """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1."""
         raise AbstractFunction()
 
+
+class FiniteWidthDataSet(DataSet):
+    """
+    Virtual interface for datasets that have a finite width (number of fields),
+    and thus can return the list of field names via a fieldNames() method.
+    """
+    def __init__(self):
+        DataSet.__init__(self)
+
+    def hasFields(self,*fieldnames):
+        has_fields=True
+        for fieldname in fieldnames:
+            if fieldname not in self.fieldNames():
+                has_fields=False
+        return has_fields
+                
+    def fieldNames(self):
+        """Return the list of field names that are supported by the iterators,
+        and for which hasFields(fieldname) would return True."""
+        raise AbstractFunction()
+
+
 # we may want ArrayDataSet defined in another python file
 
 import numpy
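
To make the division of labour between the three virtual interfaces concrete, a sketch using the ArrayDataSet defined below (which subclasses all three), under the assumption that it implements the slicing interface:

    d = ArrayDataSet(data=numpy.ones((5, 4)),
                     fields={"input": slice(3), "target": slice(3, 4)})
    assert len(d) == 5                     # FiniteLengthDataSet
    train, test = d[0:4], d[4:5]           # SliceableDataSet: consecutive sub-datasets
    assert d.hasFields("input", "target")  # FiniteWidthDataSet
    assert sorted(d.fieldNames()) == ["input", "target"]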
@@ -326,7 +303,7 @@
     #  - decide what to do with extra dimensions if not the same in all fields
     #  - try to see if we can avoid the copy?
 
-class ArrayDataSet(FiniteDataSet):
+class ArrayDataSet(FiniteLengthDataSet,FiniteWidthDataSet,SliceableDataSet):
     """
     An ArrayDataSet behaves like a numpy array but adds the notion of named fields
     from DataSet (and the ability to view the values of multiple fields as an 'Example').
@@ -342,7 +319,8 @@
     class Iterator(LookupList):
         """An iterator over a finite dataset that implements wrap-around"""
         def __init__(self, dataset, fieldnames, minibatch_size, next_max):
-            LookupList.__init__(self, fieldnames, [0] * len(fieldnames))
+            if fieldnames is None: fieldnames = dataset.fieldNames()
+            LookupList.__init__(self, fieldnames, [0]*len(fieldnames))
             self.dataset=dataset
             self.minibatch_size=minibatch_size
             self.next_count = 0
@@ -392,10 +370,8 @@
                 assert upper > 0
                 dataview = self.matcat(dataview, data[:upper])
 
-
             self._values = [dataview[:, self.dataset.fields[f]]\
                     for f in self._names]
-
             return self
 
 
@@ -429,13 +405,12 @@
             minibatch_size = DataSet.minibatches_minibatch_size,
             n_batches = DataSet.minibatches_n_batches):
         """
-        If the fieldnames list is empty, it means that we want to see ALL the fields.
+        If the fieldnames list is None, it means that we want to see ALL the fields.
 
-        If the n_batches is empty, we want to see all the examples possible
-        for the give minibatch_size.
+        If the n_batches is None, we want to see all the examples possible
+        for the given minibatch_size (possibly missing some near the end).
         """
         # substitute the defaults:
-        if fieldnames is None: fieldnames = self.fieldNames()
         if n_batches is None: n_batches = len(self) / minibatch_size
         return ArrayDataSet.Iterator(self, fieldnames, minibatch_size, n_batches)
 
@@ -462,7 +437,7 @@
         return ArrayDataSet(self.data[:,min_col:max_col],fields=new_fields)
 
     def fieldNames(self):
-        """Return the list of field names that are supported by getattr and getFields."""
+        """Return the list of field names that are supported by getattr and hasField."""
         return self.fields.keys()
 
     def __len__(self):
@@ -502,45 +477,48 @@
             return self.data
         # else, select subsets of columns mapped by the fields
         columns_used = numpy.zeros((self.data.shape[1]),dtype=bool)
+        overlapping_fields = False
+        n_columns = 0
         for field_slice in self.fields.values():
             for c in xrange(field_slice.start,field_slice.stop,field_slice.step):
+                n_columns += 1
+                if columns_used[c]: overlapping_fields=True
                 columns_used[c]=True
         # try to figure out if we can map all the slices into one slice:
-        mappable_to_one_slice = True
-        start=0
-        while start<len(columns_used) and not columns_used[start]:
-            start+=1
-        stop=len(columns_used)
-        while stop>0 and not columns_used[stop-1]:
-            stop-=1
-        step=0
-        i=start
-        while i<stop:
-            j=i+1
-            while j<stop and not columns_used[j]:
-                j+=1
-            if step:
-                if step!=j-i:
-                    mappable_to_one_slice = False
-                    break
-            else:
-                step = j-i
-            i=j
+        mappable_to_one_slice = not overlapping_fields
+        if not overlapping_fields:
+            start=0
+            while start<len(columns_used) and not columns_used[start]:
+                start+=1
+            stop=len(columns_used)
+            while stop>0 and not columns_used[stop-1]:
+                stop-=1
+            step=0
+            i=start
+            while i<stop:
+                j=i+1
+                while j<stop and not columns_used[j]:
+                    j+=1
+                if step:
+                    if step!=j-i:
+                        mappable_to_one_slice = False
+                        break
+                else:
+                    step = j-i
+                i=j
         if mappable_to_one_slice:
             return self.data[:,slice(start,stop,step)]
-        # else make contiguous copy
-        n_columns = sum(columns_used)
-        result = zeros((len(self.data),n_columns)+self.data.shape[2:],self.data.dtype)
-        print result.shape
+        # else make contiguous copy (copying the overlapping columns)
+        result = numpy.zeros((len(self.data),n_columns)+self.data.shape[2:],self.data.dtype)
         c=0
         for field_slice in self.fields.values():
-            slice_width=field_slice.stop-field_slice.start/field_slice.step
+            slice_width=(field_slice.stop-field_slice.start)/field_slice.step
             # copy the field here
-            result[:,slice(c,slice_width)]=self.data[:,field_slice]
+            result[:,slice(c,c+slice_width)]=self.data[:,field_slice]
             c+=slice_width
         return result
 
-class ApplyFunctionDataset(DataSet):
+class ApplyFunctionDataSet(DataSet):
     """
     A dataset that contains as fields the results of applying
     a given function (example-wise) to specified input_fields of a source
@@ -583,17 +561,17 @@
                     fieldnames = DataSet.minibatches_fieldnames,
                     minibatch_size = DataSet.minibatches_minibatch_size,
                     n_batches = DataSet.minibatches_n_batches):
-
+        
         class Iterator(LookupList):
 
             def __init__(self,dataset):
-                LookupList.__init__(self, fieldnames, [0]*len(fieldnames))
+                if fieldnames is None:
+                    LookupList.__init__(self, [],[])
+                else:
+                    LookupList.__init__(self, fieldnames, [0]*len(fieldnames))
                 self.dataset=dataset
-                if dataset.copy_inputs:
-                    src_fields=dataset.fieldNames()
-                else:
-                    src_fields=dataset.input_fields
-                self.src_iterator=self.src.minibatches(src_fields,minibatch_size,n_batches)
+                self.src_iterator=dataset.src.minibatches(list(set(fieldnames or []) | set(dataset.input_fields)),
+                                                          minibatch_size,n_batches)
                                                        
             def __iter__(self):
                 return self
@@ -603,12 +581,15 @@
                 if self.dataset.copy_inputs:
                     function_inputs = src_examples
                 else:
-                    function_inputs = 
-                    [src_examples[field_name] for field_name in self.dataset.input_fields])
-                return self.dataset.function(*function_inputs)
+                    function_inputs = [src_examples[field_name] for field_name in self.dataset.input_fields]
+                outputs = Example(self.dataset.output_fields,self.dataset.function(*function_inputs))
+                if self.dataset.copy_inputs:
+                    return src_examples + outputs
+                else:
+                    return outputs
 
         for fieldname in fieldnames:
-            assert fieldname in self.input_fields
+            assert fieldname in self.output_fields or self.src.hasFields(fieldname)
         return Iterator(self)
 
     
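
A hedged usage sketch of the function-application path, assuming applyFunction is implemented on the source dataset to return an ApplyFunctionDataSet (the diff does not show that wiring) and that the function receives minibatch arrays:

    # derive a dataset whose "x_squared" field is computed from "x"
    def square(x):
        return x * x

    d2 = d.applyFunction(square, input_fields=["x"], output_fields=["x_squared"],
                         copy_inputs=True, accept_minibatches=True, cache=True)
    for example in d2.zip("x_squared"):
        print example["x_squared"]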
--- a/lookup_list.py	Mon Apr 07 19:32:52 2008 -0400
+++ b/lookup_list.py	Mon Apr 07 20:44:37 2008 -0400
@@ -1,3 +1,5 @@
+
+from copy import copy
 
 class LookupList(object):
     """
@@ -5,13 +7,18 @@
     a dictionary the order of the elements depends not on their key but
     on the order given by the user through construction) so that
     following syntactic constructions work as one would expect:
-       example = Example(['x','y','z'],[1,2,3])
-       example.x = [1, 2, 3] # set or change a field
+       example = LookupList(['x','y','z'],[1,2,3])
+       example['x'] = [1, 2, 3] # set or change a field
        x, y, z = example
        x = example[0]
        x = example["x"]
-       print example.keys() # returns ['x','y','z']
-       print example.values() # returns [[1,2,3],2,3]
+       print example.keys() # prints ['x','y','z']
+       print example.values() # prints [[1,2,3],2,3]
+       print example.items() # prints [('x',[1,2,3]),('y',2),('z',3)]
+       example.append_keyval('u',0) # adds item with name 'u' and value 0
+       print len(example) # number of items = 4 here
+       example2 = LookupList(['v','w'],[4,5])
+       print example+example2 # addition is like for lists, a concatenation of the items (names must not overlap)
+    Note that the element names should be unique.
     """
     def __init__(self,names=[],values=[]):
         assert len(values)==len(names)
@@ -19,6 +26,7 @@
         self.__dict__['_name2index']={}
         self.__dict__['_names']=names
         for i in xrange(len(values)):
+            assert names[i] not in self._name2index
             self._name2index[names[i]]=i
 
     def keys(self):
@@ -28,6 +36,9 @@
         return self._values
 
     def items(self):
+        """
+        Return a list of (name,value) pairs of all the items in the look-up list.
+        """
         return zip(self._names,self._values)
     
     def __getitem__(self,key):
@@ -47,15 +58,10 @@
             if key in self._name2index:
                 self._values[self._name2index[key]]=value
             else:
-                raise KeyError(key)
-
-    def __getattr__(self,name):
-        try:
-            return self._values[self._name2index[name]]
-        except KeyError, e:
-            raise AttributeError(name)
-
+                self.append_keyval(key,value)
+            
     def append_keyval(self, key, value):
+        assert key not in self._name2index
         self._name2index[key]=len(self)
         self._values.append(value)
         self._names.append(key)
@@ -65,3 +71,17 @@
 
     def __repr__(self):
         return "{%s}" % ", ".join([str(k) + "=" + repr(v) for k,v in self.items()])
+
+    def __add__(self,rhs):
+        new_example = LookupList(copy(self._names),copy(self._values)) # fresh lists: appending below must not modify self
+        for item in rhs.items():
+            new_example.append_keyval(item[0],item[1])
+        return new_example
+
+    def __radd__(self,lhs):
+        new_example = LookupList(copy(lhs._names),copy(lhs._values)) # fresh lists, as in __add__
+        for item in self.items():
+            new_example.append_keyval(item[0],item[1])
+        return new_example
+
+
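
A short sketch of the new concatenation behaviour (the names of the two operands must be disjoint, as enforced by the assertion in append_keyval):

    from lookup_list import LookupList

    ab = LookupList(['a', 'b'], [1, 2])
    cd = LookupList(['c', 'd'], [3, 4])
    abcd = ab + cd           # item-wise concatenation
    print abcd.keys()        # prints ['a', 'b', 'c', 'd']
    print abcd.values()      # prints [1, 2, 3, 4]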