changeset 7:6f8f338686db

Moved the iteration counter into a FiniteDataSetIterator to allow nested iterations and multiple threads iterating at the same time over a dataset.
author bengioy@bengiomac.local
date Mon, 24 Mar 2008 13:20:15 -0400
parents d5738b79089a
children d1c394486037
files _test_dataset.py dataset.py
diffstat 2 files changed, 100 insertions(+), 83 deletions(-)
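The pattern adopted here separates iteration state from the container: before this
change, __iter__ returned the dataset itself and next() advanced a single cursor
stored on the dataset, so nested loops or concurrent threads stepped on each other's
position. A minimal sketch of the before/after behavior, in the Python 2 iterator
idiom of this codebase (hypothetical class names, not code from this repository):

    # Before: the container is its own iterator, with one shared cursor.
    class SharedCursor(object):
        def __init__(self,rows):
            self.rows=rows
            self.current=-1
        def __iter__(self):
            return self               # every loop shares self.current
        def next(self):
            self.current+=1
            if self.current>=len(self.rows):
                self.current=-1
                raise StopIteration
            return self.rows[self.current]

    # After: each loop gets a fresh iterator holding its own cursor.
    class Container(object):
        def __init__(self,rows):
            self.rows=rows
        def __iter__(self):
            return CursorIterator(self)

    class CursorIterator(object):
        def __init__(self,container):
            self.container=container
            self.current=-1
        def __iter__(self):
            return self
        def next(self):
            self.current+=1
            if self.current>=len(self.container.rows):
                raise StopIteration
            return self.container.rows[self.current]

    # Nested iteration now works (9 inner iterations). With the shared
    # cursor, the inner loop consumes and resets the one cursor, so the
    # outer loop never terminates.
    d=Container([1,2,3])
    for a in d:
        for b in d:
            pass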
--- a/_test_dataset.py	Mon Mar 24 09:04:06 2008 -0400
+++ b/_test_dataset.py	Mon Mar 24 13:20:15 2008 -0400
@@ -16,14 +16,24 @@
         a=ArrayDataSet(data=numpy.random.rand(8,3),fields={"x":slice(2),"y":slice(1,3)},minibatch_size=1)
         s=0
         for example in a:
+            print len(example), example.x
             s+=_sum_all(example.x)
         print s
-        self.failUnless(abs(s-11.4674133)<1e-6)
+        self.failUnless(abs(s-7.25967597)<1e-6)
+
+    def test1(self):
+        a=ArrayDataSet(data=numpy.random.rand(10,4),fields={"x":slice(2),"y":slice(1,4)},minibatch_size=1)
         a.minibatch_size=2
+        print a.asarray()
         for mb in a:
+            print mb,mb.asarray()
+        print "a.y=",a.y
+        for mb in ArrayDataSet(data=a.y,minibatch_size=2):
             print mb
-            
-
+            for e in mb:
+                print e
+        self.failUnless(True)
+        
 if __name__ == '__main__':
     unittest.main()
     
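Note that both tests draw their data from numpy.random.rand without fixing the
generator state, so the hard-coded expected sum in test0 only holds for whatever
state the generator happens to be in. A sketch of how the assertion could be made
deterministic (the seed value below is illustrative, not taken from the repository):

    import unittest
    import numpy

    class T_Seeded(unittest.TestCase):
        def setUp(self):
            # Fixing the generator state makes numpy.random.rand reproducible,
            # so a hard-coded expected sum stays valid across runs.
            numpy.random.seed(1827)   # hypothetical seed value
        def test_reproducible(self):
            a=numpy.random.rand(8,3)
            numpy.random.seed(1827)
            b=numpy.random.rand(8,3)
            self.failUnless((a==b).all())

    if __name__ == '__main__':
        unittest.main()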
--- a/dataset.py	Mon Mar 24 09:04:06 2008 -0400
+++ b/dataset.py	Mon Mar 24 13:20:15 2008 -0400
@@ -20,14 +20,11 @@
         self.minibatch_size=minibatch_size
 
     def __iter__(self):
-        return self
-
-    def next(self):
         """
-        Return the next example or the next minibatch in the dataset.
-        A minibatch (of length > 1) should be something one can iterate on again in order
-        to obtain the individual examples. If the dataset has fields,
-        then the example or the minibatch must have the same fields
+        Return an iterator whose next() method returns the next example or the next
+        minibatch in the dataset. A minibatch (of length > 1) should be something one
+        can iterate on again in order to obtain the individual examples. If the dataset
+        has fields, then the example or the minibatch must have the same fields
         (typically this is implemented by returning another (small) dataset, when
         there are fields).
         """
@@ -55,6 +52,9 @@
     def __init__(self,minibatch_size):
         DataSet.__init__(self,minibatch_size)
 
+    def __iter__(self):
+        return FiniteDataSetIterator(self)
+    
     def __len__(self):
         """len(dataset) returns the number of examples in the dataset."""
         raise NotImplementedError
@@ -67,6 +67,35 @@
         """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1."""
         raise NotImplementedError
 
+class FiniteDataSetIterator(object):
+    def __init__(self,dataset):
+        self.dataset=dataset
+        self.current = -self.dataset.minibatch_size
+        
+    def next(self):
+        """
+        Return the next example(s) in the dataset. If self.dataset.minibatch_size>1,
+        return that many examples. If the dataset has fields, the example or the
+        minibatch of examples is an ArrayDataSet with minibatch_size rows (so that
+        the fields can be accessed), but that resulting mini-dataset has a
+        minibatch_size of 1, so that one can iterate over it example-wise. On the
+        other hand, if the dataset has no fields (e.g. because it is already the
+        field of a bigger dataset), then the returned example or minibatch may be
+        any indexable object, such as a numpy array. Following the array semantics
+        of indexing and slicing, if the minibatch_size is 1 (and there are no
+        fields), the result is an array with one less dimension (e.g. a vector, if
+        the dataset is a matrix), corresponding to a row. Again, if minibatch_size
+        is >1, one can iterate over the result to obtain individual examples (as rows).
+        """
+        self.current+=self.dataset.minibatch_size
+        if self.current>=len(self.dataset):
+            self.current=-self.dataset.minibatch_size
+            raise StopIteration
+        if self.dataset.minibatch_size==1:
+            return self.dataset[self.current]
+        else:
+            return self.dataset[self.current:self.current+self.dataset.minibatch_size]
+
+
 # we may want ArrayDataSet defined in another python file
 
 import numpy
@@ -88,7 +117,6 @@
         a dictionary of column slices indexed by field names).
         """
         FiniteDataSet.__init__(self,minibatch_size)
-        self.current_row=-1 # used for view of this dataset as an iterator
         if dataset!=None:
             assert data==None and fields=={}
             # convert dataset to an ArrayDataSet
@@ -108,43 +136,20 @@
                 if not step:
                     step=1
                 if not fieldslice.start or not fieldslice.step:
-                    fieldslice = slice(start,fieldslice.stop,step)
+                    fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step)
                 # and coherent with the data array
                 assert fieldslice.start>=0 and fieldslice.stop<=self.width
         assert minibatch_size<=len(self.data)
 
-    def next(self):
-        """
-        Return the next example(s) in the dataset. If self.minibatch_size>1 return that
-        many examples. If the dataset has fields, the example or the minibatch of examples
-        is just a minibatch_size-rows ArrayDataSet (so that the fields can be accessed),
-        but that resulting mini-dataset has a minibatch_size of 1, so that one can iterate
-        example-wise on it. On the other hand, if the dataset has no fields (e.g. because
-        it is already the field of a bigger dataset), then the returned example or minibatch
-        is a numpy array. Following the array semantics of indexing and slicing,
-        if the minibatch_size is 1 (and there are no fields), then the result is an array
-        with one less dimension (e.g., a vector, if the dataset is a matrix), corresponding
-        to a row. Again, if the minibatch_size is >1, one can iterate on the result to
-        obtain individual examples (as rows).
+    def __getattr__(self,fieldname):
         """
-        if self.fields:
-            self.current_row+=self.minibatch_size
-            if self.current_row>=len(self.data):
-                self.current_row=-self.minibatch_size
-                raise StopIteration
-            if self.minibatch_size==1:
-                return self[self.current_row]
-            else:
-                return self[self.current_row:self.current_row+self.minibatch_size]
-        else:
-            if self.minibatch_size==1:
-                return self.data[self.current_row]
-            else:
-                return self.data[self.current_row:self.current_row+self.minibatch_size]
-
-    def __getattr__(self,fieldname):
-        """Return a numpy array with the content associated with the given field name."""
-        return self.data[self.fields[fieldname]]
+        Return a numpy array with the content associated with the given field name.
+        If this is a one-example dataset, a row is returned instead, i.e., a numpy
+        array with one less dimension than self.data.
+        """
+        if len(self.data)==1:
+            return self.data[0,self.fields[fieldname]]
+        return self.data[:,self.fields[fieldname]]
 
     def __call__(self,*fieldnames):
         """Return a sub-dataset containing only the given fieldnames as fields."""
@@ -176,49 +181,51 @@
                 return ArrayDataSet(data=data[slice],fields=self.fields)
             return ArrayDataSet(data=self.data[i:i+1],fields=self.fields)
         else:
-            return data[i]
+            return self.data[i]
 
     def __getslice__(self,*slice_args):
         """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1."""
         return ArrayDataSet(data=self.data[apply(slice,slice_args)],fields=self.fields)
 
     def asarray(self):
-        if self.fields:
-            columns_used = numpy.zeros((self.data.shape[1]),dtype=bool)
-            for field_slice in self.fields.values():
-                for c in xrange(field_slice.start,field_slice.stop,field_slice.step):
-                    columns_used[c]=True
-            # try to figure out if we can map all the slices into one slice:
-            mappable_to_one_slice = True
-            start=0
-            while start<len(columns_used) and not columns_used[start]:
-                start+=1
-            stop=len(columns_used)
-            while stop>0 and not columns_used[stop-1]:
-                stop-=1
-            step=0
-            i=start
-            while i<stop:
-                j=i+1
-                while not columns_used[j] and j<stop:
-                    j+=1
-                if step:
-                    if step!=j-i:
-                        mappable_to_one_slice = False
-                        break
-                else:
-                    step = j-i
-            if mappable_to_one_slice:
-                return data[slice(start,stop,step)]
-            # else make contiguous copy
-            n_columns = sum(columns_used)
-            result = zeros((len(self.data),n_columns)+self.data.shape[2:],self.data.dtype)
-            c=0
-            for field_slice in self.fields.values():
-               slice_width=field_slice.stop-field_slice.start/field_slice.step
-               # copy the field here
-               result[:,slice(c,slice_width)]=self.data[field_slice]
-               c+=slice_width
-            return result
-        return self.data
-
+        if not self.fields:
+            return self.data
+        # else, select subsets of columns mapped by the fields
+        columns_used = numpy.zeros((self.data.shape[1]),dtype=bool)
+        for field_slice in self.fields.values():
+            for c in xrange(field_slice.start,field_slice.stop,field_slice.step):
+                columns_used[c]=True
+        # try to figure out if we can map all the slices into one slice:
+        mappable_to_one_slice = True
+        start=0
+        while start<len(columns_used) and not columns_used[start]:
+            start+=1
+        stop=len(columns_used)
+        while stop>0 and not columns_used[stop-1]:
+            stop-=1
+        step=0
+        i=start
+        while i<stop:
+            j=i+1
+            while j<stop and not columns_used[j]:
+                j+=1
+            if step:
+                if step!=j-i:
+                    mappable_to_one_slice = False
+                    break
+            else:
+                step = j-i
+            i=j
+        if mappable_to_one_slice:
+            return self.data[:,slice(start,stop,step)]
+        # else make contiguous copy
+        n_columns = sum(columns_used)
+        result = numpy.zeros((len(self.data),n_columns)+self.data.shape[2:],self.data.dtype)
+        c=0
+        for field_slice in self.fields.values():
+            # width of the field = number of columns selected by its slice
+            slice_width=len(xrange(field_slice.start,field_slice.stop,field_slice.step))
+            # copy the field into the next slice_width columns of the result
+            result[:,c:c+slice_width]=self.data[:,field_slice]
+            c+=slice_width
+        return result
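The slice-merging logic in asarray above checks whether the union of the field slices
visits columns at one regular stride; when it does, the field content can be returned
as a single view of the underlying array instead of a copy. A small standalone
illustration of the two cases (hypothetical field layouts, not from the repository):

    import numpy

    data = numpy.arange(20).reshape(4,5)

    # Fields over columns 0-1 and 2-3: the used columns {0,1,2,3} form one
    # arithmetic progression (start=0, stop=4, step=1), so a single slice
    # reproduces them and asarray can return a view.
    view = data[:,slice(0,4,1)]

    # Fields over columns 0-1 and 3-4: the used columns {0,1,3,4} have
    # irregular gaps, so no single slice matches and a contiguous copy must
    # be assembled field by field, as in the fallback branch above.
    copy = numpy.zeros((len(data),4),data.dtype)
    copy[:,0:2] = data[:,0:2]   # field "x"
    copy[:,2:4] = data[:,3:5]   # field "y"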