diff dataset.py @ 40:88fd1cce08b9

replaced infinite length by raising UnboundedDataSet, and use & instead of + to concatenate datasets
author bengioy@esprit.iro.umontreal.ca
date Fri, 25 Apr 2008 10:41:19 -0400
parents c682c6e9bf93
children 283e95c15b47
--- a/dataset.py	Thu Apr 24 14:46:10 2008 -0400
+++ b/dataset.py	Fri Apr 25 10:41:19 2008 -0400
@@ -6,6 +6,7 @@
 
 class AbstractFunction (Exception): """Derived class must override this function"""
 class NotImplementedYet (NotImplementedError): """Work in progress, this should eventually be implemented"""
+class UnboundedDataSet (Exception): """Trying to obtain length of unbounded dataset (a stream)"""
 
 class DataSet(object):
     """A virtual base class for datasets.
@@ -15,7 +16,8 @@
     columns/attributes are called fields. The field value for a particular example can be an arbitrary
     python object, which depends on the particular dataset.
     
-    We call a DataSet a 'stream' when its length is unbounded (len(dataset)==float("infinity")).
+    We call a DataSet a 'stream' when its length is unbounded (in which case its __len__
+    method should raise an UnboundedDataSet exception).
 
     A DataSet is a generator of iterators; these iterators can run through the
     examples or the fields in a variety of ways.  A DataSet need not necessarily have a finite
@@ -27,6 +29,7 @@
     * for example in dataset([field1, field2,field3, ...]):
     * for val1,val2,val3 in dataset([field1, field2,field3]):
     * for minibatch in dataset.minibatches([field1, field2, ...],minibatch_size=N):
+    * for mini1,mini2,mini3 in dataset.minibatches([field1, field2, ...],minibatch_size=N):
     * for example in dataset:
     Each of these is documented below. All of these iterators are expected
     to provide, in addition to the usual 'next()' method, a 'next_index()' method
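
A sketch of these iteration styles (the field names 'input' and 'target' are hypothetical):

    for example in d:                        # one Example at a time
        pass
    for x,y in d(['input','target']):        # unpack the selected field values
        pass
    for x,y in d.minibatches(['input','target'],minibatch_size=10):
        pass                                 # x and y each hold 10 values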
@@ -82,7 +85,7 @@
     creates a new dataset whose list of fields is the concatenation of the list of
     fields of the argument datasets. This only works if they all have the same length.
 
-    * dataset1 + dataset2 + dataset3 == dataset.vstack([dataset1,dataset2,dataset3])
+    * dataset1 & dataset2 & dataset3 == dataset.vstack([dataset1,dataset2,dataset3])
 
     creates a new dataset that concatenates the examples from the argument datasets
     (and whose length is the sum of the length of the argument datasets). This only
@@ -93,20 +96,18 @@
     a DataSetFields fields1 and fields2, and fields1 | fields2 concatenates their
     examples.
 
-
     A DataSet sub-class should always redefine the following methods:
       * __len__ if it is not a stream
-      * __getitem__ may not be feasible with some streams
       * fieldNames
       * minibatches_nowrap (called by DataSet.minibatches())
       * valuesHStack
       * valuesVStack
     For efficiency of implementation, a sub-class might also want to redefine
       * hasFields
+      * __getitem__ (which may not be feasible with some streams)
+      * __iter__
     """
 
-    infinity = float("infinity")
-    
     def __init__(self):
         pass
     
@@ -124,7 +125,9 @@
         def __iter__(self): #makes for loop work
             return self
         def next(self):
-            return self.minibatch_iterator.next()[0]
+            size1_minibatch = self.minibatch_iterator.next()
+            return Example(size1_minibatch.keys(),[value[0] for value in size1_minibatch.values()])
+
         def next_index(self):
             return self.minibatch_iterator.next_index()
 
@@ -223,9 +226,6 @@
         a list-like container of the f2 field, etc.
 
         Using the first syntax, all the fields will be returned in "i".
-        Beware that some datasets may not support this syntax, if the number
-        of fields is infinite (i.e. field values may be computed "on demand").
-
         Using the third syntax, i1, i2, i3 will be list-like containers of the
         f1, f2, and f3 fields of a batch of examples on each loop iteration.
 
@@ -277,13 +277,11 @@
     def __len__(self):
         """
         len(dataset) returns the number of examples in the dataset.
-        By default, a DataSet is a 'stream', i.e. it has an unbounded (infinite) length.
+        By default, a DataSet is a 'stream', i.e. it has unbounded length, and its __len__ raises UnboundedDataSet.
         Sub-classes which implement finite-length datasets should redefine this method.
-        Some methods only make sense for finite-length datasets, and will perform
-           assert len(dataset)<DataSet.infinity
-        in order to check the finiteness of the dataset.
+        Some methods only make sense for finite-length datasets.
         """
-        return infinity
+        raise UnboundedDataSet()
 
     def hasFields(self,*fieldnames):
         """
@@ -327,8 +325,29 @@
         arbitrary slicing/indexing
         because they can only iterate through examples one or a minibatch at a time
         and do not actually store or keep past (or future) examples.
+
+        The default implementation of __getitem__ uses the minibatches iterator
+        to obtain one example, a slice of examples, or a list of examples. It may not
+        always be the most efficient way to obtain the result, especially if
+        the data are actually stored in a memory array.
         """
-        raise NotImplementedError()
+        if type(i) is int:
+            return DataSet.MinibatchToSingleExampleIterator(
+                self.minibatches(minibatch_size=1,n_batches=1,offset=i)).next()
+        if type(i) is slice:
+            start = i.start or 0         # slice attributes are read-only; use locals
+            step = i.step or 1
+            if step == 1:
+                return self.minibatches(minibatch_size=i.stop-start,n_batches=1,offset=start).next().examples()
+            rows = range(start,i.stop,step)
+        else:
+            assert type(i) is list
+            rows = i
+        fields_values = zip(*[self[row] for row in rows])
+        return MinibatchDataSet(
+            Example(self.fieldNames(),[ self.valuesVStack(fieldname,field_values)
+                                        for fieldname,field_values
+                                        in zip(self.fieldNames(),fields_values)]))
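
The three index types handled by this default implementation can be exercised as follows (a sketch; d is any DataSet):

    ex    = d[3]        # one Example, via a minibatch of size 1
    batch = d[10:20]    # contiguous slice, fetched as a single minibatch
    some  = d[[1,5,7]]  # arbitrary rows, re-stacked with valuesVStack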
 
     def valuesHStack(self,fieldnames,fieldvalues):
         """
@@ -377,9 +396,9 @@
         """
         return HStackedDataSet(self,other)
 
-    def __add__(self,other):
+    def __and__(self,other):
         """
-        dataset1 + dataset2 is a dataset that concatenates the examples from the argument datasets
+        dataset1 & dataset2 is a dataset that concatenates the examples from the argument datasets
         (and whose length is the sum of the length of the argument datasets). This only
         works if they all have the same fields.
         """
@@ -398,7 +417,7 @@
 
 def vstack(datasets):
     """
-    vstack(dataset1,dataset2,...) returns dataset1 + datataset2 + ...
+    vstack(dataset1,dataset2,...) returns dataset1 & dataset2 & ...
     which is a dataset which iterates first over the examples of dataset1, then
     over those of dataset2, etc.
     """
@@ -430,9 +449,15 @@
     be turned back into a DataSet with its examples() method:
       dataset2 = dataset1.fields().examples()
     and dataset2 should behave exactly like dataset1 (in fact by default dataset2==dataset1).
+
+    DataSetFields can be concatenated vertically or horizontally: as documented above,
+    fields1 | fields2 concatenates their examples, while fields1 & fields2 concatenates
+    their lists of fields.
     """
     def __init__(self,dataset,*fieldnames):
         self.dataset=dataset
+        if not fieldnames:
+            fieldnames=dataset.fieldNames()
         assert dataset.hasFields(*fieldnames)
         LookupList.__init__(self,dataset.fieldNames(),
                             dataset.minibatches(fieldnames if len(fieldnames)>0 else self.fieldNames(),
@@ -447,7 +472,7 @@
         """
-        return (self.examples() + other.examples()).fields()
+        return (self.examples() & other.examples()).fields()
 
-    def __add__(self,other):
+    def __and__(self,other):
         """
-        fields1 + fields2 is a DataSetFields that whose list of fields is the concatenation
-        of the fields of DataSetFields fields1 and fields2.
+        fields1 & fields2 is a DataSetFields whose list of fields is the concatenation
+        of the fields of DataSetFields fields1 and fields2.
@@ -479,7 +504,8 @@
         return self.length
 
     def __getitem__(self,i):
-        return Example(self.fields.keys(),[field[i] for field in self.fields])
+        return DataSetFields(MinibatchDataSet(
+            Example(self.fields.keys(),[field[i] for field in self.fields])),*self.fields.keys())
 
     def fieldNames(self):
         return self.fields.keys()
@@ -509,7 +535,7 @@
                 self.next_example+=minibatch_size
-                return DataSetFields(MinibatchDataSet(minibatch),fieldnames)
+                return DataSetFields(MinibatchDataSet(minibatch),*fieldnames)
 
-        return MinibatchWrapAroundIterator(self,fieldnames,minibatch_size,n_batches,offset)
+        return Iterator(self)
 
     def valuesVStack(self,fieldname,fieldvalues):
         return self.values_vstack(fieldname,fieldvalues)
@@ -639,8 +665,10 @@
         # We use this map from row index to dataset index for constant-time random access of examples,
         # to avoid having to search for the appropriate dataset each time and slice is asked for.
-        for dataset,k in enumerate(datasets[0:-1]):
-            L=len(dataset)
-            assert L<DataSet.infinity
+        for k,dataset in enumerate(datasets[0:-1]):
+            try:
+                L=len(dataset)
+            except UnboundedDataSet:
+                raise ValueError("All VStacked datasets (except possibly the last) must be bounded (have a length).")
             for i in xrange(L):
                 self.index2dataset[self.length+i]=k
             self.datasets_start_row.append(self.length)
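
Under this rule only the last stacked dataset may be a stream (a sketch; s is an unbounded stream, d1 and d2 are bounded):

    ok = vstack([d1,d2,s])      # allowed: every dataset before the last has a length
    vstack([s,d1])              # rejected at construction: len(s) raises UnboundedDataSet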
@@ -721,14 +750,17 @@
                     while self.n_left_in_mb>0:
                         self.move_to_next_dataset()
                         extra_mb.append(self.next_iterator.next())
-                    mb = Example(names,
-                                 [dataset.valuesVStack(name,[mb[name]]+[b[name] for b in extra_mb])
-                                  for name in fieldnames])
+                    examples = Example(names,
+                                       [dataset.valuesVStack(name,
+                                                             [mb[name]]+[b[name] for b in extra_mb])
+                                            for name in fieldnames])
+                    mb = DataSetFields(MinibatchDataSet(examples),*fieldnames)
+
                 self.next_row+=minibatch_size
                 self.next_dataset_row+=minibatch_size
                 if self.next_row+minibatch_size>len(dataset):
                     self.move_to_next_dataset()
                 return mb
                         
                 
 def supervised_learning_dataset(src_dataset,input_fields,target_fields,weight_field=None):