diff dataset.py @ 268:3f1cd8897fda

reverting dataset
author James Bergstra <bergstrj@iro.umontreal.ca>
date Wed, 04 Jun 2008 18:48:50 -0400
parents 6e69fb91f3c0
children fdce496c3b56
--- a/dataset.py	Wed Jun 04 17:49:28 2008 -0400
+++ b/dataset.py	Wed Jun 04 18:48:50 2008 -0400
@@ -109,6 +109,10 @@
 
      - dataset[[i1,i2,...in]] returns a dataset with examples i1,i2,...in.
 
+     - dataset[fieldname] returns an iterable over the values of the field fieldname across
+     the dataset (by default this iterable is obtained by calling valuesVStack
+     over the values of the individual examples).
+
      - dataset.<property> returns the value of a property associated with
      the name <property>. The following properties should be supported:
           - 'description': a textual description or name for the dataset
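As a rough usage sketch of the indexing described above (the ArrayDataSet constructor arguments and the 'input' field name are illustrative assumptions, not part of this changeset):

    import numpy
    data = numpy.arange(8).reshape(4, 2)
    ds = ArrayDataSet(data, {'input': slice(0, 2)})   # assumed constructor signature
    inputs = ds['input']     # all values of the 'input' field, stacked via valuesVStack
    desc = ds.description    # attribute-style access to the 'description' property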
@@ -158,14 +162,10 @@
     By convention, attributes not in attributeNames() should have a name
     starting with an underscore.
     @todo enforce/test that convention!
-
     """
 
-    if 0:
-        # removed by James June 4... these aren't used anywhere according to
-        # grep
-        numpy_vstack = lambda fieldname,values: numpy.vstack(values)
-        numpy_hstack = lambda fieldnames,values: numpy.hstack(values)
+    numpy_vstack = lambda fieldname,values: numpy.vstack(values)
+    numpy_hstack = lambda fieldnames,values: numpy.hstack(values)
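The restored class-level helpers simply discard the field name(s) and delegate to numpy; a quick illustration of the underlying calls (made-up arrays):

    import numpy
    a = numpy.ones((2, 3))
    b = numpy.zeros((1, 3))
    numpy.vstack([a, b]).shape   # (3, 3): rows concatenated, as numpy_vstack does
    numpy.hstack([a, a]).shape   # (2, 6): columns concatenated, as numpy_hstack does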
         
     def __init__(self,description=None,fieldtypes=None):
         if description is None:
@@ -277,11 +277,9 @@
                     # first get the beginning of our minibatch (top of dataset)
                     first_part = self.dataset.minibatches_nowrap(self.fieldnames,self.L-self.next_row,1,self.next_row).next()
                     second_part = self.dataset.minibatches_nowrap(self.fieldnames,upper-self.L,1,0).next()
-
-                    blah = [self.dataset.valuesVStack(name,[first_part[name],second_part[name]])
-                                         for name in self.fieldnames]
-                    print type(self.dataset), blah
-                    minibatch = Example(self.fieldnames,blah)
+                    minibatch = Example(self.fieldnames,
+                                        [self.dataset.valuesVStack(name,[first_part[name],second_part[name]])
+                                         for name in self.fieldnames])
             self.next_row=upper
             self.n_batches_done+=1
             if upper >= self.L and self.n_batches:
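The restored wrap-around branch above builds one minibatch by stacking the tail of the dataset with rows taken again from its top; conceptually (made-up shapes, the real code goes through self.dataset.valuesVStack for each field):

    import numpy
    first_part = numpy.arange(6).reshape(3, 2)    # last 3 rows of the dataset
    second_part = numpy.arange(4).reshape(2, 2)   # 2 rows wrapped around from the top
    minibatch_field = numpy.vstack([first_part, second_part])   # shape (5, 2)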
@@ -462,16 +460,12 @@
                                             for fieldname,field_values
                                             in zip(self.fieldNames(),fields_values)]),
                 self.valuesVStack,self.valuesHStack)
-
-        raise TypeError(i)
-        if 0: 
-            # else check for a fieldname
-            #after talk with Yoshua June 4, this is disabled.
-            if self.hasFields(i):
-                return self.minibatches(fieldnames=[i],minibatch_size=len(self),n_batches=1,offset=0).next()[0]
-            # else we are trying to access a property of the dataset
-            assert i in self.__dict__ # else it means we are trying to access a non-existing property
-            return self.__dict__[i]
+        # else check for a fieldname
+        if self.hasFields(i):
+            return self.minibatches(fieldnames=[i],minibatch_size=len(self),n_batches=1,offset=0).next()[0]
+        # else we are trying to access a property of the dataset
+        assert i in self.__dict__ # else it means we are trying to access a non-existing property
+        return self.__dict__[i]
 
     def valuesHStack(self,fieldnames,fieldvalues):
         """
@@ -497,20 +491,21 @@
 
     def valuesVStack(self,fieldname,values):
         """
-        @param fieldname: the name of the field from which the values were taken
-        @type fieldname: any type
-
-        @param values: bits near the beginning or end of the dataset
-        @type values: list of minibatches (returned by minibatch_nowrap)
-
-        @return: the concatenation (stacking) of the values
-        @rtype: something suitable as a minibatch field
-
+        Return a value that corresponds to concatenating (vertically) several values of the
+        same field, e.g. to build a minibatch out of individual examples. This is likely
+        to involve a copy of the original values.
+        The default implementation uses numpy.vstack when all the values are
+        numpy.ndarray instances, and otherwise returns the list of original values
+        unchanged.
         """
-        rval = []
-        for sub_batch in values:
-            rval.extend(sub_batch)
-        return rval
+        all_numpy=True
+        for value in values:
+            if not type(value) is numpy.ndarray:
+                all_numpy=False
+        if all_numpy:
+            return numpy.vstack(values)
+        # the default implementation of vertical stacking is to put values in a list
+        return values
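To make the new default concrete, these are the two cases it distinguishes (illustrative values only):

    import numpy
    numpy_values = [numpy.ones((1, 3)), numpy.zeros((2, 3))]
    numpy.vstack(numpy_values).shape   # (3, 3): all-ndarray values are stacked into one array
    mixed_values = [numpy.ones((1, 3)), [4, 5, 6]]
    # mixed (non-ndarray) values are returned unchanged, i.e. still a plain list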
 
     def __or__(self,other):
         """
@@ -958,29 +953,16 @@
     Virtual super-class of datasets whose field values are numpy array,
     thus defining valuesHStack and valuesVStack for sub-classes.
     """
-    def __init__(self, description=None, field_types=None):
-        DataSet.__init__(self, description, field_types)
-    def valuesHStack(self, fieldnames, fieldvalues):
+    def __init__(self,description=None,field_types=None):
+        DataSet.__init__(self,description,field_types)
+    def valuesHStack(self,fieldnames,fieldvalues):
         """Concatenate field values horizontally, e.g. two vectors
         become a longer vector, two matrices become a wider matrix, etc."""
         return numpy.hstack(fieldvalues)
-    def valuesVStack(self, fieldname, values):
+    def valuesVStack(self,fieldname,values):
         """Concatenate field values vertically, e.g. two vectors
         become a two-row matrix, two matrices become a longer matrix, etc."""
-        #print len(values)
-        for v in values:
-            if not isinstance(v, numpy.ndarray):
-                raise TypeError(v, type(v))
-
-        s0 = sum([v.shape[0] for v in values])
-        #TODO: there's gotta be a better way to do this!
-        dtype = values[0].dtype
-        rval = numpy.ndarray([s0] + list(values[0].shape[1:]), dtype=dtype)
-        cur_row = 0
-        for v in values:
-            rval[cur_row:cur_row+v.shape[0]] = v
-            cur_row += v.shape[0]
-        return rval
+        return numpy.vstack(values)
 
 class ArrayDataSet(ArrayFieldsDataSet):
     """
@@ -1005,7 +987,7 @@
         for fieldname, fieldcolumns in self.fields_columns.items():
             if type(fieldcolumns) is int:
                 assert fieldcolumns>=0 and fieldcolumns<data_array.shape[1]
-                if 0:
+                if 1:
                     #I changed this because it didn't make sense to me,
                     # and it made it more difficult to write my learner.
                     # If it breaks stuff, let's talk about it.
@@ -1037,7 +1019,7 @@
         values=self.fields_columns.values()
         if type(key) is int:
             return Example(fieldnames,
-                           [numpy.asarray(self.data[key,col]) for col in values])
+                           [self.data[key,col] for col in values])
         if type(key) is slice:
             return MinibatchDataSet(Example(fieldnames,
                                             [self.data[key,col] for col in values]))
@@ -1115,207 +1097,198 @@
 
 
 class CachedDataSet(DataSet):
-    """
-    Wrap a L{DataSet} whose values are computationally expensive to obtain
-    (e.g. because they involve some computation, or disk access),
-    so that repeated accesses to the same example are done cheaply,
-    by caching every example value that has been accessed at least once.
+  """
+  Wrap a L{DataSet} whose values are computationally expensive to obtain
+  (e.g. because they involve some computation, or disk access),
+  so that repeated accesses to the same example are done cheaply,
+  by caching every example value that has been accessed at least once.
 
-    Optionally, for finite-length dataset, all the values can be computed
-    (and cached) upon construction of the CachedDataSet, rather at the
-    first access.
+  Optionally, for finite-length datasets, all the values can be computed
+  (and cached) upon construction of the CachedDataSet, rather than at the
+  first access.
 
-    @todo: when cache_all_upon_construction create mini-batches that are as 
-    large as possible but not so large as to fill up memory.
-    
-    @todo: add disk-buffering capability, so that when the cache becomes too
-    big for memory, we cache things on disk, trying to keep in memory only
-    the record most likely to be accessed next.
-    """
-    def __init__(self,source_dataset,cache_all_upon_construction=False):
-        self.source_dataset=source_dataset
-        self.cache_all_upon_construction=cache_all_upon_construction
-        self.cached_examples = [] #a list of LookupList (copies)
-        if cache_all_upon_construction:
-            # this potentially brings all the source examples
-            # into memory at once, which may be too much
-            # the work could possibly be done by minibatches
-            # that are as large as possible but no more than what memory allows.
-            fields_values = source_dataset.minibatches(minibatch_size=len(source_dataset)).__iter__().next()
-            assert all([len(self)==len(fval) for fval in fields_values])
-            for example in fields_values.examples():
-                dup = copy.copy(example)
-                self.cached_examples.append(dup)
+  @todo: when cache_all_upon_construction is set, create mini-batches that are as
+  large as possible but not so large as to fill up memory.
+  
+  @todo: add disk-buffering capability, so that when the cache becomes too
+  big for memory, we cache things on disk, trying to keep in memory only
+  the record most likely to be accessed next.
+  """
+  def __init__(self,source_dataset,cache_all_upon_construction=False):
+      self.source_dataset=source_dataset
+      self.cache_all_upon_construction=cache_all_upon_construction
+      self.cached_examples = []
+      if cache_all_upon_construction:
+          # this potentially brings all the source examples
+          # into memory at once, which may be too much
+          # the work could possibly be done by minibatches
+          # that are as large as possible but no more than what memory allows.
+          fields_values = source_dataset.minibatches(minibatch_size=len(source_dataset)).__iter__().next()
+          assert all([len(self)==len(field_values) for field_values in fields_values])
+          for example in fields_values.examples():
+              self.cached_examples.append(copy.copy(example))
 
-        self.fieldNames = source_dataset.fieldNames
-        self.hasFields = source_dataset.hasFields
-        self.valuesHStack = source_dataset.valuesHStack
-        self.valuesVStack = source_dataset.valuesVStack
+      self.fieldNames = source_dataset.fieldNames
+      self.hasFields = source_dataset.hasFields
+      self.valuesHStack = source_dataset.valuesHStack
+      self.valuesVStack = source_dataset.valuesVStack
       
-    def __len__(self):
-        return len(self.source_dataset)
+  def __len__(self):
+      return len(self.source_dataset)
 
-    def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
-        class CacheIterator(object):
-            def __init__(self,dataset):
-                self.dataset=dataset
-                self.current=offset
-                self.all_fields = self.dataset.fieldNames()==fieldnames
-            def __iter__(self): return self
-            def next(self):
-                upper = self.current+minibatch_size
-                cache_len = len(self.dataset.cached_examples)
-                if upper>cache_len:
-                    # whole minibatch is not already in cache
-                    # cache everything from current length to upper
-                    for example in self.dataset.source_dataset[cache_len:upper]:
-                        self.dataset.cached_examples.append(example)
-
-                next_range = slice(self.current, self.current+minibatch_size)
-                blah = self.dataset.cached_examples[next_range]
-                all_fields_minibatch = Example(self.dataset.fieldNames(), zip(*blah))
-                self.current+=minibatch_size
-
-                #little optimization to avoid second Example computation if
-                #possible.
-                if self.all_fields:
-                    return all_fields_minibatch
+  def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
+      class CacheIterator(object):
+          def __init__(self,dataset):
+              self.dataset=dataset
+              self.current=offset
+              self.all_fields = self.dataset.fieldNames()==fieldnames
+          def __iter__(self): return self
+          def next(self):
+              upper = self.current+minibatch_size
+              cache_len = len(self.dataset.cached_examples)
+              if upper>cache_len: # whole minibatch is not already in cache
+                  # cache everything from current length to upper
+                  for example in self.dataset.source_dataset[cache_len:upper]:
+                      self.dataset.cached_examples.append(example)
+              all_fields_minibatch = Example(self.dataset.fieldNames(),
+                                             zip(*self.dataset.cached_examples[self.current:self.current+minibatch_size]))
+              self.current+=minibatch_size
+              if self.all_fields:
+                  return all_fields_minibatch
+              return Example(fieldnames,[all_fields_minibatch[name] for name in fieldnames])
+      return CacheIterator(self)
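The zip(*...) call above transposes a slice of cached examples (one tuple per example) into one tuple per field, which is what Example expects; for instance (made-up values):

    examples = [(0, 'a'), (1, 'b'), (2, 'c')]   # three cached examples with two fields each
    zip(*examples)                              # [(0, 1, 2), ('a', 'b', 'c')]: one tuple per field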
 
-                rval = Example(fieldnames,[all_fields_minibatch[name] for name in fieldnames])
-                return rval
-        return CacheIterator(self)
+  def __getitem__(self,i):
+      if type(i)==int and len(self.cached_examples)>i:
+          return self.cached_examples[i]
+      else:
+          return self.source_dataset[i]
+      
+  def __iter__(self):
+      class CacheIteratorIter(object):
+          def __init__(self,dataset):
+              self.dataset=dataset
+              self.l = len(dataset)
+              self.current = 0
+              self.fieldnames = self.dataset.fieldNames()
+              self.example = LookupList(self.fieldnames,[0]*len(self.fieldnames))
+          def __iter__(self): return self
+          def next(self):
+              if self.current>=self.l:
+                  raise StopIteration
+              cache_len = len(self.dataset.cached_examples)
+              if self.current>=cache_len: # whole minibatch is not already in cache
+                  # cache everything from current length to upper
+                  self.dataset.cached_examples.append(
+                      self.dataset.source_dataset[self.current])
+              self.example._values = self.dataset.cached_examples[self.current]
+              self.current+=1
+              return self.example
 
-    def __getitem__(self,i):
-        if type(i)==int and len(self.cached_examples)>i:
-            return self.cached_examples[i]
-        else:
-            return self.source_dataset[i]
-        
-    def __iter__(self):
-        class CacheIteratorIter(object):
-            def __init__(self,dataset):
-                self.dataset=dataset
-                self.l = len(dataset)
-                self.current = 0
-                self.fieldnames = self.dataset.fieldNames()
-                self.example = LookupList(self.fieldnames,[0]*len(self.fieldnames))
-            def __iter__(self): return self
-            def next(self):
-                if self.current>=self.l:
-                    raise StopIteration
-                cache_len = len(self.dataset.cached_examples)
-                if self.current>=cache_len: # whole minibatch is not already in cache
-                    # cache everything from current length to upper
-                    self.dataset.cached_examples.append(
-                        self.dataset.source_dataset[self.current])
-                self.example._values = self.dataset.cached_examples[self.current]
-                self.current+=1
-                return self.example
-
-        return CacheIteratorIter(self)
+      return CacheIteratorIter(self)
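Typical use of CachedDataSet, as a sketch (base_dataset stands for any existing DataSet instance and is not defined in this changeset):

    cached = CachedDataSet(base_dataset)
    for example in cached:     # first pass fills cached_examples lazily
        pass
    first = cached[0]          # later integer accesses are served from the cache
    # or pay the whole cost up front:
    eager = CachedDataSet(base_dataset, cache_all_upon_construction=True)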
 
 class ApplyFunctionDataSet(DataSet):
-    """
-    A L{DataSet} that contains as fields the results of applying a
-    given function example-wise or minibatch-wise to all the fields of
-    an input dataset.  The output of the function should be an iterable
-    (e.g. a list or a LookupList) over the resulting values.
-    
-    The function take as input the fields of the dataset, not the examples.
+  """
+  A L{DataSet} that contains as fields the results of applying a
+  given function example-wise or minibatch-wise to all the fields of
+  an input dataset.  The output of the function should be an iterable
+  (e.g. a list or a LookupList) over the resulting values.
+  
+  The function takes as input the fields of the dataset, not the examples.
 
-    In minibatch mode, the function is expected to work on minibatches
-    (takes a minibatch in input and returns a minibatch in output). More
-    precisely, it means that each element of the input or output list
-    should be iterable and indexable over the individual example values
-    (typically these elements will be numpy arrays). All of the elements
-    in the input and output lists should have the same length, which is
-    the length of the minibatch.
+  In minibatch mode, the function is expected to work on minibatches
+  (takes a minibatch in input and returns a minibatch in output). More
+  precisely, it means that each element of the input or output list
+  should be iterable and indexable over the individual example values
+  (typically these elements will be numpy arrays). All of the elements
+  in the input and output lists should have the same length, which is
+  the length of the minibatch.
 
-    The function is applied each time an example or a minibatch is accessed.
-    To avoid re-doing computation, wrap this dataset inside a CachedDataSet.
+  The function is applied each time an example or a minibatch is accessed.
+  To avoid re-doing computation, wrap this dataset inside a CachedDataSet.
 
-    If the values_{h,v}stack functions are not provided, then
-    the input_dataset.values{H,V}Stack functions are used by default.
-    """
-    def __init__(self,input_dataset,function,output_names,minibatch_mode=True,
-                 values_hstack=None,values_vstack=None,
-                 description=None,fieldtypes=None):
-        """
-        Constructor takes an input dataset that has as many fields as the function
-        expects as inputs. The resulting dataset has as many fields as the function
-        produces as outputs, and that should correspond to the number of output names
-        (provided in a list).
+  If the values_{h,v}stack functions are not provided, then
+  the input_dataset.values{H,V}Stack functions are used by default.
+  """
+  def __init__(self,input_dataset,function,output_names,minibatch_mode=True,
+               values_hstack=None,values_vstack=None,
+               description=None,fieldtypes=None):
+      """
+      Constructor takes an input dataset that has as many fields as the function
+      expects as inputs. The resulting dataset has as many fields as the function
+      produces as outputs, and that should correspond to the number of output names
+      (provided in a list).
 
-        Note that the expected semantics of the function differs in minibatch mode
-        (it takes minibatches of inputs and produces minibatches of outputs, as
-        documented in the class comment).
+      Note that the expected semantics of the function differs in minibatch mode
+      (it takes minibatches of inputs and produces minibatches of outputs, as
+      documented in the class comment).
 
-        TBM: are filedtypes the old field types (from input_dataset) or the new ones
-        (for the new dataset created)?
-        """
-        self.input_dataset=input_dataset
-        self.function=function
-        self.output_names=output_names
-        self.minibatch_mode=minibatch_mode
-        DataSet.__init__(self,description,fieldtypes)
-        self.valuesHStack = values_hstack if values_hstack else input_dataset.valuesHStack
-        self.valuesVStack = values_vstack if values_vstack else input_dataset.valuesVStack
+      TBM: are fieldtypes the old field types (from input_dataset) or the new ones
+      (for the new dataset created)?
+      """
+      self.input_dataset=input_dataset
+      self.function=function
+      self.output_names=output_names
+      self.minibatch_mode=minibatch_mode
+      DataSet.__init__(self,description,fieldtypes)
+      self.valuesHStack = values_hstack if values_hstack else input_dataset.valuesHStack
+      self.valuesVStack = values_vstack if values_vstack else input_dataset.valuesVStack
 
-    def __len__(self):
-        return len(self.input_dataset)
+  def __len__(self):
+      return len(self.input_dataset)
 
-    def fieldNames(self):
-        return self.output_names
+  def fieldNames(self):
+      return self.output_names
 
-    def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
-        class ApplyFunctionIterator(object):
-            def __init__(self,output_dataset):
-                self.input_dataset=output_dataset.input_dataset
-                self.output_dataset=output_dataset
-                self.input_iterator=self.input_dataset.minibatches(minibatch_size=minibatch_size,
-                                                                   n_batches=n_batches,offset=offset).__iter__()
+  def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
+      class ApplyFunctionIterator(object):
+          def __init__(self,output_dataset):
+              self.input_dataset=output_dataset.input_dataset
+              self.output_dataset=output_dataset
+              self.input_iterator=self.input_dataset.minibatches(minibatch_size=minibatch_size,
+                                                                 n_batches=n_batches,offset=offset).__iter__()
 
-            def __iter__(self): return self
+          def __iter__(self): return self
 
-            def next(self):
-                function_inputs = self.input_iterator.next()
-                all_output_names = self.output_dataset.output_names
-                if self.output_dataset.minibatch_mode:
-                    function_outputs = self.output_dataset.function(*function_inputs)
-                else:
-                    input_examples = zip(*function_inputs)
-                    output_examples = [self.output_dataset.function(*input_example)
-                                       for input_example in input_examples]
-                    function_outputs = [self.output_dataset.valuesVStack(name,values)
-                                        for name,values in zip(all_output_names,
-                                                               zip(*output_examples))]
-                all_outputs = Example(all_output_names,function_outputs)
-                if fieldnames==all_output_names:
-                    return all_outputs
-                return Example(fieldnames,[all_outputs[name] for name in fieldnames])
+          def next(self):
+              function_inputs = self.input_iterator.next()
+              all_output_names = self.output_dataset.output_names
+              if self.output_dataset.minibatch_mode:
+                  function_outputs = self.output_dataset.function(*function_inputs)
+              else:
+                  input_examples = zip(*function_inputs)
+                  output_examples = [self.output_dataset.function(*input_example)
+                                     for input_example in input_examples]
+                  function_outputs = [self.output_dataset.valuesVStack(name,values)
+                                      for name,values in zip(all_output_names,
+                                                             zip(*output_examples))]
+              all_outputs = Example(all_output_names,function_outputs)
+              if fieldnames==all_output_names:
+                  return all_outputs
+              return Example(fieldnames,[all_outputs[name] for name in fieldnames])
 
 
-        return ApplyFunctionIterator(self)
+      return ApplyFunctionIterator(self)
 
-    def __iter__(self): # only implemented for increased efficiency
-        class ApplyFunctionSingleExampleIterator(object):
-            def __init__(self,output_dataset):
-                self.current=0
-                self.output_dataset=output_dataset
-                self.input_iterator=output_dataset.input_dataset.__iter__()
-            def __iter__(self): return self
-            def next(self):
-                if self.output_dataset.minibatch_mode:
-                    function_inputs = [[input] for input in self.input_iterator.next()]
-                    outputs = self.output_dataset.function(*function_inputs)
-                    assert all([hasattr(output,'__iter__') for output in outputs])
-                    function_outputs = [output[0] for output in outputs]
-                else:
-                    function_inputs = self.input_iterator.next()
-                    function_outputs = self.output_dataset.function(*function_inputs)
-                return Example(self.output_dataset.output_names,function_outputs)
-        return ApplyFunctionSingleExampleIterator(self)
-    
+  def __iter__(self): # only implemented for increased efficiency
+      class ApplyFunctionSingleExampleIterator(object):
+          def __init__(self,output_dataset):
+              self.current=0
+              self.output_dataset=output_dataset
+              self.input_iterator=output_dataset.input_dataset.__iter__()
+          def __iter__(self): return self
+          def next(self):
+              if self.output_dataset.minibatch_mode:
+                  function_inputs = [[input] for input in self.input_iterator.next()]
+                  outputs = self.output_dataset.function(*function_inputs)
+                  assert all([hasattr(output,'__iter__') for output in outputs])
+                  function_outputs = [output[0] for output in outputs]
+              else:
+                  function_inputs = self.input_iterator.next()
+                  function_outputs = self.output_dataset.function(*function_inputs)
+              return Example(self.output_dataset.output_names,function_outputs)
+      return ApplyFunctionSingleExampleIterator(self)
+  
 
 def supervised_learning_dataset(src_dataset,input_fields,target_fields,weight_field=None):
     """