diff dataset.py @ 266:6e69fb91f3c0

initial commit of amat
author James Bergstra <bergstrj@iro.umontreal.ca>
date Wed, 04 Jun 2008 17:49:09 -0400
parents 19b14afe04b7
children 3f1cd8897fda
--- a/dataset.py	Tue Jun 03 21:34:40 2008 -0400
+++ b/dataset.py	Wed Jun 04 17:49:09 2008 -0400
@@ -109,10 +109,6 @@
 
      - dataset[[i1,i2,...in]] returns a dataset with examples i1,i2,...in.
 
-     - dataset[fieldname] an iterable over the values of the field fieldname across
-     the dataset (the iterable is obtained by default by calling valuesVStack
-     over the values for individual examples).
-
      - dataset.<property> returns the value of a property associated with
      the name <property>. The following properties should be supported:
           - 'description': a textual description or name for the dataset
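# Hypothetical sketch: with dataset[fieldname] removed, these are the access
# patterns that remain.  `ds` stands for any DataSet with a field named 'x'
# (not defined here); field values are now reached through minibatches.
subset = ds[[1, 3, 7]]           # a dataset holding examples 1, 3 and 7
print ds.description             # dataset-level property lookup still works
x_values = ds.minibatches(fieldnames=['x'], minibatch_size=len(ds),
                          n_batches=1, offset=0).next()[0]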
@@ -162,10 +158,14 @@
     By convention, attributes not in attributeNames() should have a name
     starting with an underscore.
     @todo enforce/test that convention!
+
     """
 
-    numpy_vstack = lambda fieldname,values: numpy.vstack(values)
-    numpy_hstack = lambda fieldnames,values: numpy.hstack(values)
+    if 0:
+        # removed by James June 4... these aren't used anywhere according to
+        # grep
+        numpy_vstack = lambda fieldname,values: numpy.vstack(values)
+        numpy_hstack = lambda fieldnames,values: numpy.hstack(values)
         
     def __init__(self,description=None,fieldtypes=None):
         if description is None:
@@ -277,9 +277,11 @@
                     # first get the beginning of our minibatch (top of dataset)
                     first_part = self.dataset.minibatches_nowrap(self.fieldnames,self.L-self.next_row,1,self.next_row).next()
                     second_part = self.dataset.minibatches_nowrap(self.fieldnames,upper-self.L,1,0).next()
-                    minibatch = Example(self.fieldnames,
-                                        [self.dataset.valuesAppend(name,[first_part[name],second_part[name]])
-                                         for name in self.fieldnames])
+
+                    stacked_fields = [self.dataset.valuesVStack(name,[first_part[name],second_part[name]])
+                                      for name in self.fieldnames]
+                    # stack the slice from the top of the dataset with the wrapped-around slice
+                    minibatch = Example(self.fieldnames,stacked_fields)
             self.next_row=upper
             self.n_batches_done+=1
             if upper >= self.L and self.n_batches:
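# Toy sketch of the wrap-around case handled above: with a dataset of length 5,
# minibatch_size=3 and offset=4, the last minibatch stacks example 4 (top of the
# dataset) with examples 0-1 (wrapped around).  The values below are made up.
import numpy
first_part  = numpy.asarray([[4., 40.]])              # slice from the top of the dataset
second_part = numpy.asarray([[0., 0.], [1., 10.]])    # wrapped-around slice
minibatch_field = numpy.vstack([first_part, second_part])
print minibatch_field.shape                            # (3, 2): one row per example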
@@ -460,12 +462,16 @@
                                             for fieldname,field_values
                                             in zip(self.fieldNames(),fields_values)]),
                 self.valuesVStack,self.valuesHStack)
-        # else check for a fieldname
-        if self.hasFields(i):
-            return self.minibatches(fieldnames=[i],minibatch_size=len(self),n_batches=1,offset=0).next()[0]
-        # else we are trying to access a property of the dataset
-        assert i in self.__dict__ # else it means we are trying to access a non-existing property
-        return self.__dict__[i]
+
+        raise TypeError(i)
+        if 0: 
+            # else check for a fieldname
+            #after talk with Yoshua June 4, this is disabled.
+            if self.hasFields(i):
+                return self.minibatches(fieldnames=[i],minibatch_size=len(self),n_batches=1,offset=0).next()[0]
+            # else we are trying to access a property of the dataset
+            assert i in self.__dict__ # else it means we are trying to access a non-existing property
+            return self.__dict__[i]
 
     def valuesHStack(self,fieldnames,fieldvalues):
         """
@@ -491,21 +497,20 @@
 
     def valuesVStack(self,fieldname,values):
         """
-        Return a value that corresponds to concatenating (vertically) several values of the
-        same field. This can be important to build a minibatch out of individual examples. This
-        is likely to involve a copy of the original values. When the values are numpy arrays, the
-        result should be numpy.vstack(values).
-        The default is to use numpy.vstack for numpy.ndarray values, and a list
-        pointing to the original values for other data types.
+        @param fieldname: the name of the field from which the values were taken
+        @type fieldname: any type
+
+        @param values: the field values to concatenate, e.g. the pieces taken from the end and the beginning of the dataset when a minibatch wraps around
+        @type values: list of minibatches (as returned by minibatches_nowrap)
+
+        @return: the concatenation (stacking) of the values
+        @rtype: something suitable as a minibatch field
+
         """
-        all_numpy=True
-        for value in values:
-            if not type(value) is numpy.ndarray:
-                all_numpy=False
-        if all_numpy:
-            return numpy.vstack(values)
-        # the default implementation of vertical stacking is to put values in a list
-        return values
+        rval = []
+        for sub_batch in values:
+            rval.extend(sub_batch)
+        return rval
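# Sketch of what the default valuesVStack above now does: it chains the given
# sub-batches into one flat list, leaving array-aware stacking to subclasses
# such as ArrayDataSet.  The field values below are made up.
values = [['cat', 'dog'], ['mouse']]
rval = []
for sub_batch in values:
    rval.extend(sub_batch)
assert rval == ['cat', 'dog', 'mouse']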
 
     def __or__(self,other):
         """
@@ -962,11 +967,15 @@
     def valuesVStack(self, fieldname, values):
         """Concatenate field values vertically, e.g. two vectors
         become a two-row matrix, two matrices become a longer matrix, etc."""
-        return numpy.vstack(values)
-    def valuesAppend(self, fieldname, values):
+        #print len(values)
+        for v in values:
+            if not isinstance(v, numpy.ndarray):
+                raise TypeError(v, type(v))
+
         s0 = sum([v.shape[0] for v in values])
         #TODO: there's gotta be a better way to do this!
-        rval = numpy.ndarray([s0] + values[0].shape[1:],dtype=values[0].dtype)
+        dtype = values[0].dtype
+        rval = numpy.ndarray([s0] + list(values[0].shape[1:]), dtype=dtype)
         cur_row = 0
         for v in values:
             rval[cur_row:cur_row+v.shape[0]] = v
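# Standalone sketch (made-up arrays) of the stacking done above: preallocate the
# output and copy each block of rows, which for 2-D inputs matches numpy.vstack.
import numpy
values = [numpy.ones((2, 3)), numpy.zeros((1, 3))]
s0 = sum([v.shape[0] for v in values])
out = numpy.ndarray([s0] + list(values[0].shape[1:]), dtype=values[0].dtype)
cur_row = 0
for v in values:
    out[cur_row:cur_row + v.shape[0]] = v
    cur_row += v.shape[0]
assert (out == numpy.vstack(values)).all()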
@@ -1028,7 +1037,7 @@
         values=self.fields_columns.values()
         if type(key) is int:
             return Example(fieldnames,
-                           [self.data[key,col] for col in values])
+                           [numpy.asarray(self.data[key,col]) for col in values])
         if type(key) is slice:
             return MinibatchDataSet(Example(fieldnames,
                                             [self.data[key,col] for col in values]))
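# Likely motivation for numpy.asarray above (an assumption, not stated in the
# patch): when a field maps to a single column, indexing one row yields a numpy
# scalar rather than an ndarray, which the stricter valuesVStack would reject.
import numpy
data = numpy.arange(6.).reshape(3, 2)     # stand-in for an ArrayDataSet's data
raw = data[1, 0]                          # one row, one column -> numpy scalar
wrapped = numpy.asarray(data[1, 0])       # 0-d ndarray
print isinstance(raw, numpy.ndarray), isinstance(wrapped, numpy.ndarray)  # False True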
@@ -1106,198 +1115,207 @@
 
 
 class CachedDataSet(DataSet):
-  """
-  Wrap a L{DataSet} whose values are computationally expensive to obtain
-  (e.g. because they involve some computation, or disk access),
-  so that repeated accesses to the same example are done cheaply,
-  by caching every example value that has been accessed at least once.
+    """
+    Wrap a L{DataSet} whose values are computationally expensive to obtain
+    (e.g. because they involve some computation, or disk access),
+    so that repeated accesses to the same example are done cheaply,
+    by caching every example value that has been accessed at least once.
 
-  Optionally, for finite-length dataset, all the values can be computed
-  (and cached) upon construction of the CachedDataSet, rather at the
-  first access.
+    Optionally, for a finite-length dataset, all the values can be computed
+    (and cached) upon construction of the CachedDataSet, rather than at the
+    first access.
 
-  @todo: when cache_all_upon_construction create mini-batches that are as 
-  large as possible but not so large as to fill up memory.
-  
-  @todo: add disk-buffering capability, so that when the cache becomes too
-  big for memory, we cache things on disk, trying to keep in memory only
-  the record most likely to be accessed next.
-  """
-  def __init__(self,source_dataset,cache_all_upon_construction=False):
-      self.source_dataset=source_dataset
-      self.cache_all_upon_construction=cache_all_upon_construction
-      self.cached_examples = []
-      if cache_all_upon_construction:
-          # this potentially brings all the source examples
-          # into memory at once, which may be too much
-          # the work could possibly be done by minibatches
-          # that are as large as possible but no more than what memory allows.
-          fields_values = source_dataset.minibatches(minibatch_size=len(source_dataset)).__iter__().next()
-          assert all([len(self)==len(field_values) for field_values in fields_values])
-          for example in fields_values.examples():
-              self.cached_examples.append(copy.copy(example))
+    @todo: when cache_all_upon_construction is set, create mini-batches that are as
+    large as possible but not so large as to fill up memory.
+    
+    @todo: add disk-buffering capability, so that when the cache becomes too
+    big for memory, we cache things on disk, trying to keep in memory only
+    the record most likely to be accessed next.
+    """
+    def __init__(self,source_dataset,cache_all_upon_construction=False):
+        self.source_dataset=source_dataset
+        self.cache_all_upon_construction=cache_all_upon_construction
+        self.cached_examples = [] #a list of LookupList (copies)
+        if cache_all_upon_construction:
+            # this potentially brings all the source examples
+            # into memory at once, which may be too much
+            # the work could possibly be done by minibatches
+            # that are as large as possible but no more than what memory allows.
+            fields_values = source_dataset.minibatches(minibatch_size=len(source_dataset)).__iter__().next()
+            assert all([len(self)==len(fval) for fval in fields_values])
+            for example in fields_values.examples():
+                dup = copy.copy(example)
+                self.cached_examples.append(dup)
 
-      self.fieldNames = source_dataset.fieldNames
-      self.hasFields = source_dataset.hasFields
-      self.valuesHStack = source_dataset.valuesHStack
-      self.valuesVStack = source_dataset.valuesVStack
+        self.fieldNames = source_dataset.fieldNames
+        self.hasFields = source_dataset.hasFields
+        self.valuesHStack = source_dataset.valuesHStack
+        self.valuesVStack = source_dataset.valuesVStack
       
-  def __len__(self):
-      return len(self.source_dataset)
+    def __len__(self):
+        return len(self.source_dataset)
 
-  def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
-      class CacheIterator(object):
-          def __init__(self,dataset):
-              self.dataset=dataset
-              self.current=offset
-              self.all_fields = self.dataset.fieldNames()==fieldnames
-          def __iter__(self): return self
-          def next(self):
-              upper = self.current+minibatch_size
-              cache_len = len(self.dataset.cached_examples)
-              if upper>cache_len: # whole minibatch is not already in cache
-                  # cache everything from current length to upper
-                  for example in self.dataset.source_dataset[cache_len:upper]:
-                      self.dataset.cached_examples.append(example)
-              all_fields_minibatch = Example(self.dataset.fieldNames(),
-                                             zip(*self.dataset.cached_examples[self.current:self.current+minibatch_size]))
-              self.current+=minibatch_size
-              if self.all_fields:
-                  return all_fields_minibatch
-              return Example(fieldnames,[all_fields_minibatch[name] for name in fieldnames])
-      return CacheIterator(self)
+    def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
+        class CacheIterator(object):
+            def __init__(self,dataset):
+                self.dataset=dataset
+                self.current=offset
+                self.all_fields = self.dataset.fieldNames()==fieldnames
+            def __iter__(self): return self
+            def next(self):
+                upper = self.current+minibatch_size
+                cache_len = len(self.dataset.cached_examples)
+                if upper>cache_len:
+                    # whole minibatch is not already in cache
+                    # cache everything from current length to upper
+                    for example in self.dataset.source_dataset[cache_len:upper]:
+                        self.dataset.cached_examples.append(example)
+
+                next_range = slice(self.current, self.current+minibatch_size)
+                cached_slice = self.dataset.cached_examples[next_range]
+                all_fields_minibatch = Example(self.dataset.fieldNames(), zip(*cached_slice))
+                self.current+=minibatch_size
+
+                # small optimization: avoid building a second Example when the
+                # requested fieldnames are exactly the dataset's fieldNames()
+                if self.all_fields:
+                    return all_fields_minibatch
 
-  def __getitem__(self,i):
-      if type(i)==int and len(self.cached_examples)>i:
-          return self.cached_examples[i]
-      else:
-          return self.source_dataset[i]
-      
-  def __iter__(self):
-      class CacheIteratorIter(object):
-          def __init__(self,dataset):
-              self.dataset=dataset
-              self.l = len(dataset)
-              self.current = 0
-              self.fieldnames = self.dataset.fieldNames()
-              self.example = LookupList(self.fieldnames,[0]*len(self.fieldnames))
-          def __iter__(self): return self
-          def next(self):
-              if self.current>=self.l:
-                  raise StopIteration
-              cache_len = len(self.dataset.cached_examples)
-              if self.current>=cache_len: # whole minibatch is not already in cache
-                  # cache everything from current length to upper
-                  self.dataset.cached_examples.append(
-                      self.dataset.source_dataset[self.current])
-              self.example._values = self.dataset.cached_examples[self.current]
-              self.current+=1
-              return self.example
+                rval = Example(fieldnames,[all_fields_minibatch[name] for name in fieldnames])
+                return rval
+        return CacheIterator(self)
 
-      return CacheIteratorIter(self)
+    def __getitem__(self,i):
+        if type(i)==int and len(self.cached_examples)>i:
+            return self.cached_examples[i]
+        else:
+            return self.source_dataset[i]
+        
+    def __iter__(self):
+        class CacheIteratorIter(object):
+            def __init__(self,dataset):
+                self.dataset=dataset
+                self.l = len(dataset)
+                self.current = 0
+                self.fieldnames = self.dataset.fieldNames()
+                self.example = LookupList(self.fieldnames,[0]*len(self.fieldnames))
+            def __iter__(self): return self
+            def next(self):
+                if self.current>=self.l:
+                    raise StopIteration
+                cache_len = len(self.dataset.cached_examples)
+                if self.current>=cache_len: # this example is not yet in the cache
+                    # fetch it from the source dataset and cache it
+                    self.dataset.cached_examples.append(
+                        self.dataset.source_dataset[self.current])
+                self.example._values = self.dataset.cached_examples[self.current]
+                self.current+=1
+                return self.example
+
+        return CacheIteratorIter(self)
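# Hypothetical usage of CachedDataSet: `slow_ds` stands for any finite DataSet
# whose examples are expensive to compute (not defined here).  Wrapping it
# caches each example the first time it is read.
cached = CachedDataSet(slow_ds)                               # fill the cache lazily
eager  = CachedDataSet(slow_ds, cache_all_upon_construction=True)
for example in cached:        # first pass populates cached_examples
    pass
first = cached[0]             # now served from the cache, not from slow_ds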
 
 class ApplyFunctionDataSet(DataSet):
-  """
-  A L{DataSet} that contains as fields the results of applying a
-  given function example-wise or minibatch-wise to all the fields of
-  an input dataset.  The output of the function should be an iterable
-  (e.g. a list or a LookupList) over the resulting values.
-  
-  The function take as input the fields of the dataset, not the examples.
+    """
+    A L{DataSet} that contains as fields the results of applying a
+    given function example-wise or minibatch-wise to all the fields of
+    an input dataset.  The output of the function should be an iterable
+    (e.g. a list or a LookupList) over the resulting values.
+    
+    The function takes as input the fields of the dataset, not the examples.
 
-  In minibatch mode, the function is expected to work on minibatches
-  (takes a minibatch in input and returns a minibatch in output). More
-  precisely, it means that each element of the input or output list
-  should be iterable and indexable over the individual example values
-  (typically these elements will be numpy arrays). All of the elements
-  in the input and output lists should have the same length, which is
-  the length of the minibatch.
+    In minibatch mode, the function is expected to work on minibatches
+    (takes a minibatch in input and returns a minibatch in output). More
+    precisely, it means that each element of the input or output list
+    should be iterable and indexable over the individual example values
+    (typically these elements will be numpy arrays). All of the elements
+    in the input and output lists should have the same length, which is
+    the length of the minibatch.
 
-  The function is applied each time an example or a minibatch is accessed.
-  To avoid re-doing computation, wrap this dataset inside a CachedDataSet.
+    The function is applied each time an example or a minibatch is accessed.
+    To avoid re-doing computation, wrap this dataset inside a CachedDataSet.
 
-  If the values_{h,v}stack functions are not provided, then
-  the input_dataset.values{H,V}Stack functions are used by default.
-  """
-  def __init__(self,input_dataset,function,output_names,minibatch_mode=True,
-               values_hstack=None,values_vstack=None,
-               description=None,fieldtypes=None):
-      """
-      Constructor takes an input dataset that has as many fields as the function
-      expects as inputs. The resulting dataset has as many fields as the function
-      produces as outputs, and that should correspond to the number of output names
-      (provided in a list).
+    If the values_{h,v}stack functions are not provided, then
+    the input_dataset.values{H,V}Stack functions are used by default.
+    """
+    def __init__(self,input_dataset,function,output_names,minibatch_mode=True,
+                 values_hstack=None,values_vstack=None,
+                 description=None,fieldtypes=None):
+        """
+        The constructor takes an input dataset that has as many fields as the function
+        expects as inputs. The resulting dataset has as many fields as the function
+        produces as outputs, which should match the number of output names
+        (provided in a list).
 
-      Note that the expected semantics of the function differs in minibatch mode
-      (it takes minibatches of inputs and produces minibatches of outputs, as
-      documented in the class comment).
+        Note that the expected semantics of the function differs in minibatch mode
+        (it takes minibatches of inputs and produces minibatches of outputs, as
+        documented in the class comment).
 
-      TBM: are filedtypes the old field types (from input_dataset) or the new ones
-      (for the new dataset created)?
-      """
-      self.input_dataset=input_dataset
-      self.function=function
-      self.output_names=output_names
-      self.minibatch_mode=minibatch_mode
-      DataSet.__init__(self,description,fieldtypes)
-      self.valuesHStack = values_hstack if values_hstack else input_dataset.valuesHStack
-      self.valuesVStack = values_vstack if values_vstack else input_dataset.valuesVStack
+        TBM: are fieldtypes the old field types (from input_dataset) or the new ones
+        (for the new dataset created)?
+        """
+        self.input_dataset=input_dataset
+        self.function=function
+        self.output_names=output_names
+        self.minibatch_mode=minibatch_mode
+        DataSet.__init__(self,description,fieldtypes)
+        self.valuesHStack = values_hstack if values_hstack else input_dataset.valuesHStack
+        self.valuesVStack = values_vstack if values_vstack else input_dataset.valuesVStack
 
-  def __len__(self):
-      return len(self.input_dataset)
+    def __len__(self):
+        return len(self.input_dataset)
 
-  def fieldNames(self):
-      return self.output_names
+    def fieldNames(self):
+        return self.output_names
 
-  def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
-      class ApplyFunctionIterator(object):
-          def __init__(self,output_dataset):
-              self.input_dataset=output_dataset.input_dataset
-              self.output_dataset=output_dataset
-              self.input_iterator=self.input_dataset.minibatches(minibatch_size=minibatch_size,
-                                                                 n_batches=n_batches,offset=offset).__iter__()
+    def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
+        class ApplyFunctionIterator(object):
+            def __init__(self,output_dataset):
+                self.input_dataset=output_dataset.input_dataset
+                self.output_dataset=output_dataset
+                self.input_iterator=self.input_dataset.minibatches(minibatch_size=minibatch_size,
+                                                                   n_batches=n_batches,offset=offset).__iter__()
 
-          def __iter__(self): return self
+            def __iter__(self): return self
 
-          def next(self):
-              function_inputs = self.input_iterator.next()
-              all_output_names = self.output_dataset.output_names
-              if self.output_dataset.minibatch_mode:
-                  function_outputs = self.output_dataset.function(*function_inputs)
-              else:
-                  input_examples = zip(*function_inputs)
-                  output_examples = [self.output_dataset.function(*input_example)
-                                     for input_example in input_examples]
-                  function_outputs = [self.output_dataset.valuesVStack(name,values)
-                                      for name,values in zip(all_output_names,
-                                                             zip(*output_examples))]
-              all_outputs = Example(all_output_names,function_outputs)
-              if fieldnames==all_output_names:
-                  return all_outputs
-              return Example(fieldnames,[all_outputs[name] for name in fieldnames])
+            def next(self):
+                function_inputs = self.input_iterator.next()
+                all_output_names = self.output_dataset.output_names
+                if self.output_dataset.minibatch_mode:
+                    function_outputs = self.output_dataset.function(*function_inputs)
+                else:
+                    input_examples = zip(*function_inputs)
+                    output_examples = [self.output_dataset.function(*input_example)
+                                       for input_example in input_examples]
+                    function_outputs = [self.output_dataset.valuesVStack(name,values)
+                                        for name,values in zip(all_output_names,
+                                                               zip(*output_examples))]
+                all_outputs = Example(all_output_names,function_outputs)
+                if fieldnames==all_output_names:
+                    return all_outputs
+                return Example(fieldnames,[all_outputs[name] for name in fieldnames])
 
 
-      return ApplyFunctionIterator(self)
+        return ApplyFunctionIterator(self)
 
-  def __iter__(self): # only implemented for increased efficiency
-      class ApplyFunctionSingleExampleIterator(object):
-          def __init__(self,output_dataset):
-              self.current=0
-              self.output_dataset=output_dataset
-              self.input_iterator=output_dataset.input_dataset.__iter__()
-          def __iter__(self): return self
-          def next(self):
-              if self.output_dataset.minibatch_mode:
-                  function_inputs = [[input] for input in self.input_iterator.next()]
-                  outputs = self.output_dataset.function(*function_inputs)
-                  assert all([hasattr(output,'__iter__') for output in outputs])
-                  function_outputs = [output[0] for output in outputs]
-              else:
-                  function_inputs = self.input_iterator.next()
-                  function_outputs = self.output_dataset.function(*function_inputs)
-              return Example(self.output_dataset.output_names,function_outputs)
-      return ApplyFunctionSingleExampleIterator(self)
-  
+    def __iter__(self): # only implemented for increased efficiency
+        class ApplyFunctionSingleExampleIterator(object):
+            def __init__(self,output_dataset):
+                self.current=0
+                self.output_dataset=output_dataset
+                self.input_iterator=output_dataset.input_dataset.__iter__()
+            def __iter__(self): return self
+            def next(self):
+                if self.output_dataset.minibatch_mode:
+                    function_inputs = [[input] for input in self.input_iterator.next()]
+                    outputs = self.output_dataset.function(*function_inputs)
+                    assert all([hasattr(output,'__iter__') for output in outputs])
+                    function_outputs = [output[0] for output in outputs]
+                else:
+                    function_inputs = self.input_iterator.next()
+                    function_outputs = self.output_dataset.function(*function_inputs)
+                return Example(self.output_dataset.output_names,function_outputs)
+        return ApplyFunctionSingleExampleIterator(self)
+    
 
 def supervised_learning_dataset(src_dataset,input_fields,target_fields,weight_field=None):
     """