changeset 266:6e69fb91f3c0

initial commit of amat
author James Bergstra <bergstrj@iro.umontreal.ca>
date Wed, 04 Jun 2008 17:49:09 -0400
parents 5614b186c5f4
children 4dad41215967
files amat.py dataset.py test_dataset.py
diffstat 3 files changed, 347 insertions(+), 205 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/amat.py	Wed Jun 04 17:49:09 2008 -0400
@@ -0,0 +1,123 @@
+"""load PLearn AMat files"""
+
+import sys, numpy, array
+
+path_MNIST = '/u/bergstrj/pub/data/mnist.amat'
+
+
+class AMat:
+    """DataSource to access a plearn amat file as a periodic unrandomized stream.
+
+    Attributes:
+
+    input -- minibatch of input
+    target -- minibatch of target
+    weight -- minibatch of weight
+    extra -- minibatch of extra
+
+    all -- the entire data contents of the amat file
+    n_examples -- the number of training examples in the file
+
+    AMat stands for ASCII matrix (or matrices)
+
+    """
+
+    marker_size = '#size:'
+    marker_sizes = '#sizes:'
+    marker_col_names = '#:'
+
+    def __init__(self, path, head=None, update_interval=0, ofile=sys.stdout):
+
+        """Load the amat at <path> into memory.
+        
+        path - str: location of amat file
+        head - int: stop reading after this many data rows
+        update_interval - int: print '.' to ofile every <this many> lines
+        ofile - file: print status, msgs, etc. to this file
+
+        """
+        self.all = None
+        self.input = None
+        self.target = None
+        self.weight = None
+        self.extra = None
+
+        self.header = False
+        self.header_size = None
+        self.header_rows = None
+        self.header_cols = None
+        self.header_sizes = None
+        self.header_col_names = []
+
+        data_started = False
+        data = array.array('d')
+        
+        f = open(path)
+        n_data_lines = 0
+        len_float_line = None
+
+        for i,line in enumerate(f):
+            if n_data_lines == head:
+                #we've read enough data, 
+                # break even if there's more in the file
+                break
+            if len(line) == 0 or line == '\n':
+                continue
+            if line[0] == '#':
+                if not data_started:
+                    #the condition means that the file has a header, and we're on 
+                    # some header line
+                    self.header = True
+                    if line.startswith(AMat.marker_size):
+                        info = line[len(AMat.marker_size):]
+                        self.header_size = [int(s) for s in info.split()]
+                        self.header_rows, self.header_cols = self.header_size
+                    elif line.startswith(AMat.marker_col_names):
+                        info = line[len(AMat.marker_col_names):]
+                        self.header_col_names = info.split()
+                    elif line.startswith(AMat.marker_sizes):
+                        info = line[len(AMat.marker_sizes):]
+                        self.header_sizes = [int(s) for s in info.split()]
+            else:
+                #the first non-commented line tells us that the header is done
+                data_started = True
+                float_line = [float(s) for s in line.split()]
+                if len_float_line is None:
+                    len_float_line = len(float_line)
+                    if (self.header_cols is not None) \
+                            and self.header_cols != len_float_line:
+                        print >> sys.stderr, \
+                                'WARNING: header declared %i cols but first line has %i, using %i' % \
+                                (self.header_cols, len_float_line, len_float_line)
+                else:
+                    if len_float_line != len(float_line):
+                        raise IOError('wrong line length', i, line)
+                data.extend(float_line)
+                n_data_lines += 1
+
+                if update_interval > 0 and (ofile is not None) \
+                        and n_data_lines % update_interval == 0:
+                    ofile.write('.')
+                    ofile.flush()
+
+        if update_interval > 0:
+            ofile.write('\n')
+        f.close()
+
+        # convert from array.array to numpy.ndarray
+        nshape = (len(data) / len_float_line, len_float_line)
+        self.all = numpy.frombuffer(data).reshape(nshape)
+        self.n_examples = self.all.shape[0]
+
+        # assign
+        if self.header_sizes is not None:
+            if len(self.header_sizes) > 4:
+                print >> sys.stderr, 'WARNING: ignoring sizes after 4th in %s' % path
+            leftmost = 0
+            #zip truncates to the shorter sequence, so if header_sizes has
+            # fewer than 4 entries only that many attributes are assigned
+            attrlist = ['input', 'target', 'weight', 'extra']
+            for attr, ncols in zip(attrlist, self.header_sizes): 
+                setattr(self, attr, self.all[:, leftmost:leftmost+ncols])
+                leftmost += ncols
+
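
For context, a minimal usage sketch of the new loader (not part of the changeset; the path is the module's path_MNIST constant and the head/update_interval values are arbitrary):

    from amat import AMat

    # any PLearn .amat file works; this is the path_MNIST constant defined above
    data = AMat('/u/bergstrj/pub/data/mnist.amat', head=1000, update_interval=100)

    print data.n_examples            # number of data rows actually read (at most head)
    print data.all.shape             # (n_examples, n_cols) array holding every column
    if data.input is not None:       # set only when the file had a '#sizes:' header
        print data.input.shape, data.target.shape
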
--- a/dataset.py	Tue Jun 03 21:34:40 2008 -0400
+++ b/dataset.py	Wed Jun 04 17:49:09 2008 -0400
@@ -109,10 +109,6 @@
 
      - dataset[[i1,i2,...in]] returns a dataset with examples i1,i2,...in.
 
-     - dataset[fieldname] an iterable over the values of the field fieldname across
-     the dataset (the iterable is obtained by default by calling valuesVStack
-     over the values for individual examples).
-
      - dataset.<property> returns the value of a property associated with
      the name <property>. The following properties should be supported:
           - 'description': a textual description or name for the dataset
@@ -162,10 +158,14 @@
     By convention, attributes not in attributeNames() should have a name
     starting with an underscore.
     @todo enforce/test that convention!
+
     """
 
-    numpy_vstack = lambda fieldname,values: numpy.vstack(values)
-    numpy_hstack = lambda fieldnames,values: numpy.hstack(values)
+    if 0:
+        # removed by James June 4... these aren't used anywhere according to
+        # grep
+        numpy_vstack = lambda fieldname,values: numpy.vstack(values)
+        numpy_hstack = lambda fieldnames,values: numpy.hstack(values)
         
     def __init__(self,description=None,fieldtypes=None):
         if description is None:
@@ -277,9 +277,11 @@
                     # first get the beginning of our minibatch (top of dataset)
                     first_part = self.dataset.minibatches_nowrap(self.fieldnames,self.L-self.next_row,1,self.next_row).next()
                     second_part = self.dataset.minibatches_nowrap(self.fieldnames,upper-self.L,1,0).next()
-                    minibatch = Example(self.fieldnames,
-                                        [self.dataset.valuesAppend(name,[first_part[name],second_part[name]])
-                                         for name in self.fieldnames])
+
+                    blah = [self.dataset.valuesVStack(name,[first_part[name],second_part[name]])
+                                         for name in self.fieldnames]
+                    print type(self.dataset), blah
+                    minibatch = Example(self.fieldnames,blah)
             self.next_row=upper
             self.n_batches_done+=1
             if upper >= self.L and self.n_batches:
@@ -460,12 +462,16 @@
                                             for fieldname,field_values
                                             in zip(self.fieldNames(),fields_values)]),
                 self.valuesVStack,self.valuesHStack)
-        # else check for a fieldname
-        if self.hasFields(i):
-            return self.minibatches(fieldnames=[i],minibatch_size=len(self),n_batches=1,offset=0).next()[0]
-        # else we are trying to access a property of the dataset
-        assert i in self.__dict__ # else it means we are trying to access a non-existing property
-        return self.__dict__[i]
+
+        raise TypeError(i)
+        if 0: 
+            # else check for a fieldname
+            #after talk with Yoshua June 4, this is disabled.
+            if self.hasFields(i):
+                return self.minibatches(fieldnames=[i],minibatch_size=len(self),n_batches=1,offset=0).next()[0]
+            # else we are trying to access a property of the dataset
+            assert i in self.__dict__ # else it means we are trying to access a non-existing property
+            return self.__dict__[i]
 
     def valuesHStack(self,fieldnames,fieldvalues):
         """
@@ -491,21 +497,20 @@
 
     def valuesVStack(self,fieldname,values):
         """
-        Return a value that corresponds to concatenating (vertically) several values of the
-        same field. This can be important to build a minibatch out of individual examples. This
-        is likely to involve a copy of the original values. When the values are numpy arrays, the
-        result should be numpy.vstack(values).
-        The default is to use numpy.vstack for numpy.ndarray values, and a list
-        pointing to the original values for other data types.
+        @param fieldname: the name of the field from which the values were taken
+        @type fieldname: any type
+
+        @param values: the field values to be stacked (e.g. the pieces from the
+         end and the beginning of the dataset when a minibatch wraps around)
+        @type values: list of minibatch fields (returned by minibatches_nowrap)
+
+        @return: the concatenation (stacking) of the values
+        @rtype: something suitable as a minibatch field
+
         """
-        all_numpy=True
-        for value in values:
-            if not type(value) is numpy.ndarray:
-                all_numpy=False
-        if all_numpy:
-            return numpy.vstack(values)
-        # the default implementation of vertical stacking is to put values in a list
-        return values
+        rval = []
+        for sub_batch in values:
+            rval.extend(sub_batch)
+        return rval
 
     def __or__(self,other):
         """
@@ -962,11 +967,15 @@
     def valuesVStack(self, fieldname, values):
         """Concatenate field values vertically, e.g. two vectors
         become a two-row matrix, two matrices become a longer matrix, etc."""
-        return numpy.vstack(values)
-    def valuesAppend(self, fieldname, values):
+        #print len(values)
+        for v in values:
+            if not isinstance(v, numpy.ndarray):
+                raise TypeError(v, type(v))
+
         s0 = sum([v.shape[0] for v in values])
         #TODO: there's gotta be a better way to do this!
-        rval = numpy.ndarray([s0] + values[0].shape[1:],dtype=values[0].dtype)
+        dtype = values[0].dtype
+        rval = numpy.ndarray([s0] + list(values[0].shape[1:]), dtype=dtype)
         cur_row = 0
         for v in values:
             rval[cur_row:cur_row+v.shape[0]] = v
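
For reference, a self-contained sketch of the preallocate-and-copy stacking that the hunk above folds into ArrayDataSet.valuesVStack (the helper name is illustrative; numpy.vstack, which the removed one-liner used, gives the same result):

    import numpy

    def vstack_rows(values):
        # allocate the output once, then copy each sub-array's rows into place
        n_rows = sum(v.shape[0] for v in values)
        out = numpy.ndarray([n_rows] + list(values[0].shape[1:]), dtype=values[0].dtype)
        cur_row = 0
        for v in values:
            out[cur_row:cur_row + v.shape[0]] = v
            cur_row += v.shape[0]
        return out

    a = numpy.ones((2, 3))
    b = numpy.zeros((1, 3))
    assert (vstack_rows([a, b]) == numpy.vstack([a, b])).all()
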
@@ -1028,7 +1037,7 @@
         values=self.fields_columns.values()
         if type(key) is int:
             return Example(fieldnames,
-                           [self.data[key,col] for col in values])
+                           [numpy.asarray(self.data[key,col]) for col in values])
         if type(key) is slice:
             return MinibatchDataSet(Example(fieldnames,
                                             [self.data[key,col] for col in values]))
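
A quick illustration of why the numpy.asarray wrap above helps when a field maps to a single column (the integer column index is an assumption about fields_columns, not something shown in this hunk):

    import numpy

    data = numpy.arange(12.).reshape(4, 3)
    print type(data[1, 2])                  # a numpy scalar when the column index is a plain int
    print type(numpy.asarray(data[1, 2]))   # a 0-d ndarray, so every field value is an ndarray
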
@@ -1106,198 +1115,207 @@
 
 
 class CachedDataSet(DataSet):
-  """
-  Wrap a L{DataSet} whose values are computationally expensive to obtain
-  (e.g. because they involve some computation, or disk access),
-  so that repeated accesses to the same example are done cheaply,
-  by caching every example value that has been accessed at least once.
+    """
+    Wrap a L{DataSet} whose values are computationally expensive to obtain
+    (e.g. because they involve some computation, or disk access),
+    so that repeated accesses to the same example are done cheaply,
+    by caching every example value that has been accessed at least once.
 
-  Optionally, for finite-length dataset, all the values can be computed
-  (and cached) upon construction of the CachedDataSet, rather at the
-  first access.
+    Optionally, for a finite-length dataset, all the values can be computed
+    (and cached) upon construction of the CachedDataSet, rather than at the
+    first access.
 
-  @todo: when cache_all_upon_construction create mini-batches that are as 
-  large as possible but not so large as to fill up memory.
-  
-  @todo: add disk-buffering capability, so that when the cache becomes too
-  big for memory, we cache things on disk, trying to keep in memory only
-  the record most likely to be accessed next.
-  """
-  def __init__(self,source_dataset,cache_all_upon_construction=False):
-      self.source_dataset=source_dataset
-      self.cache_all_upon_construction=cache_all_upon_construction
-      self.cached_examples = []
-      if cache_all_upon_construction:
-          # this potentially brings all the source examples
-          # into memory at once, which may be too much
-          # the work could possibly be done by minibatches
-          # that are as large as possible but no more than what memory allows.
-          fields_values = source_dataset.minibatches(minibatch_size=len(source_dataset)).__iter__().next()
-          assert all([len(self)==len(field_values) for field_values in fields_values])
-          for example in fields_values.examples():
-              self.cached_examples.append(copy.copy(example))
+    @todo: when cache_all_upon_construction is set, create mini-batches that
+    are as large as possible but not so large as to fill up memory.
+    
+    @todo: add disk-buffering capability, so that when the cache becomes too
+    big for memory, we cache things on disk, trying to keep in memory only
+    the record most likely to be accessed next.
+    """
+    def __init__(self,source_dataset,cache_all_upon_construction=False):
+        self.source_dataset=source_dataset
+        self.cache_all_upon_construction=cache_all_upon_construction
+        self.cached_examples = [] #a list of LookupList (copies)
+        if cache_all_upon_construction:
+            # this potentially brings all the source examples
+            # into memory at once, which may be too much
+            # the work could possibly be done by minibatches
+            # that are as large as possible but no more than what memory allows.
+            fields_values = source_dataset.minibatches(minibatch_size=len(source_dataset)).__iter__().next()
+            assert all([len(self)==len(fval) for fval in fields_values])
+            for example in fields_values.examples():
+                dup = copy.copy(example)
+                self.cached_examples.append(dup)
 
-      self.fieldNames = source_dataset.fieldNames
-      self.hasFields = source_dataset.hasFields
-      self.valuesHStack = source_dataset.valuesHStack
-      self.valuesVStack = source_dataset.valuesVStack
+        self.fieldNames = source_dataset.fieldNames
+        self.hasFields = source_dataset.hasFields
+        self.valuesHStack = source_dataset.valuesHStack
+        self.valuesVStack = source_dataset.valuesVStack
       
-  def __len__(self):
-      return len(self.source_dataset)
+    def __len__(self):
+        return len(self.source_dataset)
 
-  def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
-      class CacheIterator(object):
-          def __init__(self,dataset):
-              self.dataset=dataset
-              self.current=offset
-              self.all_fields = self.dataset.fieldNames()==fieldnames
-          def __iter__(self): return self
-          def next(self):
-              upper = self.current+minibatch_size
-              cache_len = len(self.dataset.cached_examples)
-              if upper>cache_len: # whole minibatch is not already in cache
-                  # cache everything from current length to upper
-                  for example in self.dataset.source_dataset[cache_len:upper]:
-                      self.dataset.cached_examples.append(example)
-              all_fields_minibatch = Example(self.dataset.fieldNames(),
-                                             zip(*self.dataset.cached_examples[self.current:self.current+minibatch_size]))
-              self.current+=minibatch_size
-              if self.all_fields:
-                  return all_fields_minibatch
-              return Example(fieldnames,[all_fields_minibatch[name] for name in fieldnames])
-      return CacheIterator(self)
+    def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
+        class CacheIterator(object):
+            def __init__(self,dataset):
+                self.dataset=dataset
+                self.current=offset
+                self.all_fields = self.dataset.fieldNames()==fieldnames
+            def __iter__(self): return self
+            def next(self):
+                upper = self.current+minibatch_size
+                cache_len = len(self.dataset.cached_examples)
+                if upper>cache_len:
+                    # whole minibatch is not already in cache
+                    # cache everything from current length to upper
+                    for example in self.dataset.source_dataset[cache_len:upper]:
+                        self.dataset.cached_examples.append(example)
+
+                next_range = slice(self.current, self.current+minibatch_size)
+                blah = self.dataset.cached_examples[next_range]
+                all_fields_minibatch = Example(self.dataset.fieldNames(), zip(*blah))
+                self.current+=minibatch_size
+
+                #little optimization to avoid second Example computation if
+                #possible.
+                if self.all_fields:
+                    return all_fields_minibatch
 
-  def __getitem__(self,i):
-      if type(i)==int and len(self.cached_examples)>i:
-          return self.cached_examples[i]
-      else:
-          return self.source_dataset[i]
-      
-  def __iter__(self):
-      class CacheIteratorIter(object):
-          def __init__(self,dataset):
-              self.dataset=dataset
-              self.l = len(dataset)
-              self.current = 0
-              self.fieldnames = self.dataset.fieldNames()
-              self.example = LookupList(self.fieldnames,[0]*len(self.fieldnames))
-          def __iter__(self): return self
-          def next(self):
-              if self.current>=self.l:
-                  raise StopIteration
-              cache_len = len(self.dataset.cached_examples)
-              if self.current>=cache_len: # whole minibatch is not already in cache
-                  # cache everything from current length to upper
-                  self.dataset.cached_examples.append(
-                      self.dataset.source_dataset[self.current])
-              self.example._values = self.dataset.cached_examples[self.current]
-              self.current+=1
-              return self.example
+                rval = Example(fieldnames,[all_fields_minibatch[name] for name in fieldnames])
+                return rval
+        return CacheIterator(self)
 
-      return CacheIteratorIter(self)
+    def __getitem__(self,i):
+        if type(i)==int and len(self.cached_examples)>i:
+            return self.cached_examples[i]
+        else:
+            return self.source_dataset[i]
+        
+    def __iter__(self):
+        class CacheIteratorIter(object):
+            def __init__(self,dataset):
+                self.dataset=dataset
+                self.l = len(dataset)
+                self.current = 0
+                self.fieldnames = self.dataset.fieldNames()
+                self.example = LookupList(self.fieldnames,[0]*len(self.fieldnames))
+            def __iter__(self): return self
+            def next(self):
+                if self.current>=self.l:
+                    raise StopIteration
+                cache_len = len(self.dataset.cached_examples)
+                if self.current>=cache_len: # whole minibatch is not already in cache
+                    # cache everything from current length to upper
+                    self.dataset.cached_examples.append(
+                        self.dataset.source_dataset[self.current])
+                self.example._values = self.dataset.cached_examples[self.current]
+                self.current+=1
+                return self.example
+
+        return CacheIteratorIter(self)
 
 class ApplyFunctionDataSet(DataSet):
-  """
-  A L{DataSet} that contains as fields the results of applying a
-  given function example-wise or minibatch-wise to all the fields of
-  an input dataset.  The output of the function should be an iterable
-  (e.g. a list or a LookupList) over the resulting values.
-  
-  The function take as input the fields of the dataset, not the examples.
+    """
+    A L{DataSet} that contains as fields the results of applying a
+    given function example-wise or minibatch-wise to all the fields of
+    an input dataset.  The output of the function should be an iterable
+    (e.g. a list or a LookupList) over the resulting values.
+    
+    The function takes as input the fields of the dataset, not the examples.
 
-  In minibatch mode, the function is expected to work on minibatches
-  (takes a minibatch in input and returns a minibatch in output). More
-  precisely, it means that each element of the input or output list
-  should be iterable and indexable over the individual example values
-  (typically these elements will be numpy arrays). All of the elements
-  in the input and output lists should have the same length, which is
-  the length of the minibatch.
+    In minibatch mode, the function is expected to work on minibatches
+    (takes a minibatch in input and returns a minibatch in output). More
+    precisely, it means that each element of the input or output list
+    should be iterable and indexable over the individual example values
+    (typically these elements will be numpy arrays). All of the elements
+    in the input and output lists should have the same length, which is
+    the length of the minibatch.
 
-  The function is applied each time an example or a minibatch is accessed.
-  To avoid re-doing computation, wrap this dataset inside a CachedDataSet.
+    The function is applied each time an example or a minibatch is accessed.
+    To avoid re-doing computation, wrap this dataset inside a CachedDataSet.
 
-  If the values_{h,v}stack functions are not provided, then
-  the input_dataset.values{H,V}Stack functions are used by default.
-  """
-  def __init__(self,input_dataset,function,output_names,minibatch_mode=True,
-               values_hstack=None,values_vstack=None,
-               description=None,fieldtypes=None):
-      """
-      Constructor takes an input dataset that has as many fields as the function
-      expects as inputs. The resulting dataset has as many fields as the function
-      produces as outputs, and that should correspond to the number of output names
-      (provided in a list).
+    If the values_{h,v}stack functions are not provided, then
+    the input_dataset.values{H,V}Stack functions are used by default.
+    """
+    def __init__(self,input_dataset,function,output_names,minibatch_mode=True,
+                 values_hstack=None,values_vstack=None,
+                 description=None,fieldtypes=None):
+        """
+        Constructor takes an input dataset that has as many fields as the function
+        expects as inputs. The resulting dataset has as many fields as the function
+        produces as outputs, which should correspond to the number of output names
+        (provided in a list).
 
-      Note that the expected semantics of the function differs in minibatch mode
-      (it takes minibatches of inputs and produces minibatches of outputs, as
-      documented in the class comment).
+        Note that the expected semantics of the function differs in minibatch mode
+        (it takes minibatches of inputs and produces minibatches of outputs, as
+        documented in the class comment).
 
-      TBM: are filedtypes the old field types (from input_dataset) or the new ones
-      (for the new dataset created)?
-      """
-      self.input_dataset=input_dataset
-      self.function=function
-      self.output_names=output_names
-      self.minibatch_mode=minibatch_mode
-      DataSet.__init__(self,description,fieldtypes)
-      self.valuesHStack = values_hstack if values_hstack else input_dataset.valuesHStack
-      self.valuesVStack = values_vstack if values_vstack else input_dataset.valuesVStack
+        TBM: are fieldtypes the old field types (from input_dataset) or the new ones
+        (for the new dataset created)?
+        """
+        self.input_dataset=input_dataset
+        self.function=function
+        self.output_names=output_names
+        self.minibatch_mode=minibatch_mode
+        DataSet.__init__(self,description,fieldtypes)
+        self.valuesHStack = values_hstack if values_hstack else input_dataset.valuesHStack
+        self.valuesVStack = values_vstack if values_vstack else input_dataset.valuesVStack
 
-  def __len__(self):
-      return len(self.input_dataset)
+    def __len__(self):
+        return len(self.input_dataset)
 
-  def fieldNames(self):
-      return self.output_names
+    def fieldNames(self):
+        return self.output_names
 
-  def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
-      class ApplyFunctionIterator(object):
-          def __init__(self,output_dataset):
-              self.input_dataset=output_dataset.input_dataset
-              self.output_dataset=output_dataset
-              self.input_iterator=self.input_dataset.minibatches(minibatch_size=minibatch_size,
-                                                                 n_batches=n_batches,offset=offset).__iter__()
+    def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
+        class ApplyFunctionIterator(object):
+            def __init__(self,output_dataset):
+                self.input_dataset=output_dataset.input_dataset
+                self.output_dataset=output_dataset
+                self.input_iterator=self.input_dataset.minibatches(minibatch_size=minibatch_size,
+                                                                   n_batches=n_batches,offset=offset).__iter__()
 
-          def __iter__(self): return self
+            def __iter__(self): return self
 
-          def next(self):
-              function_inputs = self.input_iterator.next()
-              all_output_names = self.output_dataset.output_names
-              if self.output_dataset.minibatch_mode:
-                  function_outputs = self.output_dataset.function(*function_inputs)
-              else:
-                  input_examples = zip(*function_inputs)
-                  output_examples = [self.output_dataset.function(*input_example)
-                                     for input_example in input_examples]
-                  function_outputs = [self.output_dataset.valuesVStack(name,values)
-                                      for name,values in zip(all_output_names,
-                                                             zip(*output_examples))]
-              all_outputs = Example(all_output_names,function_outputs)
-              if fieldnames==all_output_names:
-                  return all_outputs
-              return Example(fieldnames,[all_outputs[name] for name in fieldnames])
+            def next(self):
+                function_inputs = self.input_iterator.next()
+                all_output_names = self.output_dataset.output_names
+                if self.output_dataset.minibatch_mode:
+                    function_outputs = self.output_dataset.function(*function_inputs)
+                else:
+                    input_examples = zip(*function_inputs)
+                    output_examples = [self.output_dataset.function(*input_example)
+                                       for input_example in input_examples]
+                    function_outputs = [self.output_dataset.valuesVStack(name,values)
+                                        for name,values in zip(all_output_names,
+                                                               zip(*output_examples))]
+                all_outputs = Example(all_output_names,function_outputs)
+                if fieldnames==all_output_names:
+                    return all_outputs
+                return Example(fieldnames,[all_outputs[name] for name in fieldnames])
 
 
-      return ApplyFunctionIterator(self)
+        return ApplyFunctionIterator(self)
 
-  def __iter__(self): # only implemented for increased efficiency
-      class ApplyFunctionSingleExampleIterator(object):
-          def __init__(self,output_dataset):
-              self.current=0
-              self.output_dataset=output_dataset
-              self.input_iterator=output_dataset.input_dataset.__iter__()
-          def __iter__(self): return self
-          def next(self):
-              if self.output_dataset.minibatch_mode:
-                  function_inputs = [[input] for input in self.input_iterator.next()]
-                  outputs = self.output_dataset.function(*function_inputs)
-                  assert all([hasattr(output,'__iter__') for output in outputs])
-                  function_outputs = [output[0] for output in outputs]
-              else:
-                  function_inputs = self.input_iterator.next()
-                  function_outputs = self.output_dataset.function(*function_inputs)
-              return Example(self.output_dataset.output_names,function_outputs)
-      return ApplyFunctionSingleExampleIterator(self)
-  
+    def __iter__(self): # only implemented for increased efficiency
+        class ApplyFunctionSingleExampleIterator(object):
+            def __init__(self,output_dataset):
+                self.current=0
+                self.output_dataset=output_dataset
+                self.input_iterator=output_dataset.input_dataset.__iter__()
+            def __iter__(self): return self
+            def next(self):
+                if self.output_dataset.minibatch_mode:
+                    function_inputs = [[input] for input in self.input_iterator.next()]
+                    outputs = self.output_dataset.function(*function_inputs)
+                    assert all([hasattr(output,'__iter__') for output in outputs])
+                    function_outputs = [output[0] for output in outputs]
+                else:
+                    function_inputs = self.input_iterator.next()
+                    function_outputs = self.output_dataset.function(*function_inputs)
+                return Example(self.output_dataset.output_names,function_outputs)
+        return ApplyFunctionSingleExampleIterator(self)
+    
 
 def supervised_learning_dataset(src_dataset,input_fields,target_fields,weight_field=None):
     """
--- a/test_dataset.py	Tue Jun 03 21:34:40 2008 -0400
+++ b/test_dataset.py	Wed Jun 04 17:49:09 2008 -0400
@@ -421,7 +421,7 @@
 
     test_all(a2,ds)
 
-    del a2, ds
+    del a2, ds #removes from list of active objects in debugger
 
 def test_LookupList():
     #test only the example in the doc???
@@ -642,7 +642,8 @@
 
 
 if __name__=='__main__':
-    test1()
+    if 0:
+        test1()
     test_LookupList()
     test_ArrayDataSet()
     test_CachedDataSet()