Mercurial > pylearn

--- a/dataset.py	Thu Jun 05 14:16:46 2008 -0400
+++ b/dataset.py	Thu Jun 05 18:38:42 2008 -0400
@@ -1,6 +1,5 @@

-from lookup_list import LookupList
-Example = LookupList
+from lookup_list import LookupList as Example
 from misc import unique_elements_list_intersection
 from string import join
 from sys import maxint
@@ -38,7 +37,6 @@
         else:
             return [self.__getattribute__(name) for name in attribute_names]

-
 class DataSet(AttributesHolder):
     """A virtual base class for datasets.

@@ -234,9 +232,8 @@
             self.n_batches=n_batches
             self.n_batches_done=0
             self.next_row=offset
-            self.offset=offset
             self.L=len(dataset)
-            assert offset+minibatch_size<=self.L
+            self.offset=offset % self.L
             ds_nbatches =  (self.L-self.next_row)/self.minibatch_size
             if n_batches is not None:
                 ds_nbatches = min(n_batches,ds_nbatches)
@@ -244,8 +241,7 @@
                 assert dataset.hasFields(*fieldnames)
             else:
                 self.fieldnames=dataset.fieldNames()
-            self.iterator = self.dataset.minibatches_nowrap(self.fieldnames,self.minibatch_size,
-                                                            ds_nbatches,self.next_row)
+            self.iterator = self.dataset.minibatches_nowrap(self.fieldnames,self.minibatch_size, ds_nbatches,self.next_row)

         def __iter__(self):
             return self
@@ -318,7 +314,7 @@
         f1, f2, and f3 fields of a batch of examples on each loop iteration.

         The minibatches iterator is expected to return upon each call to next()
-        a DataSetFields object, which is a LookupList (indexed by the field names) whose
+        a DataSetFields object, which is a Example (indexed by the field names) whose
         elements are iterable and indexable over the minibatch examples, and which keeps a pointer to
         a sub-dataset that can be used to iterate over the individual examples
         in the minibatch. Hence a minibatch can be converted back to a regular
@@ -424,52 +420,70 @@

     def __getitem__(self,i):
         """
-        dataset[i] returns the (i+1)-th example of the dataset.
-        dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.
-        dataset[i:j:s] returns the subdataset with examples i,i+2,i+4...,j-2.
-        dataset[[i1,i2,..,in]] returns the subdataset with examples i1,i2,...,in.
-        dataset['key'] returns a property associated with the given 'key' string.
-        If 'key' is a fieldname, then the VStacked field values (iterable over
-        field values) for that field is returned. Other keys may be supported
-        by different dataset subclasses. The following key names are encouraged:
-          - 'description': a textual description or name for the dataset
-          - '<fieldname>.type': a type name or value for a given <fieldname>
+        @rtype: Example
+        @returns: single or multiple examples

-        Note that some stream datasets may be unable to implement random access, i.e.
-        arbitrary slicing/indexing
-        because they can only iterate through examples one or a minibatch at a time
-        and do not actually store or keep past (or future) examples.
+        @type i: integer or slice or <iterable> of integers
+        @param i:
+            dataset[i] returns the (i+1)-th example of the dataset.
+            dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.
+            dataset[i:j:s] returns the subdataset with examples i,i+2,i+4...,j-2.
+            dataset[[i1,i2,..,in]] returns the subdataset with examples i1,i2,...,in.
+
+        @note:
+        Some stream datasets may be unable to implement random access, i.e.
+        arbitrary slicing/indexing because they can only iterate through
+        examples one or a minibatch at a time and do not actually store or keep
+        past (or future) examples.

         The default implementation of getitem uses the minibatches iterator
         to obtain one example, one slice, or a list of examples. It may not
         always be the most efficient way to obtain the result, especially if
         the data are actually stored in a memory array.
         """
-        # check for an index
+
         if type(i) is int:
-            return DataSet.MinibatchToSingleExampleIterator(
-                self.minibatches(minibatch_size=1,n_batches=1,offset=i)).next()
-        rows=None
-        # or a slice
+            #TODO: consider asserting that i >= 0
+            i_batch = self.minibatches_nowrap(self.fieldNames(),
+                    minibatch_size=1, n_batches=1, offset=i % len(self))
+            return DataSet.MinibatchToSingleExampleIterator(i_batch).next()
+
+        #if i is a contiguous slice
+        if type(i) is slice and (i.step in (None, 1)):
+            offset = 0 if i.start is None else i.start
+            upper_bound = len(self) if i.stop is None else i.stop
+            return MinibatchDataSet(self.minibatches_nowrap(self.fieldNames(),
+                    minibatch_size=upper_bound - offset,
+                    n_batches=1,
+                    offset=offset).next())
+
+        # if slice has a step param, convert it to list and handle it with the
+        # list code
         if type(i) is slice:
-            #print 'i=',i
-            if not i.start: i=slice(0,i.stop,i.step)
-            if not i.stop: i=slice(i.start,len(self),i.step)
-            if not i.step: i=slice(i.start,i.stop,1)
-            if i.step is 1:
-                return self.minibatches(minibatch_size=i.stop-i.start,n_batches=1,offset=i.start).next().examples()
-            rows = range(i.start,i.stop,i.step)
-        # or a list of indices
-        elif type(i) is list:
-            rows = i
-        if rows is not None:
-            examples = [self[row] for row in rows]
-            fields_values = zip(*examples)
-            return MinibatchDataSet(
-                Example(self.fieldNames(),[ self.valuesVStack(fieldname,field_values)
-                                            for fieldname,field_values
-                                            in zip(self.fieldNames(),fields_values)]),
-                self.valuesVStack,self.valuesHStack)
+            offset = 0 if i.start is None else i.start
+            upper_bound = len(self) if i.stop is None else i.stop
+            i = list(range(offset, upper_bound, i.step))
+
+        # handle tuples, arrays, lists
+        if hasattr(i, '__getitem__'):
+            for idx in i:
+                #dis-allow nested slices
+                if not isinstance(idx, int):
+                    raise TypeError(idx)
+            # call back into self.__getitem__
+            examples = [self.minibatches_nowrap(self.fieldNames(),
+                    minibatch_size=1, n_batches=1, offset=ii%len(self)).next()
+                    for ii in i]
+            # re-index the fields in each example by field instead of by example
+            field_values = [[] for blah in  self.fieldNames()]
+            for e in examples:
+                for f,v in zip(field_values, e):
+                    f.append(v)
+            #build them into a LookupList (a.ka. Example)
+            zz = zip(self.fieldNames(),field_values)
+            vst = [self.valuesVStack(fieldname,field_values) for fieldname,field_values in zz]
+            example = Example(self.fieldNames(), vst)
+            return MinibatchDataSet(example, self.valuesVStack, self.valuesHStack)
         raise TypeError(i, type(i))

     def valuesHStack(self,fieldnames,fieldvalues):
@@ -493,24 +507,21 @@
         # the default implementation of horizontal stacking is to put values in a list
         return fieldvalues

-
     def valuesVStack(self,fieldname,values):
         """
-        Return a value that corresponds to concatenating (vertically) several values of the
-        same field. This can be important to build a minibatch out of individual examples. This
-        is likely to involve a copy of the original values. When the values are numpy arrays, the
-        result should be numpy.vstack(values).
-        The default is to use numpy.vstack for numpy.ndarray values, and a list
-        pointing to the original values for other data types.
+        @param fieldname: the name of the field from which the values were taken
+        @type fieldname: any type
+
+        @param values: bits near the beginning or end of the dataset
+        @type values: list of minibatches (returned by minibatch_nowrap)
+
+        @return: the concatenation (stacking) of the values
+        @rtype: something suitable as a minibatch field
         """
-        all_numpy=True
-        for value in values:
-            if not type(value) is numpy.ndarray:
-                all_numpy=False
-        if all_numpy:
-            return numpy.vstack(values)
-        # the default implementation of vertical stacking is to put values in a list
-        return values
+        rval = []
+        for v in values:
+            rval.extend(v)
+        return rval

     def __or__(self,other):
         """
@@ -586,11 +597,11 @@
     def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
         assert self.hasFields(*fieldnames)
         return self.src.minibatches_nowrap(fieldnames,minibatch_size,n_batches,offset)
-    def __getitem__(self,i):
+    def dontuse__getitem__(self,i):
         return FieldsSubsetDataSet(self.src[i],self.fieldnames)


-class DataSetFields(LookupList):
+class DataSetFields(Example):
     """
     Although a L{DataSet} iterates over examples (like rows of a matrix), an associated
     DataSetFields iterates over fields (like columns of a matrix), and can be understood
@@ -628,9 +639,9 @@
         self.dataset=dataset

         if isinstance(dataset,MinibatchDataSet):
-            LookupList.__init__(self,fieldnames,list(dataset._fields))
+            Example.__init__(self,fieldnames,list(dataset._fields))
         elif isinstance(original_dataset,MinibatchDataSet):
-            LookupList.__init__(self,fieldnames,
+            Example.__init__(self,fieldnames,
                                 [original_dataset._fields[field]
                                  for field in fieldnames])
         else:
@@ -638,7 +649,7 @@
                                                      minibatch_size=len(dataset),
                                                      n_batches=1)
             minibatch=minibatch_iterator.next()
-            LookupList.__init__(self,fieldnames,minibatch)
+            Example.__init__(self,fieldnames,minibatch)

     def examples(self):
         return self.dataset
@@ -660,7 +671,7 @@

 class MinibatchDataSet(DataSet):
     """
-    Turn a L{LookupList} of same-length (iterable) fields into an example-iterable dataset.
+    Turn a L{Example} of same-length (iterable) fields into an example-iterable dataset.
     Each element of the lookup-list should be an iterable and sliceable, all of the same length.
     """
     def __init__(self,fields_lookuplist,values_vstack=DataSet().valuesVStack,
@@ -680,14 +691,15 @@
                 print 'len(field) = ', len(field)
                 print 'self._fields.keys() = ', self._fields.keys()
                 print 'field=',field
+                print 'fields_lookuplist=', fields_lookuplist
             assert self.length==len(field)
-        self.values_vstack=values_vstack
-        self.values_hstack=values_hstack
+        self.valuesVStack=values_vstack
+        self.valuesHStack=values_hstack

     def __len__(self):
         return self.length

-    def __getitem__(self,i):
+    def dontuse__getitem__(self,i):
         if type(i) in (slice,list):
             return DataSetFields(MinibatchDataSet(
                 Example(self._fields.keys(),[field[i] for field in self._fields])),self.fieldNames())
@@ -717,7 +729,7 @@

                 self.ds=ds
                 self.next_example=offset
-                assert minibatch_size > 0
+                assert minibatch_size >= 0
                 if offset+minibatch_size > ds.length:
                     raise NotImplementedError()
             def __iter__(self):
@@ -741,12 +753,6 @@
         # tbm: added fieldnames to handle subset of fieldnames
         return Iterator(self,fieldnames)

-    def valuesVStack(self,fieldname,fieldvalues):
-        return self.values_vstack(fieldname,fieldvalues)
-
-    def valuesHStack(self,fieldnames,fieldvalues):
-        return self.values_hstack(fieldnames,fieldvalues)
-
 class HStackedDataSet(DataSet):
     """
     A L{DataSet} that wraps several datasets and shows a view that includes all their fields,
@@ -810,7 +816,7 @@
                 return self
             def next(self):
                 # concatenate all the fields of the minibatches
-                l=LookupList()
+                l=Example()
                 for iter in self.iterators:
                     l.append_lookuplist(iter.next())
                 return l
@@ -834,10 +840,10 @@
         return HStackedIterator(self,iterators)


-    def valuesVStack(self,fieldname,fieldvalues):
+    def untested_valuesVStack(self,fieldname,fieldvalues):
         return self.datasets[self.fieldname2dataset[fieldname]].valuesVStack(fieldname,fieldvalues)

-    def valuesHStack(self,fieldnames,fieldvalues):
+    def untested_valuesHStack(self,fieldnames,fieldvalues):
         """
         We will use the sub-dataset associated with the first fieldname in the fieldnames list
         to do the work, hoping that it can cope with the other values (i.e. won't care
@@ -961,11 +967,11 @@
     """
     def __init__(self,description=None,field_types=None):
         DataSet.__init__(self,description,field_types)
-    def valuesHStack(self,fieldnames,fieldvalues):
+    def untested_valuesHStack(self,fieldnames,fieldvalues):
         """Concatenate field values horizontally, e.g. two vectors
         become a longer vector, two matrices become a wider matrix, etc."""
         return numpy.hstack(fieldvalues)
-    def valuesVStack(self,fieldname,values):
+    def untested_valuesVStack(self,fieldname,values):
         """Concatenate field values vertically, e.g. two vectors
         become a two-row matrix, two matrices become a longer matrix, etc."""
         return numpy.vstack(values)
@@ -1019,7 +1025,7 @@
     def __len__(self):
         return len(self.data)

-    def __getitem__(self,key):
+    def dontuse__getitem__(self,key):
         """More efficient implementation than the default __getitem__"""
         fieldnames=self.fields_columns.keys()
         values=self.fields_columns.values()
@@ -1051,12 +1057,12 @@
         assert key in self.__dict__ # else it means we are trying to access a non-existing property
         return self.__dict__[key]

-    def __iter__(self):
+    def dontuse__iter__(self):
         class ArrayDataSetIteratorIter(object):
             def __init__(self,dataset,fieldnames):
                 if fieldnames is None: fieldnames = dataset.fieldNames()
                 # store the resulting minibatch in a lookup-list of values
-                self.minibatch = LookupList(fieldnames,[0]*len(fieldnames))
+                self.minibatch = Example(fieldnames,[0]*len(fieldnames))
                 self.dataset=dataset
                 self.current=0
                 self.columns = [self.dataset.fields_columns[f]
@@ -1078,26 +1084,17 @@
         return ArrayDataSetIteratorIter(self,self.fieldNames())

     def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
-        class ArrayDataSetIterator(object):
-            def __init__(self,dataset,fieldnames,minibatch_size,n_batches,offset):
-                if fieldnames is None: fieldnames = dataset.fieldNames()
-                # store the resulting minibatch in a lookup-list of values
-                self.minibatch = LookupList(fieldnames,[0]*len(fieldnames))
-                self.dataset=dataset
-                self.minibatch_size=minibatch_size
-                assert offset>=0 and offset<len(dataset.data)
-                assert offset+minibatch_size<=len(dataset.data)
-                self.current=offset
-            def __iter__(self):
-                return self
-            def next(self):
-                #@todo: we suppose that MinibatchWrapAroundIterator stop the iterator
-                sub_data =  self.dataset.data[self.current:self.current+self.minibatch_size]
-                self.minibatch._values = [sub_data[:,self.dataset.fields_columns[f]] for f in self.minibatch._names]
-                self.current+=self.minibatch_size
-                return self.minibatch
+        cursor = Example(fieldnames,[0]*len(fieldnames))
+        fieldnames = self.fieldNames() if fieldnames is None else fieldnames
+        for n in xrange(n_batches):
+            if offset == len(self):
+                break
+            sub_data = self.data[offset : offset+minibatch_size]
+            offset += len(sub_data) #can be less than minibatch_size at end
+            cursor._values = [sub_data[:,self.fields_columns[f]] for f in cursor._names]
+            yield cursor

-        return ArrayDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset)
+        #return ArrayDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset)


 class CachedDataSet(DataSet):
@@ -1162,7 +1159,7 @@
               return Example(fieldnames,[all_fields_minibatch[name] for name in fieldnames])
       return CacheIterator(self)

-  def __getitem__(self,i):
+  def dontuse__getitem__(self,i):
       if type(i)==int and len(self.cached_examples)>i:
           return self.cached_examples[i]
       else:
@@ -1175,7 +1172,7 @@
               self.l = len(dataset)
               self.current = 0
               self.fieldnames = self.dataset.fieldNames()
-              self.example = LookupList(self.fieldnames,[0]*len(self.fieldnames))
+              self.example = Example(self.fieldnames,[0]*len(self.fieldnames))
           def __iter__(self): return self
           def next(self):
               if self.current>=self.l:
@@ -1192,107 +1189,99 @@
       return CacheIteratorIter(self)

 class ApplyFunctionDataSet(DataSet):
-  """
-  A L{DataSet} that contains as fields the results of applying a
-  given function example-wise or minibatch-wise to all the fields of
-  an input dataset.  The output of the function should be an iterable
-  (e.g. a list or a LookupList) over the resulting values.
-
-  The function take as input the fields of the dataset, not the examples.
+    """
+    A L{DataSet} that contains as fields the results of applying a
+    given function example-wise or minibatch-wise to all the fields of
+    an input dataset.  The output of the function should be an iterable
+    (e.g. a list or a Example) over the resulting values.
+
+    The function take as input the fields of the dataset, not the examples.

-  In minibatch mode, the function is expected to work on minibatches
-  (takes a minibatch in input and returns a minibatch in output). More
-  precisely, it means that each element of the input or output list
-  should be iterable and indexable over the individual example values
-  (typically these elements will be numpy arrays). All of the elements
-  in the input and output lists should have the same length, which is
-  the length of the minibatch.
+    In minibatch mode, the function is expected to work on minibatches
+    (takes a minibatch in input and returns a minibatch in output). More
+    precisely, it means that each element of the input or output list
+    should be iterable and indexable over the individual example values
+    (typically these elements will be numpy arrays). All of the elements
+    in the input and output lists should have the same length, which is
+    the length of the minibatch.

-  The function is applied each time an example or a minibatch is accessed.
-  To avoid re-doing computation, wrap this dataset inside a CachedDataSet.
+    The function is applied each time an example or a minibatch is accessed.
+    To avoid re-doing computation, wrap this dataset inside a CachedDataSet.

-  If the values_{h,v}stack functions are not provided, then
-  the input_dataset.values{H,V}Stack functions are used by default.
-  """
-  def __init__(self,input_dataset,function,output_names,minibatch_mode=True,
-               values_hstack=None,values_vstack=None,
-               description=None,fieldtypes=None):
-      """
-      Constructor takes an input dataset that has as many fields as the function
-      expects as inputs. The resulting dataset has as many fields as the function
-      produces as outputs, and that should correspond to the number of output names
-      (provided in a list).
+    If the values_{h,v}stack functions are not provided, then
+    the input_dataset.values{H,V}Stack functions are used by default.
+    """
+    def __init__(self,input_dataset,function,output_names,minibatch_mode=True,
+                 values_hstack=None,values_vstack=None,
+                 description=None,fieldtypes=None):
+        """
+        Constructor takes an input dataset that has as many fields as the function
+        expects as inputs. The resulting dataset has as many fields as the function
+        produces as outputs, and that should correspond to the number of output names
+        (provided in a list).

-      Note that the expected semantics of the function differs in minibatch mode
-      (it takes minibatches of inputs and produces minibatches of outputs, as
-      documented in the class comment).
+        Note that the expected semantics of the function differs in minibatch mode
+        (it takes minibatches of inputs and produces minibatches of outputs, as
+        documented in the class comment).

-      TBM: are filedtypes the old field types (from input_dataset) or the new ones
-      (for the new dataset created)?
-      """
-      self.input_dataset=input_dataset
-      self.function=function
-      self.output_names=output_names
-      self.minibatch_mode=minibatch_mode
-      DataSet.__init__(self,description,fieldtypes)
-      self.valuesHStack = values_hstack if values_hstack else input_dataset.valuesHStack
-      self.valuesVStack = values_vstack if values_vstack else input_dataset.valuesVStack
-
-  def __len__(self):
-      return len(self.input_dataset)
+        TBM: are filedtypes the old field types (from input_dataset) or the new ones
+        (for the new dataset created)?
+        """
+        self.input_dataset=input_dataset
+        self.function=function
+        self.output_names=output_names
+        self.minibatch_mode=minibatch_mode
+        DataSet.__init__(self,description,fieldtypes)
+        self.valuesHStack = values_hstack if values_hstack else input_dataset.valuesHStack
+        self.valuesVStack = values_vstack if values_vstack else input_dataset.valuesVStack

-  def fieldNames(self):
-      return self.output_names
+    def __len__(self):
+        return len(self.input_dataset)

-  def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
-      class ApplyFunctionIterator(object):
-          def __init__(self,output_dataset):
-              self.input_dataset=output_dataset.input_dataset
-              self.output_dataset=output_dataset
-              self.input_iterator=self.input_dataset.minibatches(minibatch_size=minibatch_size,
-                                                                 n_batches=n_batches,offset=offset).__iter__()
+    def fieldNames(self):
+        return self.output_names

-          def __iter__(self): return self
+    def minibatches_nowrap(self, fieldnames, *args, **kwargs):
+        for fields in self.input_dataset.minibatches_nowrap(fieldnames, *args, **kwargs):

-          def next(self):
-              function_inputs = self.input_iterator.next()
-              all_output_names = self.output_dataset.output_names
-              if self.output_dataset.minibatch_mode:
-                  function_outputs = self.output_dataset.function(*function_inputs)
-              else:
-                  input_examples = zip(*function_inputs)
-                  output_examples = [self.output_dataset.function(*input_example)
-                                     for input_example in input_examples]
-                  function_outputs = [self.output_dataset.valuesVStack(name,values)
-                                      for name,values in zip(all_output_names,
-                                                             zip(*output_examples))]
-              all_outputs = Example(all_output_names,function_outputs)
-              if fieldnames==all_output_names:
-                  return all_outputs
-              return Example(fieldnames,[all_outputs[name] for name in fieldnames])
-
-
-      return ApplyFunctionIterator(self)
+            #function_inputs = self.input_iterator.next()
+            if self.minibatch_mode:
+                function_outputs = self.function(*fields)
+            else:
+                input_examples = zip(*fields)
+                output_examples = [self.function(*input_example)
+                                    for input_example in input_examples]
+                function_outputs = [self.valuesVStack(name,values)
+                                    for name,values in zip(self.output_names,
+                                                           zip(*output_examples))]
+            all_outputs = Example(self.output_names, function_outputs)
+            print fields
+            print all_outputs
+            print '--------'
+            if fieldnames==self.output_names:
+                yield all_outputs
+            else:
+                yield Example(fieldnames,[all_outputs[name] for name in fieldnames])

-  def __iter__(self): # only implemented for increased efficiency
-      class ApplyFunctionSingleExampleIterator(object):
-          def __init__(self,output_dataset):
-              self.current=0
-              self.output_dataset=output_dataset
-              self.input_iterator=output_dataset.input_dataset.__iter__()
-          def __iter__(self): return self
-          def next(self):
-              if self.output_dataset.minibatch_mode:
-                  function_inputs = [[input] for input in self.input_iterator.next()]
-                  outputs = self.output_dataset.function(*function_inputs)
-                  assert all([hasattr(output,'__iter__') for output in outputs])
-                  function_outputs = [output[0] for output in outputs]
-              else:
-                  function_inputs = self.input_iterator.next()
-                  function_outputs = self.output_dataset.function(*function_inputs)
-              return Example(self.output_dataset.output_names,function_outputs)
-      return ApplyFunctionSingleExampleIterator(self)
-
+    def untested__iter__(self): # only implemented for increased efficiency
+        class ApplyFunctionSingleExampleIterator(object):
+            def __init__(self,output_dataset):
+                self.current=0
+                self.output_dataset=output_dataset
+                self.input_iterator=output_dataset.input_dataset.__iter__()
+            def __iter__(self): return self
+            def next(self):
+                if self.output_dataset.minibatch_mode:
+                    function_inputs = [[input] for input in self.input_iterator.next()]
+                    outputs = self.output_dataset.function(*function_inputs)
+                    assert all([hasattr(output,'__iter__') for output in outputs])
+                    function_outputs = [output[0] for output in outputs]
+                else:
+                    function_inputs = self.input_iterator.next()
+                    function_outputs = self.output_dataset.function(*function_inputs)
+                return Example(self.output_dataset.output_names,function_outputs)
+        return ApplyFunctionSingleExampleIterator(self)
+

 def supervised_learning_dataset(src_dataset,input_fields,target_fields,weight_field=None):
     """
--- a/test_dataset.py	Thu Jun 05 14:16:46 2008 -0400
+++ b/test_dataset.py	Thu Jun 05 18:38:42 2008 -0400
@@ -215,19 +215,25 @@
     del x,y,i,id,m

     i=0
-    m=ds.minibatches(['x','y'],n_batches=20,minibatch_size=3,offset=4)
+    if 0: #this would trigger the bug mentioned in #25
+        test_offset = len(ds) - 1
+    else:
+        test_offset = 4
+    m=ds.minibatches(['x','y'],n_batches=20,minibatch_size=3,offset=test_offset)
     assert isinstance(m,DataSet.MinibatchWrapAroundIterator)
     for x,y in m:
         assert len(x)==m.minibatch_size
         assert len(y)==m.minibatch_size
         for id in range(m.minibatch_size):
-            assert (numpy.append(x[id],y[id])==array[(i+4)%array.shape[0]]).all()
+            assert (numpy.append(x[id],y[id])==array[(i+test_offset)%array.shape[0]]).all()
             i+=1
     assert i==m.n_batches*m.minibatch_size
     del x,y,i,id

-    #@todo: we can't do minibatch bigger then the size of the dataset???
-    assert have_raised2(ds.minibatches,['x','y'],n_batches=1,minibatch_size=len(array)+1,offset=0)
+    if 0:
+        # Let's not put buggy behaviour as the target behaviour in the test
+        # suite. --JSB
+        assert have_raised2(ds.minibatches,['x','y'],n_batches=1,minibatch_size=len(array)+1,offset=0)
     assert not have_raised2(ds.minibatches,['x','y'],n_batches=1,minibatch_size=len(array),offset=0)

 def test_ds_iterator(array,iterator1,iterator2,iterator3):
@@ -419,7 +425,7 @@
     print "test_ArrayDataSet"
     a2 = numpy.random.rand(10,4)
     ds = ArrayDataSet(a2,{'x':slice(3),'y':3,'z':[0,2]})###???tuple not tested
-    ds = ArrayDataSet(a2,LookupList(['x','y','z'],[slice(3),3,[0,2]]))###???tuple not tested
+    ds = ArrayDataSet(a2,Example(['x','y','z'],[slice(3),3,[0,2]]))###???tuple not tested
     #assert ds==a? should this work?

     test_all(a2,ds)
@@ -429,7 +435,7 @@
 def test_CachedDataSet():
     print "test_CacheDataSet"
     a = numpy.random.rand(10,4)
-    ds1 = ArrayDataSet(a,LookupList(['x','y','z'],[slice(3),3,[0,2]]))###???tuple not tested
+    ds1 = ArrayDataSet(a,Example(['x','y','z'],[slice(3),3,[0,2]]))###???tuple not tested
     ds2 = CachedDataSet(ds1)
     ds3 = CachedDataSet(ds1,cache_all_upon_construction=True)

@@ -447,9 +453,16 @@
     print "test_ApplyFunctionDataSet"
     a = numpy.random.rand(10,4)
     a2 = a+1
-    ds1 = ArrayDataSet(a,LookupList(['x','y','z'],[slice(3),3,[0,2]]))###???tuple not tested
+    ds1 = ArrayDataSet(a,Example(['x','y','z'],[slice(3),3,[0,2]]))###???tuple not tested

     ds2 = ApplyFunctionDataSet(ds1,lambda x,y,z: (x+1,y+1,z+1), ['x','y','z'],minibatch_mode=False)
+
+    print ds1.fields('x', 'y', 'z')
+    print '   '
+    print ds2.fields('x', 'y', 'z')
+    print '-----------  '
+
+
     ds3 = ApplyFunctionDataSet(ds1,lambda x,y,z: (numpy.array(x)+1,numpy.array(y)+1,numpy.array(z)+1),
                                ['x','y','z'],
                                minibatch_mode=True)
@@ -540,10 +553,7 @@


 if __name__=='__main__':
-    test1()
     test_ArrayDataSet()
-    test_CachedDataSet()
-    test_ApplyFunctionDataSet()
-    #test_speed()
-#test pmat.py
+    #test_CachedDataSet()
+    #test_ApplyFunctionDataSet()