comparison dataset.py @ 17:759d17112b23
more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
author | bergstrj@iro.umontreal.ca
date | Wed, 26 Mar 2008 21:05:14 -0400
parents | 813723310d75 ff4e551490f1
children | 57f4015e2e09
16:813723310d75 (old) | 17:759d17112b23 (new) |
1 | 1 |
2 class Example(object): | 2 from lookup_list import LookupList |
3 """ | 3 Example = LookupList |
4 An example is something that is like a tuple but whose elements can be named, so that the | 4 |
5 following syntactic constructions work as one would expect: | 5 class AbstractFunction (Exception): """Derived class must override this function""" |
6 example.x = [1, 2, 3] # set a field | 6 |
7 x, y, z = example | |
8 x = example[0] | |
9 x = example["x"] | |
10 """ | |
11 def __init__(self,names,values): | |
12 assert len(values)==len(names) | |
13 self.__dict__['values']=values | |
14 self.__dict__['fields']={} | |
15 for i in xrange(len(values)): | |
16 self.fields[names[i]]=i | |
17 | |
18 def __getitem__(self,i): | |
19 if isinstance(i,int): | |
20 return self.values[i] | |
21 else: | |
22 return self.values[self.fields[i]] | |
23 | |
24 def __setitem__(self,i,value): | |
25 if isinstance(i,int): | |
26 self.values[i]=value | |
27 else: | |
28 self.values[self.fields[i]]=value | |
29 | |
30 def __getattr__(self,name): | |
31 return self.values[self.fields[name]] | |
32 | |
33 def __setattr__(self,name,value): | |
34 self.values[self.fields[name]]=value | |
35 | |
36 def __len__(self): | |
37 return len(self.values) | |
38 | |
39 | |
40 class DataSet(object): | 7 class DataSet(object): |
41 """A virtual base class for datasets. | 8 """A virtual base class for datasets. |
42 | 9 |
43 A DataSet is a generator of iterators; these iterators can run through the | 10 A DataSet is a generator of iterators; these iterators can run through the |
44 examples in a variety of ways. A DataSet need not necessarily have a finite | 11 examples in a variety of ways. A DataSet need not necessarily have a finite |
71 all the fields of DataSet self. Every field of "i" will give access to | 38 all the fields of DataSet self. Every field of "i" will give access to |
72 the field of a single example. Fields should be accessible via | 39 the field of a single example. Fields should be accessible via |
73 i[identifier], but the derived class is free to accept any type of | 40 i[identifier], but the derived class is free to accept any type of |
74 identifier, and add extra functionality to the iterator. | 41 identifier, and add extra functionality to the iterator. |
75 """ | 42 """ |
76 raise NotImplementedError | 43 for i in self.minibatches( minibatch_size = 1): |
44 yield Example(i.keys(), [v[0] for v in i.values()]) | |
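The new default iterator is a thin wrapper around minibatches() with minibatch_size=1. A minimal usage sketch, assuming a concrete DataSet subclass instance d with hypothetical fields 'x' and 'y', and assuming LookupList supports indexing by field name as the old Example class did:

    for example in d:
        x = example['x']  # one value per field; the batch dimension is stripped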
77 | 45 |
78 def zip(self, *fieldnames): | 46 def zip(self, *fieldnames): |
79 """ | 47 """ |
80 Supports two forms of syntax: | 48 Supports two forms of syntax: |
81 | 49 |
91 f1, f2, and f3 fields of a single example on each loop iteration. | 59 f1, f2, and f3 fields of a single example on each loop iteration. |
92 | 60 |
93 The derived class may accept fieldname arguments of any type. | 61 The derived class may accept fieldname arguments of any type. |
94 | 62 |
95 """ | 63 """ |
96 raise NotImplementedError | 64 for i in self.minibatches(fieldnames, minibatch_size = 1): |
97 | 65 yield [f[0] for f in i] |
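A usage sketch of zip() under the same assumptions (hypothetical dataset d with fields 'x' and 'y'); each iteration yields a list with one value per requested field, so it unpacks directly:

    for x, y in d.zip('x', 'y'):
        pass  # x and y are the two field values of a single example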
98 def minibatches(self,minibatch_size,*fieldnames): | 66 |
67 minibatches_fieldnames = None | |
68 minibatches_minibatch_size = 1 | |
69 minibatches_n_batches = None | |
70 def minibatches(self, | |
71 fieldnames = minibatches_fieldnames, | |
72 minibatch_size = minibatches_minibatch_size, | |
73 n_batches = minibatches_n_batches): | |
99 """ | 74 """ |
100 Supports two forms of syntax: | 75 Supports two forms of syntax: |
101 | 76 |
102 for i in dataset.zip(f1, f2, f3): ... | 77 for i in dataset.minibatches([f1, f2, f3],**kwargs): ... |
103 | 78 |
104 for i1, i2, i3 in dataset.zip(f1, f2, f3): ... | 79 for i1, i2, i3 in dataset.minibatches([f1, f2, f3],**kwargs): ... |
105 | 80 |
106 Using the first syntax, "i" will be an indexable object, such as a list, | 81 Using the first syntax, "i" will be an indexable object, such as a list, |
107 tuple, or Example instance, such that on every iteration, i[0] is the f1 | 82 tuple, or Example instance, such that on every iteration, i[0] is a |
108 field of the current example, i[1] is the f2 field, and so on. | 83 list-like container of the f1 field of a batch current examples, i[1] is |
109 | 84 a list-like container of the f2 field, etc. |
110 Using the second syntax, i1, i2, i3 will contain the contents of the | 85 |
111 f1, f2, and f3 fields of a single example on each loop iteration. | 86 Using the second syntax, i1, i2, i3 will be list-like containers of the |
112 | 87 f1, f2, and f3 fields of a batch of examples on each loop iteration. |
113 The derived class may accept fieldname arguments of any type. | 88 |
114 | 89 PARAMETERS |
115 Return an iterator, whose next() method returns the next example or the next | 90 - fieldnames (list of any type, default None): |
116 minibatch in the dataset. A minibatch (of length > 1) is also an example, but | 91 The loop variables i1, i2, i3 (in the example above) should contain the |
117 whose fields should be something one can iterate on again in order to obtain | 92 f1, f2, and f3 fields of the current batch of examples. If None, the |
118 the individual examples. | 93 derived class can choose a default, e.g. all fields. |
119 | 94 |
120 DataSet.zip returns an iterator over only the desired fields, and each field | 95 - minibatch_size (integer, default 1) |
121 of the iterator contains one example. | 96 On every iteration, the variables i1, i2, i3 will have |
122 | 97 exactly minibatch_size elements. e.g. len(i1) == minibatch_size |
123 Return an iterator which sees only the specified fields (each fieldname is a | 98 |
124 field key, typically a string). The value returned at each iteration | 99 - n_batches (integer, default None) |
125 is a tuple with one element per field. Hence it can be used like this: | 100 The iterator will loop exactly this many times, and then stop. If None, |
126 for f1, f2, f3 in dataset.zip('field1','field2','field3'): | 101 the derived class can choose a default. If (-1), then the returned |
127 ... use f1, f2, and f3 | 102 iterator should support looping indefinitely. |
128 If one iterates through minibatches of examples (with the minibatches() method | 103 |
129 or with the minibatch_size argument of the zip() method), then the fields | 104 Note: A list-like container is something like a tuple, list, numpy.ndarray or |
130 returned by the iterator's next method should be iterators over the | 105 any other object that supports integer indexing and slicing. |
131 individual values within the minibatch (typically these will be arrays | 106 |
132 with minibatch_size rows). | 107 """ |
133 Similar to zip but iterates over minibatches. | 108 raise AbstractFunction() |
134 Return a minibatch iterator, whose next() method returns an 'example' | |
135 whose fields are iterable objects (which can iterate over the individual | |
136 values of that field in the minibatch). | |
137 """ | |
138 raise NotImplementedError | |
139 | 109 |
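A usage sketch of both calling forms documented above, with a hypothetical dataset d and field names:

    for batch in d.minibatches(['x', 'y'], minibatch_size=10, n_batches=5):
        xs = batch[0]  # list-like container, len(xs) == 10
    for xs, ys in d.minibatches(['x', 'y'], minibatch_size=10, n_batches=5):
        pass           # xs and ys each hold ten field values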
140 def fieldNames(self): | 110 def fieldNames(self): |
111 #Yoshua- | |
112 # This list may not be finite; what would make sense in the use you have | |
113 # in mind? | |
114 # -JB | |
141 """Return the list of field names in the examples of this dataset.""" | 115 """Return the list of field names in the examples of this dataset.""" |
142 raise NotImplementedError | 116 raise AbstractFunction() |
143 | 117 |
144 def rename(*new_field_specifications): | 118 def rename(*new_field_specifications): |
119 #Yoshua- | |
120 # Do you mean for this to be a virtual method? | |
121 # Wouldn't this functionality be easier to provide via a | |
122 # RenamingDataSet, such as the one I've written below? | |
123 # -JB | |
145 """ | 124 """ |
146 Return a new dataset that maps old fields (of self) to new fields (of the returned | 125 Return a new dataset that maps old fields (of self) to new fields (of the returned |
147 dataset). The minimal syntax that should be supported is the following: | 126 dataset). The minimal syntax that should be supported is the following: |
148 new_field_specifications = [new_field_spec1, new_field_spec2, ...] | 127 new_field_specifications = [new_field_spec1, new_field_spec2, ...] |
149 new_field_spec = ([old_field1, old_field2, ...], new_field) | 128 new_field_spec = ([old_field1, old_field2, ...], new_field) |
150 In general both old_field and new_field should be strings, but some datasets may also | 129 In general both old_field and new_field should be strings, but some datasets may also |
151 support additional indexing schemes within each field (e.g. column slice | 130 support additional indexing schemes within each field (e.g. column slice |
152 of a matrix-like field). | 131 of a matrix-like field). |
153 """ | 132 """ |
154 raise NotImplementedError | 133 raise AbstractFunction() |
134 | |
135 class RenamingDataSet(DataSet): | |
136 """A DataSet that wraps another one, and makes it look like the field names | |
137 are different | |
138 | |
139 Renaming is done by a dictionary that maps new names to the old ones used in | |
140 self.src. | |
141 """ | |
142 def __init__(self, src, rename_dct): | |
143 DataSet.__init__(self) | |
144 self.src = src | |
144 self.rename_dct = dict(rename_dct) | |
146 | |
147 def minibatches(self, | |
148 fieldnames = DataSet.minibatches_fieldnames, | |
149 minibatch_size = DataSet.minibatches_minibatch_size, | |
150 n_batches = DataSet.minibatches_n_batches): | |
151 dct = self.rename_dct | |
152 new_fieldnames = [dct.get(f, f) for f in fieldnames] | |
153 return self.src.minibatches(new_fieldnames, minibatch_size, n_batches) | |
154 | |
155 def fieldNames(self): | |
156 return [self.rename_dct.get(f, f) for f in self.src.fieldNames()] | |
157 | |
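A usage sketch of RenamingDataSet, assuming a source dataset src with a field named 'input'; the dictionary maps each new name to the old one:

    renamed = RenamingDataSet(src, {'x': 'input'})
    for batch in renamed.minibatches(['x'], minibatch_size=2):
        pass  # fetched internally as the 'input' field of src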
155 | 158 |
156 class FiniteDataSet(DataSet): | 159 class FiniteDataSet(DataSet): |
157 """ | 160 """ |
158 Virtual interface, a subclass of DataSet for datasets which have a finite, known length. | 161 Virtual interface, a subclass of DataSet for datasets which have a finite, known length. |
159 Examples are indexed by an integer between 0 and self.length()-1, | 162 Examples are indexed by an integer between 0 and self.length()-1, |
162 in an efficient random access way. Users are encouraged to expect only the generic dataset | 165 in an efficient random access way. Users are encouraged to expect only the generic dataset |
163 interface in general. A FiniteDataSet is mainly useful when one has to obtain | 166 interface in general. A FiniteDataSet is mainly useful when one has to obtain |
164 a subset of examples (e.g. for splitting a dataset into training and test sets). | 167 a subset of examples (e.g. for splitting a dataset into training and test sets). |
165 """ | 168 """ |
166 | 169 |
170 class FiniteDataSetIterator(object): | |
171 """ | |
172 If the fieldnames list is empty, it means that we want to see ALL the fields. | |
173 """ | |
174 def __init__(self,dataset,minibatch_size=1,fieldnames=[]): | |
175 self.dataset=dataset | |
176 self.minibatch_size=minibatch_size | |
177 assert minibatch_size>=1 and minibatch_size<=len(dataset) | |
178 self.current = -self.minibatch_size | |
179 self.fieldnames = fieldnames | |
180 | |
181 def __iter__(self): | |
182 return self | |
183 | |
184 def next(self): | |
185 self.current+=self.minibatch_size | |
186 if self.current>=len(self.dataset): | |
187 self.current=-self.minibatch_size | |
188 raise StopIteration | |
189 if self.minibatch_size==1: | |
190 complete_example=self.dataset[self.current] | |
191 else: | |
192 complete_example=self.dataset[self.current:self.current+self.minibatch_size] | |
193 if self.fieldnames: | |
194 return Example(self.fieldnames,list(complete_example)) | |
195 else: | |
196 return complete_example | |
197 | |
167 def __init__(self): | 198 def __init__(self): |
168 pass | 199 pass |
169 | 200 |
170 def __iter__(self): | 201 def minibatches(self, |
171 return FiniteDataSetIterator(self) | 202 fieldnames = DataSet.minibatches_fieldnames, |
172 | 203 minibatch_size = DataSet.minibatches_minibatch_size, |
173 def zip(self,*fieldnames): | 204 n_batches = DataSet.minibatches_n_batches): |
174 return FiniteDataSetIterator(self,1,fieldnames) | 205 """ |
175 | 206 If fieldnames is None, it means that we want to see ALL the fields. |
176 def minibatches(self,minibatch_size,*fieldnames): | 207 |
177 return FiniteDataSetIterator(self,minibatch_size,fieldnames) | 208 If n_batches is None, we want to see all the examples possible |
209 for the given minibatch_size. | |
210 """ | |
211 # substitute the defaults: | |
212 if fieldnames is None: fieldnames = self.fieldNames() | |
213 if n_batches is None: n_batches = len(self) / minibatch_size | |
214 return self.__class__.Iterator(self, fieldnames, minibatch_size, n_batches) | |
178 | 215 |
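A worked instance of the default substitution, with hypothetical sizes:

    # with len(self) == 100 and minibatch_size == 10:
    # n_batches = 100 / 10 = 10 (Python 2 integer division), one full epoch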
179 def __getattr__(self,fieldname): | 216 def __getattr__(self,fieldname): |
180 """Return an that can iterate over the values of the field in this dataset.""" | 217 """Return an that can iterate over the values of the field in this dataset.""" |
181 return self(fieldname) | 218 return self(fieldname) |
182 | 219 |
184 """Return a sub-dataset containing only the given fieldnames as fields. | 221 """Return a sub-dataset containing only the given fieldnames as fields. |
185 | 222 |
186 The return value's default iterator will iterate only over the given | 223 The return value's default iterator will iterate only over the given |
187 fields. | 224 fields. |
188 """ | 225 """ |
189 raise NotImplementedError | 226 raise AbstractFunction() |
190 | 227 |
191 def __len__(self): | 228 def __len__(self): |
192 """len(dataset) returns the number of examples in the dataset.""" | 229 """len(dataset) returns the number of examples in the dataset.""" |
193 raise NotImplementedError | 230 raise AbstractFunction() |
194 | 231 |
195 def __getitem__(self,i): | 232 def __getitem__(self,i): |
196 """dataset[i] returns the (i+1)-th example of the dataset.""" | 233 """dataset[i] returns the (i+1)-th example of the dataset.""" |
197 raise NotImplementedError | 234 raise AbstractFunction() |
198 | 235 |
199 def __getslice__(self,*slice_args): | 236 def __getslice__(self,*slice_args): |
200 """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.""" | 237 """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.""" |
201 raise NotImplementedError | 238 raise AbstractFunction() |
202 | |
203 class FiniteDataSetIterator(object): | |
204 """ | |
205 If the fieldnames list is empty, it means that we want to see ALL the fields. | |
206 """ | |
207 def __init__(self,dataset,minibatch_size=1,fieldnames=[]): | |
208 self.dataset=dataset | |
209 self.minibatch_size=minibatch_size | |
210 assert minibatch_size>=1 and minibatch_size<=len(dataset) | |
211 self.current = -self.minibatch_size | |
212 self.fieldnames = fieldnames | |
213 | |
214 def __iter__(self): | |
215 return self | |
216 | |
217 def next(self): | |
218 self.current+=self.minibatch_size | |
219 if self.current>=len(self.dataset): | |
220 self.current=-self.minibatch_size | |
221 raise StopIteration | |
222 if self.minibatch_size==1: | |
223 complete_example=self.dataset[self.current] | |
224 else: | |
225 complete_example=self.dataset[self.current:self.current+self.minibatch_size] | |
226 if self.fieldnames: | |
227 return Example(self.fieldnames,list(complete_example)) | |
228 else: | |
229 return complete_example | |
230 | |
231 | 239 |
232 # we may want ArrayDataSet defined in another python file | 240 # we may want ArrayDataSet defined in another python file |
233 | 241 |
234 import numpy | 242 import numpy |
243 | |
244 def as_array_dataset(dataset): | |
245 # Generally datasets can be efficient by making data fields overlap, but | |
246 # this function doesn't know which fields overlap. So, it should check if | |
247 # dataset supports an as_array_dataset member function, and return that if | |
248 # possible. | |
249 if hasattr(dataset, 'as_array_dataset'): | |
250 return dataset.as_array_dataset() | |
251 | |
252 raise NotImplementedError() | |
253 | |
254 # Make ONE big minibatch with all the examples, to separate the fields. | |
255 n_examples = len(dataset) | |
256 batch = dataset.minibatches( minibatch_size = len(dataset)).next() | |
257 | |
258 # Each field of the underlying dataset must be convertible to a numpy array of the same type | |
259 # currently just double, but should use the smallest compatible dtype | |
260 n_fields = len(batch) | |
261 fieldnames = batch.fields.keys() | |
262 total_width = 0 | |
263 type = None | |
264 fields = LookupList() | |
265 for i in xrange(n_fields): | |
266 field = numpy.array(batch[i]) | |
267 assert field.shape[0]==n_examples | |
268 width = field.shape[1] | |
269 start=total_width | |
270 total_width += width | |
271 fields[fieldnames[i]]=slice(start,total_width,1) | |
272 # many complicated things remain to be done: | |
273 # - find common dtype | |
274 # - decide what to do with extra dimensions if not the same in all fields | |
275 # - try to see if we can avoid the copy? | |
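The loop above (unreachable as written, since it follows the raise) sketches how fields would be packed side by side into one matrix. A self-contained illustration of the intended column layout, with hypothetical field names and widths:

    total_width = 0
    fields = {}
    for name, width in [('x', 3), ('y', 2)]:
        fields[name] = slice(total_width, total_width + width, 1)
        total_width += width
    # fields == {'x': slice(0, 3, 1), 'y': slice(3, 5, 1)};
    # the packed array would have n_examples rows and 5 columns.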
235 | 276 |
236 class ArrayDataSet(FiniteDataSet): | 277 class ArrayDataSet(FiniteDataSet): |
237 """ | 278 """ |
238 An ArrayDataSet behaves like a numpy array but adds the notion of named fields | 279 An ArrayDataSet behaves like a numpy array but adds the notion of named fields |
239 from DataSet (and the ability to view multiple field values as an 'Example'). | 280 from DataSet (and the ability to view multiple field values as an 'Example'). |
244 each 'example' is just a one-row ArrayDataSet, otherwise it is a numpy array. | 285 each 'example' is just a one-row ArrayDataSet, otherwise it is a numpy array. |
245 Any dataset can also be converted to a numpy array (losing the notion of fields) | 286 Any dataset can also be converted to a numpy array (losing the notion of fields) |
246 by the numpy.array(dataset) call. | 287 by the numpy.array(dataset) call. |
247 """ | 288 """ |
248 | 289 |
249 def __init__(self,dataset=None,data=None,fields={}): | 290 class Iterator(object): |
291 """An iterator over a finite dataset that implements wrap-around""" | |
292 def __init__(self, dataset, fieldnames, minibatch_size, next_max): | |
293 self.dataset=dataset | |
294 self.fieldnames = fieldnames | |
295 self.minibatch_size=minibatch_size | |
296 self.next_count = 0 | |
297 self.next_max = next_max | |
298 self.current = -self.minibatch_size | |
299 assert minibatch_size > 0 | |
300 if minibatch_size >= len(dataset): | |
301 raise NotImplementedError() | |
302 | |
303 def __iter__(self): | |
304 #Why do we do this? -JB | |
305 return self | |
306 | |
307 @staticmethod | |
308 def matcat(a, b): | |
309 a0, a1 = a.shape | |
310 b0, b1 = b.shape | |
311 assert a1 == b1 | |
312 assert a.dtype == b.dtype | |
313 rval = numpy.empty( (a0 + b0, a1), dtype=a.dtype) | |
314 rval[:a0,:] = a | |
315 rval[a0:,:] = b | |
316 return rval | |
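matcat stacks two matrices with matching column counts and dtypes, used below when a minibatch wraps past the end of the data. A quick sketch:

    a = numpy.zeros((2, 3))
    b = numpy.ones((1, 3))
    ab = ArrayDataSet.Iterator.matcat(a, b)  # shape (3, 3): rows of a, then rows of b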
317 | |
318 def next(self): | |
319 | |
320 #check for end-of-loop | |
321 if self.next_count == self.next_max: | |
322 raise StopIteration | |
323 self.next_count += 1 | |
324 | |
325 #determine the first and last elements of the slice we'll return | |
326 self.current += self.minibatch_size | |
327 if self.current >= len(self.dataset): | |
328 self.current -= len(self.dataset) | |
329 upper = self.current + self.minibatch_size | |
330 | |
331 if upper <= len(self.dataset): | |
332 #this is the easy case, we only need one slice | |
333 dataview = self.dataset.data[self.current:upper] | |
334 else: | |
335 # the minibatch wraps around the end of the dataset | |
336 dataview = self.dataset.data[self.current:] | |
337 upper -= len(self.dataset) | |
338 assert upper > 0 | |
339 dataview = self.matcat(dataview, self.dataset.data[:upper]) | |
340 | |
341 | |
342 rval = [dataview[:, self.dataset.fields[f]] for f in self.fieldnames] | |
343 | |
344 if self.fieldnames: | |
345 rval = Example(self.fieldnames, rval) | |
346 | |
347 return rval | |
348 | |
349 | |
350 def __init__(self, data, fields=None): | |
250 """ | 351 """ |
251 There are two ways to construct an ArrayDataSet: (1) from an | 352 There are two ways to construct an ArrayDataSet: (1) from an |
252 existing dataset (which may result in a copy of the data in a numpy array), | 353 existing dataset (which may result in a copy of the data in a numpy array), |
253 or (2) from a numpy.array (the data argument), along with an optional description | 354 or (2) from a numpy.array (the data argument), along with an optional description |
254 of the fields (dictionary of column slices indexed by field names). | 355 of the fields (a LookupList of column slices indexed by field names). |
255 """ | 356 """ |
256 if dataset!=None: | 357 self.data=data |
257 assert data==None and fields=={} | 358 self.fields=fields |
258 # Make ONE big minibatch with all the examples, to separate the fields. | 359 rows, cols = data.shape |
259 n_examples=len(dataset) | 360 |
260 batch = dataset.minibatches(n_examples).next() | 361 if fields: |
261 # Each field of the underlying dataset must be convertible to a numpy array of the same type | 362 for fieldname,fieldslice in fields.items(): |
262 # currently just double, but should use the smallest compatible dtype | |
263 n_fields = len(batch) | |
264 fieldnames = batch.fields.keys() | |
265 total_width = 0 | |
266 type = None | |
267 for i in xrange(n_fields): | |
268 field = array(batch[i]) | |
269 assert field.shape[0]==n_examples | |
270 width = field.shape[1] | |
271 start=total_width | |
272 total_width += width | |
273 fields[fieldnames[i]]=slice(start,total_width,1) | |
274 # many complicated things remain to be done: | |
275 # - find common dtype | |
276 # - decide what to do with extra dimensions if not the same in all fields | |
277 # - try to see if we can avoid the copy? | |
278 raise NotImplementedError | |
279 if data!=None: | |
280 assert dataset==None | |
281 self.data=data | |
282 self.fields=fields | |
283 self.width = data.shape[1] | |
284 for fieldname in fields: | |
285 fieldslice=fields[fieldname] | |
286 # make sure fieldslice.start and fieldslice.step are defined | 363 # make sure fieldslice.start and fieldslice.step are defined |
287 start=fieldslice.start | 364 start=fieldslice.start |
288 step=fieldslice.step | 365 step=fieldslice.step |
289 if not start: | 366 if not start: |
290 start=0 | 367 start=0 |
291 if not step: | 368 if not step: |
292 step=1 | 369 step=1 |
293 if not fieldslice.start or not fieldslice.step: | 370 if not fieldslice.start or not fieldslice.step: |
294 fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step) | 371 fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step) |
295 # and coherent with the data array | 372 # and coherent with the data array |
296 assert fieldslice.start>=0 and fieldslice.stop<=self.width | 373 assert fieldslice.start >= 0 and fieldslice.stop <= cols |
374 | |
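A construction sketch for the second form, using a plain dict of slices for brevity (hypothetical field names and widths):

    data = numpy.zeros((4, 5))  # 4 examples, 5 columns
    d = ArrayDataSet(data, fields={'x': slice(0, 3), 'y': slice(3, 5)})
    # __init__ normalizes the slices to slice(0, 3, 1) and slice(3, 5, 1)
    # and asserts that they fall within the 5 data columns.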
375 def minibatches(self, | |
376 fieldnames = DataSet.minibatches_fieldnames, | |
377 minibatch_size = DataSet.minibatches_minibatch_size, | |
378 n_batches = DataSet.minibatches_n_batches): | |
379 """ | |
380 If fieldnames is None, it means that we want to see ALL the fields. | |
381 | |
382 If n_batches is None, we want to see all the examples possible | |
383 for the given minibatch_size. | |
384 """ | |
385 # substitute the defaults: | |
386 if fieldnames is None: fieldnames = self.fieldNames() | |
387 if n_batches is None: n_batches = len(self) / minibatch_size | |
388 return ArrayDataSet.Iterator(self, fieldnames, minibatch_size, n_batches) | |
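Continuing the construction sketch above: asking for more batches than one pass provides exercises the wrap-around Iterator:

    for xs, ys in d.minibatches(['x', 'y'], minibatch_size=2, n_batches=6):
        assert xs.shape == (2, 3) and ys.shape == (2, 2)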
297 | 389 |
298 def __getattr__(self,fieldname): | 390 def __getattr__(self,fieldname): |
299 """ | 391 """ |
300 Return a numpy array with the content associated with the given field name. | 392 Return a numpy array with the content associated with the given field name. |
301 If this is a one-example dataset, then a row, i.e., numpy array (of one less dimension | 393 If this is a one-example dataset, then a row, i.e., numpy array (of one less dimension |
310 min_col=self.data.shape[1] | 402 min_col=self.data.shape[1] |
311 max_col=0 | 403 max_col=0 |
312 for field_slice in self.fields.values(): | 404 for field_slice in self.fields.values(): |
313 min_col=min(min_col,field_slice.start) | 405 min_col=min(min_col,field_slice.start) |
314 max_col=max(max_col,field_slice.stop) | 406 max_col=max(max_col,field_slice.stop) |
315 new_fields={} | 407 new_fields=LookupList() |
316 for field in self.fields: | 408 for fieldname,fieldslice in self.fields.items(): |
317 new_fields[field[0]]=slice(field[1].start-min_col,field[1].stop-min_col,field[1].step) | 409 new_fields[fieldname]=slice(fieldslice.start-min_col,fieldslice.stop-min_col,fieldslice.step) |
318 return ArrayDataSet(data=self.data[:,min_col:max_col],fields=new_fields) | 410 return ArrayDataSet(self.data[:,min_col:max_col],fields=new_fields) |
319 | 411 |
320 def fieldNames(self): | 412 def fieldNames(self): |
321 """Return the list of field names that are supported by getattr and getFields.""" | 413 """Return the list of field names that are supported by getattr and getFields.""" |
322 return self.fields.keys() | 414 return self.fields.keys() |
323 | 415 |
330 dataset[i] returns the (i+1)-th Example of the dataset. If there are no fields | 422 dataset[i] returns the (i+1)-th Example of the dataset. If there are no fields |
331 the result is just a numpy array (for the i-th row of the dataset data matrix). | 423 the result is just a numpy array (for the i-th row of the dataset data matrix). |
332 """ | 424 """ |
333 if self.fields: | 425 if self.fields: |
334 fieldnames,fieldslices=zip(*self.fields.items()) | 426 fieldnames,fieldslices=zip(*self.fields.items()) |
335 return Example(fieldnames,[self.data[i,fieldslice] for fieldslice in fieldslices]) | 427 return Example(self.fields.keys(),[self.data[i,fieldslice] for fieldslice in self.fields.values()]) |
336 else: | 428 else: |
337 return self.data[i] | 429 return self.data[i] |
338 | 430 |
339 def __getslice__(self,*slice_args): | 431 def __getslice__(self,*args): |
340 """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.""" | 432 """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.""" |
341 return ArrayDataSet(data=self.data[apply(slice,slice_args)],fields=self.fields) | 433 return ArrayDataSet(self.data.__getslice__(*args), fields=self.fields) |
342 | 434 |
343 def __array__(self): | 435 def __array__(self): |
344 """Return an view of this dataset which is an numpy.ndarray | 436 """Return an view of this dataset which is an numpy.ndarray |
345 | 437 |
346 Numpy uses this special function name to retrieve an ndarray view for | 438 Numpy uses this special function name to retrieve an ndarray view for |