comparison: dataset.py @ 16:813723310d75

description: commenting
author:      bergstrj@iro.umontreal.ca
date:        Wed, 26 Mar 2008 18:23:44 -0400
parents:     88168361a5ab be128b9127c8
children:    759d17112b23
comparing: 15:88168361a5ab (old) with 16:813723310d75 (new)

+
+class Example(object):
+    """
+    An example is something that is like a tuple but whose elements can be named, so that
+    the following syntactic constructions work as one would expect:
+        example.x = [1, 2, 3] # set a field
+        x, y, z = example
+        x = example[0]
+        x = example["x"]
+    """
+    def __init__(self,names,values):
+        assert len(values)==len(names)
+        self.__dict__['values']=values
+        self.__dict__['fields']={}
+        for i in xrange(len(values)):
+            self.fields[names[i]]=i
+
+    def __getitem__(self,i):
+        if isinstance(i,int):
+            return self.values[i]
+        else:
+            return self.values[self.fields[i]]
+
+    def __setitem__(self,i,value):
+        if isinstance(i,int):
+            self.values[i]=value
+        else:
+            self.values[self.fields[i]]=value
+
+    def __getattr__(self,name):
+        return self.values[self.fields[name]]
+
+    def __setattr__(self,name,value):
+        self.values[self.fields[name]]=value
+
+    def __len__(self):
+        return len(self.values)

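A minimal usage sketch for the Example class added above; the field names and
values here are illustrative, not part of the revision:

    e = Example(['x', 'y'], [1.5, 2.5])
    assert e[0] == e['x'] == e.x
    e.y = 3.0          # equivalent to e['y'] = 3.0 or e[1] = 3.0
    x, y = e           # tuple-like unpacking works through __getitem__/__len__
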
 class DataSet(object):
-    """
-    This is a virtual base class or interface for datasets.
-    A dataset is basically an iterator over examples. It does not necessarily
-    have a fixed length (this is useful for 'streams' which feed on-line learning).
-    Datasets with fixed and known length are FiniteDataSet, a subclass of DataSet.
-    Examples and datasets optionally have named fields.
-    One can obtain a sub-dataset by taking dataset.field or dataset(field1,field2,field3,...).
-    Fields are not mutually exclusive, i.e. two fields can overlap in their actual content.
-    The content of a field can be of any type, but often will be a numpy array.
-    The minibatch_size attribute, if different than 1, means that the iterator (next() method)
-    returns not a single example but an array of length minibatch_size, i.e., an indexable
-    object with minibatch_size examples in it.
-    """
-
-    def __init__(self,minibatch_size=1):
-        assert minibatch_size>0
-        self.minibatch_size=minibatch_size
-
+    """A virtual base class for datasets.
+
+    A DataSet is a generator of iterators; these iterators can run through the
+    examples in a variety of ways. A DataSet need not necessarily have a finite
+    or known length, so this class can be used to interface to a 'stream' which
+    feeds on-line learning.
+
+    To iterate over examples, there are several possibilities:
+    - for i in dataset.zip(field1, field2, field3, ...)
+    - for i in dataset.minibatches(N, field1, field2, ...)
+    - for i in dataset
+    Each of these is documented below.
+
+    Note: For a dataset of fixed and known length, which can implement item
+    random-access efficiently (e.g. indexing and slicing), and which can profit
+    from the FiniteDataSetIterator, consider using the base class FiniteDataSet.
+
+    Note: Fields are not mutually exclusive, i.e. two fields can overlap in their actual content.
+
+    Note: The content of a field can be of any type.
+    """
+
+    def __init__(self):
+        pass
+
     def __iter__(self):
-        """
-        Return an iterator, whose next() method returns the next example or the next
-        minibatch in the dataset. A minibatch (of length > 1) should be something one
-        can iterate on again in order to obtain the individual examples. If the dataset
-        has fields, then the example or the minibatch must have the same fields
-        (typically this is implemented by returning another smaller dataset, when
-        there are fields).
-        """
+        """Supports the syntax "for i in dataset: ..."
+
+        Using this syntax, "i" will be an Example instance (or equivalent) with
+        all the fields of DataSet self. Every field of "i" will give access to
+        the field of a single example. Fields should be accessible via
+        i[identifier], but the derived class is free to accept any type of
+        identifier, and add extra functionality to the iterator.
+        """
         raise NotImplementedError

-    def __getattr__(self,fieldname):
-        """Return a sub-dataset containing only the given fieldname as field."""
-        return self(fieldname)
-
-    def __call__(self,*fieldnames):
-        """Return a sub-dataset containing only the given fieldnames as fields."""
-        raise NotImplementedError
-
+    def zip(self, *fieldnames):
+        """
+        Supports two forms of syntax:
+
+            for i in dataset.zip(f1, f2, f3): ...
+
+            for i1, i2, i3 in dataset.zip(f1, f2, f3): ...
+
+        Using the first syntax, "i" will be an indexable object, such as a list,
+        tuple, or Example instance, such that on every iteration, i[0] is the f1
+        field of the current example, i[1] is the f2 field, and so on. Each
+        fieldname is a field key, typically a string.
+
+        Using the second syntax, i1, i2, i3 will contain the contents of the
+        f1, f2, and f3 fields of a single example on each loop iteration.
+
+        The derived class may accept fieldname arguments of any type.
+        """
+        raise NotImplementedError
+
+    def minibatches(self,minibatch_size,*fieldnames):
+        """
+        Supports two forms of syntax:
+
+            for i in dataset.minibatches(N, f1, f2, f3): ...
+
+            for i1, i2, i3 in dataset.minibatches(N, f1, f2, f3): ...
+
+        Using the first syntax, "i" will be an indexable object, such as a list,
+        tuple, or Example instance, such that on every iteration, i[0] is a
+        minibatch of the f1 field, i[1] a minibatch of the f2 field, and so on.
+
+        Using the second syntax, i1, i2, i3 will contain minibatches of the
+        f1, f2, and f3 fields of N examples on each loop iteration.
+
+        The derived class may accept fieldname arguments of any type.
+
+        Similar to zip, but iterating over minibatches: a minibatch (of length
+        > 1) is also an example, but its fields are iterable objects, each of
+        which can be iterated over again to obtain the individual values within
+        the minibatch (typically these will be arrays with minibatch_size rows).
+        """
+        raise NotImplementedError
+
     def fieldNames(self):
-        """Return the list of field names that are supported by getattr and getFields."""
+        """Return the list of field names in the examples of this dataset."""
         raise NotImplementedError

+    def rename(self,*new_field_specifications):
+        """
+        Return a new dataset that maps old fields (of self) to new fields (of the returned
+        dataset). The minimal syntax that should be supported is the following:
+            new_field_specifications = [new_field_spec1, new_field_spec2, ...]
+            new_field_spec = ([old_field1, old_field2, ...], new_field)
+        In general both old_field and new_field should be strings, but some datasets may also
+        support additional indexing schemes within each field (e.g. column slice
+        of a matrix-like field).
+        """
+        raise NotImplementedError

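A hedged sketch of the access styles introduced on DataSet above (zip,
minibatches, rename); here 'dataset' stands for some hypothetical concrete
subclass instance with fields 'x' and 'y', and all names are illustrative:

    for x, y in dataset.zip('x', 'y'):                 # one example at a time
        print x, y
    for xs, ys in dataset.minibatches(16, 'x', 'y'):   # 16 examples at a time
        print len(xs), len(ys)
    renamed = dataset.rename((['x'], 'input'), (['y'], 'target'))
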
 class FiniteDataSet(DataSet):
     """
     Virtual interface, a subclass of DataSet for datasets which have a finite, known length.
     Examples are indexed by an integer between 0 and len(self)-1,
-    and a subdataset can be obtained by slicing.
+    and a subdataset can be obtained by slicing. This is not appropriate in
+    general, but only for datasets that support efficient random access to both
+    rows AND fields; users are encouraged to expect only the generic DataSet
+    interface in general. A FiniteDataSet is mainly useful when one has to obtain
+    a subset of examples (e.g. for splitting a dataset into training and test sets).
     """

-    def __init__(self,minibatch_size):
-        DataSet.__init__(self,minibatch_size)
+    def __init__(self):
+        pass

     def __iter__(self):
         return FiniteDataSetIterator(self)

+    def zip(self,*fieldnames):
+        return FiniteDataSetIterator(self,1,fieldnames)
+
+    def minibatches(self,minibatch_size,*fieldnames):
+        return FiniteDataSetIterator(self,minibatch_size,fieldnames)
+
+    def __getattr__(self,fieldname):
+        """Return an object that can iterate over the values of the field in this dataset."""
+        return self(fieldname)
+
+    def __call__(self,*fieldnames):
+        """Return a sub-dataset containing only the given fieldnames as fields.
+
+        The return value's default iterator will iterate only over the given
+        fields.
+        """
+        raise NotImplementedError
+
     def __len__(self):
         """len(dataset) returns the number of examples in the dataset."""
         raise NotImplementedError

     def __getitem__(self,i):
[...]
     def __getslice__(self,*slice_args):
         """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1."""
         raise NotImplementedError

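Since the FiniteDataSet docstring above names train/test splitting as the main
use case, here is a minimal split sketch; 'd' stands for a hypothetical
concrete FiniteDataSet of 100 examples:

    n_train = 80
    train_set = d[0:n_train]          # __getslice__ returns a sub-dataset
    test_set = d[n_train:len(d)]
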
 class FiniteDataSetIterator(object):
-    def __init__(self,dataset):
+    """
+    If the fieldnames list is empty, it means that we want to see ALL the fields.
+    """
+    def __init__(self,dataset,minibatch_size=1,fieldnames=[]):
         self.dataset=dataset
-        self.current = -self.dataset.minibatch_size
+        self.minibatch_size=minibatch_size
+        assert minibatch_size>=1 and minibatch_size<=len(dataset)
+        self.current = -self.minibatch_size
+        self.fieldnames = fieldnames
+
+    def __iter__(self):
+        return self

     def next(self):
-        """
-        Return the next example(s) in the dataset. If self.dataset.minibatch_size>1 return that
-        many examples. If the dataset has fields, the example or the minibatch of examples
-        is just a minibatch_size-rows ArrayDataSet (so that the fields can be accessed),
-        but that resulting mini-dataset has a minibatch_size of 1, so that one can iterate
-        example-wise on it. On the other hand, if the dataset has no fields (e.g. because
-        it is already the field of a bigger dataset), then the returned example or minibatch
-        may be any indexable object, such as a numpy array. Following the array semantics of indexing
-        and slicing, if the minibatch_size is 1 (and there are no fields), then the result is an array
-        with one less dimension (e.g., a vector, if the dataset is a matrix), corresponding
-        to a row. Again, if the minibatch_size is >1, one can iterate on the result to
-        obtain individual examples (as rows).
-        """
-        self.current+=self.dataset.minibatch_size
+        self.current+=self.minibatch_size
         if self.current>=len(self.dataset):
-            self.current=-self.dataset.minibatch_size
+            self.current=-self.minibatch_size
             raise StopIteration
-        if self.dataset.minibatch_size==1:
-            return self.dataset[self.current]
+        if self.minibatch_size==1:
+            complete_example=self.dataset[self.current]
         else:
-            return self.dataset[self.current:self.current+self.dataset.minibatch_size]
+            complete_example=self.dataset[self.current:self.current+self.minibatch_size]
+        if self.fieldnames:
+            return Example(self.fieldnames,[getattr(complete_example,name) for name in self.fieldnames])
+        else:
+            return complete_example


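A behavioral sketch of the iterator above; 'd' is a hypothetical concrete
FiniteDataSet with fields 'x' and 'y', and this is what d.minibatches(2,'x','y')
hands back:

    it = FiniteDataSetIterator(d, 2, ('x', 'y'))
    for batch in it:               # works because __iter__ returns self
        print batch.x, batch.y     # each field holds the values of 2 examples
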
 # we may want ArrayDataSet defined in another python file

 import numpy

 class ArrayDataSet(FiniteDataSet):
     """
-    An ArrayDataSet behaves like a numpy array but adds the notion of fields
-    and minibatch_size from DataSet. It is a fixed-length and fixed-width dataset
+    An ArrayDataSet behaves like a numpy array but adds the notion of named fields
+    from DataSet (and the ability to view multiple field values as an 'Example').
+    It is a fixed-length and fixed-width dataset
     in which each element is a numpy array or a number, hence the whole
     dataset corresponds to a numpy array. Fields
     must correspond to a slice of array columns. If the dataset has fields,
-    each 'example' is just a one-row ArrayDataSet, otherwise it is a numpy array.
-    Any dataset can also be converted to a numpy array (losing the notion of fields
-    and of minibatch_size) by the numpy.array(dataset) call.
+    each 'example' is an Example instance, otherwise it is a numpy array row.
+    Any dataset can also be converted to a numpy array (losing the notion of
+    fields) by the numpy.array(dataset) call.
     """

-    def __init__(self,dataset=None,data=None,fields={},minibatch_size=1):
+    def __init__(self,dataset=None,data=None,fields={}):
         """
         There are two ways to construct an ArrayDataSet: (1) from an
         existing dataset (which may result in a copy of the data in a numpy array),
         or (2) from a numpy.array (the data argument), along with an optional description
         of the fields (dictionary of column slices indexed by field names).
         """
-        FiniteDataSet.__init__(self,minibatch_size)
         if dataset!=None:
             assert data==None and fields=={}
-            # convert dataset to an ArrayDataSet
+            # Make ONE big minibatch with all the examples, to separate the fields.
+            n_examples=len(dataset)
+            batch = dataset.minibatches(n_examples).next()
+            # Each field of the underlying dataset must be convertible to a numpy
+            # array of the same type (currently just double, but the smallest
+            # compatible dtype should be used).
+            n_fields = len(batch)
+            fieldnames = batch.fields.keys()
+            total_width = 0
+            type = None
+            for i in xrange(n_fields):
+                field = numpy.array(batch[i])
+                assert field.shape[0]==n_examples
+                width = field.shape[1]
+                start=total_width
+                total_width += width
+                fields[fieldnames[i]]=slice(start,total_width,1)
+            # many complicated things remain to be done:
+            #  - find the common dtype
+            #  - decide what to do with extra dimensions if not the same in all fields
+            #  - try to see if we can avoid the copy
             raise NotImplementedError
         if data!=None:
             assert dataset==None
             self.data=data
             self.fields=fields
[...]
                 step=1
             if not fieldslice.start or not fieldslice.step:
                 fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step)
             # and coherent with the data array
             assert fieldslice.start>=0 and fieldslice.stop<=self.width
-        assert minibatch_size<=len(self.data)

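A minimal construction sketch for the data= path of the constructor above; the
array contents and field names are illustrative:

    a = numpy.array([[1.0, 2.0, 3.0],
                     [4.0, 5.0, 6.0]])
    d = ArrayDataSet(data=a,
                     fields={'input': slice(0, 2, 1), 'target': slice(2, 3, 1)})
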
     def __getattr__(self,fieldname):
         """
         Return a numpy array with the content associated with the given field name.
         If this is a one-example dataset, then a row, i.e., a numpy array (of one less dimension
-        than the dataset.data) is returned.
+        than the dataset itself) is returned.
         """
         if len(self.data)==1:
             return self.data[0,self.fields[fieldname]]
         return self.data[:,self.fields[fieldname]]

[...]
             min_col=min(min_col,field_slice.start)
             max_col=max(max_col,field_slice.stop)
         new_fields={}
-        for fieldname,field_slice in self.fields.items():
-            new_fields[fieldname]=slice(field_slice.start-min_col,field_slice.stop-min_col,field_slice.step)
-        return ArrayDataSet(data=self.data[:,min_col:max_col],fields=new_fields,minibatch_size=self.minibatch_size)
+        for fieldname,field_slice in self.fields.items():
+            new_fields[fieldname]=slice(field_slice.start-min_col,field_slice.stop-min_col,field_slice.step)
+        return ArrayDataSet(data=self.data[:,min_col:max_col],fields=new_fields)

     def fieldNames(self):
         """Return the list of field names that are supported by getattr and getFields."""
         return self.fields.keys()

174 """len(dataset) returns the number of examples in the dataset.""" 325 """len(dataset) returns the number of examples in the dataset."""
175 return len(self.data) 326 return len(self.data)
176 327
     def __getitem__(self,i):
         """
-        dataset[i] returns the (i+1)-th example of the dataset. If the dataset has fields
-        then a one-example dataset is returned (to be able to handle example.field accesses).
+        dataset[i] returns the (i+1)-th Example of the dataset. If there are no fields,
+        the result is just a numpy array (the i-th row of the dataset's data matrix).
         """
         if self.fields:
-            if isinstance(i,slice):
-                return ArrayDataSet(data=data[slice],fields=self.fields)
-            return ArrayDataSet(data=self.data[i:i+1],fields=self.fields)
+            fieldnames,fieldslices=zip(*self.fields.items())
+            return Example(fieldnames,[self.data[i,fieldslice] for fieldslice in fieldslices])
         else:
             return self.data[i]

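Continuing the hypothetical 'd' from the construction sketch, field and example
access per __getattr__ and __getitem__ above:

    print d.input      # columns 0..1 of the whole data matrix
    e = d[0]           # an Example with fields 'input' and 'target'
    print e.target     # the 'target' values of the first example
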
     def __getslice__(self,*slice_args):
         """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1."""
[...]
             slice_width=(field_slice.stop-field_slice.start)/field_slice.step
             # copy the field here
             result[:,c:c+slice_width]=self.data[:,field_slice]
             c+=slice_width
         return result
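
The class docstring above promises numpy.array(dataset) conversion; assuming
the lines elided by the comparison view implement it (the copy loop above lays
the fields out side by side in 'result'), the hypothetical 'd' flattens back
to a plain array:

    flat = numpy.array(d)    # 2x3 array, fields 'input' and 'target' side by side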