Mercurial > pylearn
changeset 17:759d17112b23
more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
author | bergstrj@iro.umontreal.ca |
---|---|
date | Wed, 26 Mar 2008 21:05:14 -0400 |
parents | 813723310d75 (diff) ff4e551490f1 (current diff) |
children | 60b164a0d84a |
files | _test_dataset.py dataset.py lookup_list.py |
diffstat | 3 files changed, 375 insertions(+), 147 deletions(-) [+] |
line wrap: on
line diff
--- a/_test_dataset.py Wed Mar 26 18:21:57 2008 -0400 +++ b/_test_dataset.py Wed Mar 26 21:05:14 2008 -0400 @@ -12,28 +12,67 @@ def setUp(self): numpy.random.seed(123456) - def test0(self): - a=ArrayDataSet(data=numpy.random.rand(8,3),fields={"x":slice(2),"y":slice(1,3)}) - s=0 - for example in a: - s+=_sum_all(example.x) - #print s - self.failUnless(abs(s-7.25967597)<1e-6) + + def test_ctor_len(self): + n = numpy.random.rand(8,3) + a=ArrayDataSet(n) + self.failUnless(a.data is n) + self.failUnless(a.fields is None) + + self.failUnless(len(a) == n.shape[0]) + self.failUnless(a[0].shape == (n.shape[1],)) + + def test_iter(self): + arr = numpy.random.rand(8,3) + a=ArrayDataSet(data=arr,fields={"x":slice(2),"y":slice(1,3)}) + for i, example in enumerate(a): + self.failUnless(numpy.all( example.x == arr[i,:2])) + self.failUnless(numpy.all( example.y == arr[i,1:3])) + + def test_zip(self): + arr = numpy.random.rand(8,3) + a=ArrayDataSet(data=arr,fields={"x":slice(2),"y":slice(1,3)}) + for i, x in enumerate(a.zip("x")): + self.failUnless(numpy.all( x == arr[i,:2])) + + def test_minibatch_basic(self): + arr = numpy.random.rand(10,4) + a=ArrayDataSet(data=arr,fields={"x":slice(2),"y":slice(1,4)}) + for i, mb in enumerate(a.minibatches(minibatch_size=2)): #all fields + self.failUnless(numpy.all( mb.x == arr[i*2:i*2+2,0:2])) + self.failUnless(numpy.all( mb.y == arr[i*2:i*2+2,1:4])) - def test1(self): - a=ArrayDataSet(data=numpy.random.rand(10,4),fields={"x":slice(2),"y":slice(1,4)}) - s=0 - for mb in a.minibatches(2): - s+=_sum_all(numpy.array(mb)) - s+=a[3:6].x[1,1] - for mb in ArrayDataSet(data=a.y).minibatches(2): - for e in mb: - s+=sum(e) - #print numpy.array(a) - #print a.y[4:9:2] - s+= _sum_all(a.y[4:9:2]) - #print s - self.failUnless(abs(s-39.0334797)<1e-6) + def test_getattr(self): + arr = numpy.random.rand(10,4) + a=ArrayDataSet(data=arr,fields={"x":slice(2),"y":slice(1,4)}) + a_y = a.y + self.failUnless(numpy.all( a_y == arr[:,1:4])) + + def test_asarray(self): + arr = numpy.random.rand(3,4) + a=ArrayDataSet(data=arr,fields={"x":slice(2),"y":slice(1,4)}) + a_arr = numpy.asarray(a) + self.failUnless(a_arr.shape[1] == 2 + 3) + + def test_minibatch_wraparound_even(self): + arr = numpy.random.rand(10,4) + arr2 = ArrayDataSet.Iterator.matcat(arr,arr) + + a=ArrayDataSet(data=arr,fields={"x":slice(2),"y":slice(1,4)}) + + #print arr + for i, x in enumerate(a.minibatches(["x"], minibatch_size=2, n_batches=8)): + #print 'x' , x + self.failUnless(numpy.all( x == arr2[i*2:i*2+2,0:2])) + + def test_minibatch_wraparound_odd(self): + arr = numpy.random.rand(10,4) + arr2 = ArrayDataSet.Iterator.matcat(arr,arr) + + a=ArrayDataSet(data=arr,fields={"x":slice(2),"y":slice(1,4)}) + + for i, x in enumerate(a.minibatches(["x"], minibatch_size=3, n_batches=6)): + self.failUnless(numpy.all( x == arr2[i*3:i*3+3,0:2])) if __name__ == '__main__': unittest.main()
--- a/dataset.py Wed Mar 26 18:21:57 2008 -0400 +++ b/dataset.py Wed Mar 26 21:05:14 2008 -0400 @@ -1,62 +1,126 @@ from lookup_list import LookupList Example = LookupList + +class AbstractFunction (Exception): """Derived class must override this function""" class DataSet(object): - """ - This is a virtual base class or interface for datasets. - A dataset is basically an iterator over Examples (or anything that - behaves like an Example). It does not necessarily - have a fixed length (this is useful for 'streams' which feed on-line learning). - Datasets with fixed and known length are instances of FiniteDataSet, a subclass - which supports indexing (dataset[i]) and slicing (dataset[1000:2000]). - To iterate over a subset of the fields, one should use the dataset.zip(field1, field2,field3, ...) - method which returns an iterator over only the desired fields. - Fields are not mutually exclusive, i.e. two fields can overlap in their actual content. - The content of a field can be of any type, but often will be a numpy array. - If one iterates through minibatches of examples (with the minibatches() method - or with the minibatch_size argument of the zip() method), then the fields - returned by the iterator's next method should be iterators over the - individual values within the minibatch (typically these will be arrays - with minibatch_size rows). + """A virtual base class for datasets. + + A DataSet is a generator of iterators; these iterators can run through the + examples in a variety of ways. A DataSet need not necessarily have a finite + or known length, so this class can be used to interface to a 'stream' which + feed on-line learning. + + To iterate over examples, there are several possibilities: + - for i in dataset.zip(field1, field2,field3, ...) + - for i in dataset.minibatches(N, field1, field2, ...) + - for i in dataset + Each of these is documented below. + + Note: For a dataset of fixed and known length, which can implement item + random-access efficiently (e.g. indexing and slicing), and which can profit + from the FiniteDataSetIterator, consider using base class FiniteDataSet. + + Note: Fields are not mutually exclusive, i.e. two fields can overlap in their actual content. + + Note: The content of a field can be of any type. + """ def __init__(self): pass def __iter__(self): + """Supports the syntax "for i in dataset: ..." + + Using this syntax, "i" will be an Example instance (or equivalent) with + all the fields of DataSet self. Every field of "i" will give access to + a the field of a single example. Fields should be accessible via + i[identifier], but the derived class is free to accept any type of + identifier, and add extra functionality to the iterator. """ - Return an iterator, whose next() method returns the next example or the next - minibatch in the dataset. A minibatch (of length > 1) is also an example, but - whose fields should be something one can iterate on again in order to obtain - the individual examples. + for i in self.minibatches( minibatch_size = 1): + yield Example(i.keys(), [v[0] for v in i.values()]) + + def zip(self, *fieldnames): """ - raise NotImplementedError + Supports two forms of syntax: + + for i in dataset.zip(f1, f2, f3): ... + + for i1, i2, i3 in dataset.zip(f1, f2, f3): ... - def zip(self,*fieldnames): + Using the first syntax, "i" will be an indexable object, such as a list, + tuple, or Example instance, such that on every iteration, i[0] is the f1 + field of the current example, i[1] is the f2 field, and so on. + + Using the second syntax, i1, i2, i3 will contain the the contents of the + f1, f2, and f3 fields of a single example on each loop iteration. + + The derived class may accept fieldname arguments of any type. + """ - Return an iterator which sees only the specified fields (each fieldname is a - field key, typically a string). The value returned at each iteration - is a tuple with one element per field. Hence it can be used like this: - for f1, f2, f3 in dataset.zip('field1','field2','field3'): - ... use f1, f2, and f3 + for i in self.minibatches(fieldnames, minibatch_size = 1): + yield [f[0] for f in i] + + minibatches_fieldnames = None + minibatches_minibatch_size = 1 + minibatches_n_batches = None + def minibatches(self, + fieldnames = minibatches_fieldnames, + minibatch_size = minibatches_minibatch_size, + n_batches = minibatches_n_batches): """ - raise NotImplementedError + Supports two forms of syntax: + + for i in dataset.minibatches([f1, f2, f3],**kwargs): ... + + for i1, i2, i3 in dataset.minibatches([f1, f2, f3],**kwargs): ... + + Using the first syntax, "i" will be an indexable object, such as a list, + tuple, or Example instance, such that on every iteration, i[0] is a + list-like container of the f1 field of a batch current examples, i[1] is + a list-like container of the f2 field, etc. - def minibatches(self,minibatch_size,*fieldnames): + Using the second syntax, i1, i2, i3 will be list-like containers of the + f1, f2, and f3 fields of a batch of examples on each loop iteration. + + PARAMETERS + - fieldnames (list of any type, default None): + The loop variables i1, i2, i3 (in the example above) should contain the + f1, f2, and f3 fields of the current batch of examples. If None, the + derived class can choose a default, e.g. all fields. + + - minibatch_size (integer, default 1) + On every iteration, the variables i1, i2, i3 will have + exactly minibatch_size elements. e.g. len(i1) == minibatch_size + + - n_batches (integer, default None) + The iterator will loop exactly this many times, and then stop. If None, + the derived class can choose a default. If (-1), then the returned + iterator should support looping indefinitely. + + Note: A list-like container is something like a tuple, list, numpy.ndarray or + any other object that supports integer indexing and slicing. + """ - Similar to zip but iterate over minibatches. - Return a minibatch iterator, whose next() method returns an 'example' - whose fields are iteratable objects (which can iterate over the individual - values of that field in the minibatch). - """ - raise NotImplementedError + raise AbstractFunction() def fieldNames(self): + #Yoshua- + # This list may not be finite; what would make sense in the use you have + # in mind? + # -JB """Return the list of field names in the examples of this dataset.""" - raise NotImplementedError + raise AbstractFunction() def rename(*new_field_specifications): + #Yoshua- + # Do you mean for this to be a virtual method? + # Wouldn't this functionality be easier to provide via a + # RenamingDataSet, such as the one I've written below? + # -JB """ Return a new dataset that maps old fields (of self) to new fields (of the returned dataset). The minimal syntax that should be supported is the following: @@ -66,7 +130,31 @@ support additional indexing schemes within each field (e.g. column slice of a matrix-like field). """ - raise NotImplementedError + raise AbstractFunction() + +class RenamingDataSet(DataSet): + """A DataSet that wraps another one, and makes it look like the field names + are different + + Renaming is done by a dictionary that maps new names to the old ones used in + self.src. + """ + def __init__(self, src, rename_dct): + DataSet.__init__(self) + self.src = src + self.rename_dct = copy.copy(rename_dct) + + def minibatches(self, + fieldnames = DataSet.minibatches_fieldnames, + minibatch_size = DataSet.minibatches_minibatch_size, + n_batches = DataSet.minibatches_n_batches): + dct = self.rename_dct + new_fieldnames = [dct.get(f, f) for f in fieldnames] + return self.src.minibatches(new_fieldnames, minibatches_size, n_batches) + + def fieldNames(self): + return [dct.get(f, f) for f in self.src.fieldNames()] + class FiniteDataSet(DataSet): """ @@ -79,71 +167,113 @@ a subset of examples (e.g. for splitting a dataset into training and test sets). """ + class FiniteDataSetIterator(object): + """ + If the fieldnames list is empty, it means that we want to see ALL the fields. + """ + def __init__(self,dataset,minibatch_size=1,fieldnames=[]): + self.dataset=dataset + self.minibatch_size=minibatch_size + assert minibatch_size>=1 and minibatch_size<=len(dataset) + self.current = -self.minibatch_size + self.fieldnames = fieldnames + + def __iter__(self): + return self + + def next(self): + self.current+=self.minibatch_size + if self.current>=len(self.dataset): + self.current=-self.minibatch_size + raise StopIteration + if self.minibatch_size==1: + complete_example=self.dataset[self.current] + else: + complete_example=self.dataset[self.current:self.current+self.minibatch_size] + if self.fieldnames: + return Example(self.fieldnames,list(complete_example)) + else: + return complete_example + def __init__(self): pass - def __iter__(self): - return FiniteDataSetIterator(self) - - def zip(self,*fieldnames): - return FiniteDataSetIterator(self,1,fieldnames) + def minibatches(self, + fieldnames = DataSet.minibatches_fieldnames, + minibatch_size = DataSet.minibatches_minibatch_size, + n_batches = DataSet.minibatches_n_batches): + """ + If the fieldnames list is empty, it means that we want to see ALL the fields. - def minibatches(self,minibatch_size,*fieldnames): - return FiniteDataSetIterator(self,minibatch_size,fieldnames) + If the n_batches is empty, we want to see all the examples possible + for the give minibatch_size. + """ + # substitute the defaults: + if fieldnames is None: fieldnames = self.fieldNames() + if n_batches is None: n_batches = len(self) / minibatch_size + return DataSet.Iterator(self, fieldnames, minibatch_size, n_batches) def __getattr__(self,fieldname): """Return an that can iterate over the values of the field in this dataset.""" return self(fieldname) def __call__(self,*fieldnames): - """Return a sub-dataset containing only the given fieldnames as fields.""" - raise NotImplementedError + """Return a sub-dataset containing only the given fieldnames as fields. + + The return value's default iterator will iterate only over the given + fields. + """ + raise AbstractFunction() def __len__(self): """len(dataset) returns the number of examples in the dataset.""" - raise NotImplementedError + raise AbstractFunction() def __getitem__(self,i): """dataset[i] returns the (i+1)-th example of the dataset.""" - raise NotImplementedError + raise AbstractFunction() def __getslice__(self,*slice_args): """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.""" - raise NotImplementedError - -class FiniteDataSetIterator(object): - """ - If the fieldnames list is empty, it means that we want to see ALL the fields. - """ - def __init__(self,dataset,minibatch_size=1,fieldnames=[]): - self.dataset=dataset - self.minibatch_size=minibatch_size - assert minibatch_size>=1 and minibatch_size<=len(dataset) - self.current = -self.minibatch_size - self.fieldnames = fieldnames - - def __iter__(self): - return self - - def next(self): - self.current+=self.minibatch_size - if self.current>=len(self.dataset): - self.current=-self.minibatch_size - raise StopIteration - if self.minibatch_size==1: - complete_example=self.dataset[self.current] - else: - complete_example=self.dataset[self.current:self.current+self.minibatch_size] - if self.fieldnames: - return Example(self.fieldnames,list(complete_example)) - else: - return complete_example - + raise AbstractFunction() # we may want ArrayDataSet defined in another python file import numpy +def as_array_dataset(dataset): + # Generally datasets can be efficient by making data fields overlap, but + # this function doesn't know which fields overlap. So, it should check if + # dataset supports an as_array_dataset member function, and return that if + # possible. + if hasattr(dataset, 'as_array_dataset'): + return dataset.as_array_dataset() + + raise NotImplementedError() + + # Make ONE big minibatch with all the examples, to separate the fields. + n_examples = len(dataset) + batch = dataset.minibatches( minibatch_size = len(dataset)).next() + + # Each field of the underlying dataset must be convertible to a numpy array of the same type + # currently just double, but should use the smallest compatible dtype + n_fields = len(batch) + fieldnames = batch.fields.keys() + total_width = 0 + type = None + fields = LookupList() + for i in xrange(n_fields): + field = array(batch[i]) + assert field.shape[0]==n_examples + width = field.shape[1] + start=total_width + total_width += width + fields[fieldnames[i]]=slice(start,total_width,1) + # many complicated things remain to be done: + # - find common dtype + # - decide what to do with extra dimensions if not the same in all fields + # - try to see if we can avoid the copy? + class ArrayDataSet(FiniteDataSet): """ An ArrayDataSet behaves like a numpy array but adds the notion of named fields @@ -157,55 +287,105 @@ by the numpy.array(dataset) call. """ - def __init__(self,dataset=None,data=None,fields=None): + class Iterator(object): + """An iterator over a finite dataset that implements wrap-around""" + def __init__(self, dataset, fieldnames, minibatch_size, next_max): + self.dataset=dataset + self.fieldnames = fieldnames + self.minibatch_size=minibatch_size + self.next_count = 0 + self.next_max = next_max + self.current = -self.minibatch_size + assert minibatch_size > 0 + if minibatch_size >= len(dataset): + raise NotImplementedError() + + def __iter__(self): + #Why do we do this? -JB + return self + + @staticmethod + def matcat(a, b): + a0, a1 = a.shape + b0, b1 = b.shape + assert a1 == b1 + assert a.dtype is b.dtype + rval = numpy.empty( (a0 + b0, a1), dtype=a.dtype) + rval[:a0,:] = a + rval[a0:,:] = b + return rval + + def next(self): + + #check for end-of-loop + self.next_count += 1 + if self.next_count == self.next_max: + raise StopIteration + + #determine the first and last elements of the slice we'll return + self.current += self.minibatch_size + if self.current >= len(self.dataset): + self.current -= len(self.dataset) + upper = self.current + self.minibatch_size + + if upper <= len(self.dataset): + #this is the easy case, we only need once slice + dataview = self.dataset.data[self.current:upper] + else: + # the minibatch wraps around the end of the dataset + dataview = self.dataset.data[self.current:] + upper -= len(self.dataset) + assert upper > 0 + dataview = self.matcat(dataview, self.dataset.data[:upper]) + + + rval = [dataview[:, self.dataset.fields[f]] for f in self.fieldnames] + + if self.fieldnames: + rval = Example(self.fieldnames, rval) + + return rval + + + def __init__(self, data, fields=None): """ There are two ways to construct an ArrayDataSet: (1) from an existing dataset (which may result in a copy of the data in a numpy array), or (2) from a numpy.array (the data argument), along with an optional description of the fields (a LookupList of column slices indexed by field names). """ - if dataset!=None: - assert data==None and fields==None - # Make ONE big minibatch with all the examples, to separate the fields. - n_examples=len(dataset) - batch = dataset.minibatches(n_examples).next() - # Each field of the underlying dataset must be convertible to a numpy array of the same type - # currently just double, but should use the smallest compatible dtype - n_fields = len(batch) - fieldnames = batch.fields.keys() - total_width = 0 - type = None - fields = LookupList() - for i in xrange(n_fields): - field = array(batch[i]) - assert field.shape[0]==n_examples - width = field.shape[1] - start=total_width - total_width += width - fields[fieldnames[i]]=slice(start,total_width,1) - # many complicated things remain to be done: - # - find common dtype - # - decide what to do with extra dimensions if not the same in all fields - # - try to see if we can avoid the copy? - raise NotImplementedError - if data!=None: - assert dataset==None - self.data=data - self.fields=fields - self.width = data.shape[1] - if fields: - for fieldname,fieldslice in fields.items(): - # make sure fieldslice.start and fieldslice.step are defined - start=fieldslice.start - step=fieldslice.step - if not start: - start=0 - if not step: - step=1 - if not fieldslice.start or not fieldslice.step: - fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step) - # and coherent with the data array - assert fieldslice.start>=0 and fieldslice.stop<=self.width + self.data=data + self.fields=fields + rows, cols = data.shape + + if fields: + for fieldname,fieldslice in fields.items(): + # make sure fieldslice.start and fieldslice.step are defined + start=fieldslice.start + step=fieldslice.step + if not start: + start=0 + if not step: + step=1 + if not fieldslice.start or not fieldslice.step: + fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step) + # and coherent with the data array + assert fieldslice.start >= 0 and fieldslice.stop <= cols + + def minibatches(self, + fieldnames = DataSet.minibatches_fieldnames, + minibatch_size = DataSet.minibatches_minibatch_size, + n_batches = DataSet.minibatches_n_batches): + """ + If the fieldnames list is empty, it means that we want to see ALL the fields. + + If the n_batches is empty, we want to see all the examples possible + for the give minibatch_size. + """ + # substitute the defaults: + if fieldnames is None: fieldnames = self.fieldNames() + if n_batches is None: n_batches = len(self) / minibatch_size + return ArrayDataSet.Iterator(self, fieldnames, minibatch_size, n_batches) def __getattr__(self,fieldname): """ @@ -227,7 +407,7 @@ new_fields=LookupList() for fieldname,fieldslice in self.fields.items(): new_fields[fieldname]=slice(fieldslice.start-min_col,fieldslice.stop-min_col,fieldslice.step) - return ArrayDataSet(data=self.data[:,min_col:max_col],fields=new_fields) + return ArrayDataSet(self.data[:,min_col:max_col],fields=new_fields) def fieldNames(self): """Return the list of field names that are supported by getattr and getFields.""" @@ -248,11 +428,20 @@ else: return self.data[i] - def __getslice__(self,*slice_args): + def __getslice__(self,*args): """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.""" - return ArrayDataSet(data=self.data[apply(slice,slice_args)],fields=self.fields) + return ArrayDataSet(self.data.__getslice__(*args), fields=self.fields) def __array__(self): + """Return an view of this dataset which is an numpy.ndarray + + Numpy uses this special function name to retrieve an ndarray view for + function such as numpy.sum, numpy.dot, numpy.asarray, etc. + + If this dataset has no fields, then we simply return self.data, + otherwise things are complicated. + - why do we want this behaviour when there are fields? (JB) + """ if not self.fields: return self.data # else, select subsets of columns mapped by the fields
--- a/lookup_list.py Wed Mar 26 18:21:57 2008 -0400 +++ b/lookup_list.py Wed Mar 26 21:05:14 2008 -0400 @@ -22,10 +22,10 @@ self._name2index[names[i]]=i def keys(self): - return _names + return self._names def values(self): - return _values + return self._values def items(self): return zip(self._names,self._values)