# HG changeset patch # User bengioy@bengiomac.local # Date 1206379215 14400 # Node ID 6f8f338686db42cd171c41a0cbaa3a880e2bec8d # Parent d5738b79089a6446846ca69a7757698765f416ac Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset. diff -r d5738b79089a -r 6f8f338686db _test_dataset.py --- a/_test_dataset.py Mon Mar 24 09:04:06 2008 -0400 +++ b/_test_dataset.py Mon Mar 24 13:20:15 2008 -0400 @@ -16,14 +16,24 @@ a=ArrayDataSet(data=numpy.random.rand(8,3),fields={"x":slice(2),"y":slice(1,3)},minibatch_size=1) s=0 for example in a: + print len(example), example.x s+=_sum_all(example.x) print s - self.failUnless(abs(s-11.4674133)<1e-6) + self.failUnless(abs(s-7.25967597)<1e-6) + + def test1(self): + a=ArrayDataSet(data=numpy.random.rand(10,4),fields={"x":slice(2),"y":slice(1,4)},minibatch_size=1) a.minibatch_size=2 + print a.asarray() for mb in a: + print mb,mb.asarray() + print "a.y=",a.y + for mb in ArrayDataSet(data=a.y,minibatch_size=2): print mb - - + for e in mb: + print e + self.failUnless(True) + if __name__ == '__main__': unittest.main() diff -r d5738b79089a -r 6f8f338686db dataset.py --- a/dataset.py Mon Mar 24 09:04:06 2008 -0400 +++ b/dataset.py Mon Mar 24 13:20:15 2008 -0400 @@ -20,14 +20,11 @@ self.minibatch_size=minibatch_size def __iter__(self): - return self - - def next(self): """ - Return the next example or the next minibatch in the dataset. - A minibatch (of length > 1) should be something one can iterate on again in order - to obtain the individual examples. If the dataset has fields, - then the example or the minibatch must have the same fields + Return an iterator, whose next() method returns the next example or the next + minibatch in the dataset. A minibatch (of length > 1) should be something one + can iterate on again in order to obtain the individual examples. If the dataset + has fields, then the example or the minibatch must have the same fields (typically this is implemented by returning another (small) dataset, when there are fields). """ @@ -55,6 +52,9 @@ def __init__(self,minibatch_size): DataSet.__init__(self,minibatch_size) + def __iter__(self): + return FiniteDataSetIterator(self) + def __len__(self): """len(dataset) returns the number of examples in the dataset.""" raise NotImplementedError @@ -67,6 +67,35 @@ """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.""" raise NotImplementedError +class FiniteDataSetIterator(object): + def __init__(self,dataset): + self.dataset=dataset + self.current = -self.dataset.minibatch_size + + def next(self): + """ + Return the next example(s) in the dataset. If self.dataset.minibatch_size>1 return that + many examples. If the dataset has fields, the example or the minibatch of examples + is just a minibatch_size-rows ArrayDataSet (so that the fields can be accessed), + but that resulting mini-dataset has a minibatch_size of 1, so that one can iterate + example-wise on it. On the other hand, if the dataset has no fields (e.g. because + it is already the field of a bigger dataset), then the returned example or minibatch + may be any indexable object, such as a numpy array. Following the array semantics of indexing + and slicing, if the minibatch_size is 1 (and there are no fields), then the result is an array + with one less dimension (e.g., a vector, if the dataset is a matrix), corresponding + to a row. Again, if the minibatch_size is >1, one can iterate on the result to + obtain individual examples (as rows). + """ + self.current+=self.dataset.minibatch_size + if self.current>=len(self.dataset): + self.current=-self.dataset.minibatch_size + raise StopIteration + if self.dataset.minibatch_size==1: + return self.dataset[self.current] + else: + return self.dataset[self.current:self.current+self.dataset.minibatch_size] + + # we may want ArrayDataSet defined in another python file import numpy @@ -88,7 +117,6 @@ a dictionary of column slices indexed by field names). """ FiniteDataSet.__init__(self,minibatch_size) - self.current_row=-1 # used for view of this dataset as an iterator if dataset!=None: assert data==None and fields=={} # convert dataset to an ArrayDataSet @@ -108,43 +136,20 @@ if not step: step=1 if not fieldslice.start or not fieldslice.step: - fieldslice = slice(start,fieldslice.stop,step) + fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step) # and coherent with the data array assert fieldslice.start>=0 and fieldslice.stop<=self.width assert minibatch_size<=len(self.data) - def next(self): - """ - Return the next example(s) in the dataset. If self.minibatch_size>1 return that - many examples. If the dataset has fields, the example or the minibatch of examples - is just a minibatch_size-rows ArrayDataSet (so that the fields can be accessed), - but that resulting mini-dataset has a minibatch_size of 1, so that one can iterate - example-wise on it. On the other hand, if the dataset has no fields (e.g. because - it is already the field of a bigger dataset), then the returned example or minibatch - is a numpy array. Following the array semantics of indexing and slicing, - if the minibatch_size is 1 (and there are no fields), then the result is an array - with one less dimension (e.g., a vector, if the dataset is a matrix), corresponding - to a row. Again, if the minibatch_size is >1, one can iterate on the result to - obtain individual examples (as rows). + def __getattr__(self,fieldname): """ - if self.fields: - self.current_row+=self.minibatch_size - if self.current_row>=len(self.data): - self.current_row=-self.minibatch_size - raise StopIteration - if self.minibatch_size==1: - return self[self.current_row] - else: - return self[self.current_row:self.current_row+self.minibatch_size] - else: - if self.minibatch_size==1: - return self.data[self.current_row] - else: - return self.data[self.current_row:self.current_row+self.minibatch_size] - - def __getattr__(self,fieldname): - """Return a numpy array with the content associated with the given field name.""" - return self.data[self.fields[fieldname]] + Return a numpy array with the content associated with the given field name. + If this is a one-example dataset, then a row, i.e., numpy array (of one less dimension + than the dataset.data) is returned. + """ + if len(self.data)==1: + return self.data[0,self.fields[fieldname]] + return self.data[:,self.fields[fieldname]] def __call__(self,*fieldnames): """Return a sub-dataset containing only the given fieldnames as fields.""" @@ -176,49 +181,51 @@ return ArrayDataSet(data=data[slice],fields=self.fields) return ArrayDataSet(data=self.data[i:i+1],fields=self.fields) else: - return data[i] + return self.data[i] def __getslice__(self,*slice_args): """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.""" return ArrayDataSet(data=self.data[apply(slice,slice_args)],fields=self.fields) def asarray(self): - if self.fields: - columns_used = numpy.zeros((self.data.shape[1]),dtype=bool) - for field_slice in self.fields.values(): - for c in xrange(field_slice.start,field_slice.stop,field_slice.step): - columns_used[c]=True - # try to figure out if we can map all the slices into one slice: - mappable_to_one_slice = True - start=0 - while start0 and not columns_used[stop-1]: - stop-=1 - step=0 - i=start - while i0 and not columns_used[stop-1]: + stop-=1 + step=0 + i=start + while i