# HG changeset patch # User bengioy@grenat.iro.umontreal.ca # Date 1207933731 14400 # Node ID 541a273bc89f91bb459c988ad48ac038c9b5d1cb # Parent e6c550cb2896af72e4993ce0e8dfcf5272af0074 Removed __array__ method from dataset, whose semantics did not have a clear use (because of the possibility of overlapping fields). diff -r e6c550cb2896 -r 541a273bc89f _test_dataset.py --- a/_test_dataset.py Fri Apr 11 11:16:09 2008 -0400 +++ b/_test_dataset.py Fri Apr 11 13:08:51 2008 -0400 @@ -48,16 +48,6 @@ a_y = a.y self.failUnless(numpy.all( a_y == arr[:,1:4])) - def test_asarray(self): - arr = numpy.random.rand(3,4) - a=ArrayDataSet(data=arr,fields={"x":slice(2),"y":slice(2,4)}) - a_arr = numpy.asarray(a) - self.failUnless(a_arr.shape[1] == 2 + 2) - self.failUnless(numpy.sum(numpy.square(a_arr-a.data))==0) - a=ArrayDataSet(data=arr,fields={"x":slice(2),"y":slice(1,4)}) - a_arr = numpy.asarray(a) - self.failUnless(a_arr.shape[1] == 2 + 3) - def test_minibatch_wraparound_even(self): arr = numpy.random.rand(10,4) arr2 = ArrayDataSet.Iterator.matcat(arr,arr) diff -r e6c550cb2896 -r 541a273bc89f dataset.py --- a/dataset.py Fri Apr 11 11:16:09 2008 -0400 +++ b/dataset.py Fri Apr 11 13:08:51 2008 -0400 @@ -37,8 +37,8 @@ Datasets of finite length should be sub-classes of FiniteLengthDataSet. - Datasets whose elements can be indexed and sub-datasets of consecutive - examples (i.e. slices) can be extracted from should be sub-classes of + Datasets whose elements can be indexed and whose sub-datasets (with a subset + of examples) can be extracted should be sub-classes of SliceableDataSet. Datasets with a finite number of fields should be sub-classes of @@ -230,8 +230,10 @@ Virtual interface, a subclass of DataSet for datasets which are sliceable and whose individual elements can be accessed, generally respecting the python semantics for [spec], where spec is either a non-negative integer - (for selecting one example), or a python slice (for selecting a sub-dataset - comprising the specified examples). This is useful for obtaining + (for selecting one example), a python slice(start,stop,step) for selecting a regular + sub-dataset comprising examples start,start+step,start+2*step,...,n (with n= 0 and fieldslice.stop <= cols + assert type(fieldslice) is int or isinstance(fieldslice,slice) or hasattr(fieldslice,"__iter__") + if hasattr(fieldslice,"__iter__"): # is a sequence + for i in fieldslice: + assert type(i) is int + elif isinstance(fieldslice,slice): + # make sure fieldslice.start and fieldslice.step are defined + start=fieldslice.start + step=fieldslice.step + if not start: + start=0 + if not step: + step=1 + if not fieldslice.start or not fieldslice.step: + fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step) + # and coherent with the data array + assert fieldslice.start >= 0 and fieldslice.stop <= cols def minibatches(self, fieldnames = DataSet.minibatches_fieldnames, @@ -469,15 +485,7 @@ def __call__(self,*fieldnames): """Return a sub-dataset containing only the given fieldnames as fields.""" - min_col=self.data.shape[1] - max_col=0 - for field_slice in self.fields.values(): - min_col=min(min_col,field_slice.start) - max_col=max(max_col,field_slice.stop) - new_fields=LookupList() - for fieldname,fieldslice in self.fields.items(): - new_fields[fieldname]=slice(fieldslice.start-min_col,fieldslice.stop-min_col,fieldslice.step) - return ArrayDataSet(self.data[:,min_col:max_col],fields=new_fields) + return ArrayDataSet(self.data,fields=LookupList(fieldnames,[self.fields[fieldname] for fieldname in fieldnames])) def fieldNames(self): """Return the list of field names that are supported by getattr and hasField.""" @@ -489,8 +497,11 @@ def __getitem__(self,i): """ - dataset[i] returns the (i+1)-th Example of the dataset. If there are no fields - the result is just a numpy array (for the i-th row of the dataset data matrix). + dataset[i] returns the (i+1)-th Example of the dataset. + If there are no fields the result is just a numpy array (for the i-th row of the dataset data matrix). + dataset[i:j] returns the subdataset with examples i,i+1,...,j-1. + dataset[i:j:s] returns the subdataset with examples i,i+2,i+4...,j-2. + dataset[[i1,i2,..,in]] returns the subdataset with examples i1,i2,...,in. """ if self.fields: fieldnames,fieldslices=zip(*self.fields.items()) @@ -499,36 +510,34 @@ return self.data[i] def __getslice__(self,*args): - """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.""" + """ + dataset[i:j] returns the subdataset with examples i,i+1,...,j-1. + dataset[i:j:s] returns the subdataset with examples i,i+2,i+4...,j-2. + """ return ArrayDataSet(self.data.__getslice__(*args), fields=self.fields) - def __array__(self): - """Return a view of this dataset which is an numpy.ndarray (i.e. losing - the identity and name of fields within the dataset). - - Numpy uses this special function name to retrieve an ndarray view for - function such as numpy.sum, numpy.dot, numpy.asarray, etc. - - If this dataset has no fields, then we simply return self.data, - otherwise things are complicated. - - why do we want this behaviour when there are fields? (JB) - - for convenience and completeness (but maybe it would make - more sense to implement this through a 'field-merging' - dataset). (YB) + def indices_of_unique_columns_used(self): + """ + Return the unique indices of the columns actually used by the fields, and a boolean + that signals (if True) that used columns overlap. If they do then the + indices are not repeated in the result. """ - if not self.fields: - return self.data - # else, select subsets of columns mapped by the fields columns_used = numpy.zeros((self.data.shape[1]),dtype=bool) - overlapping_fields = False - n_columns = 0 + overlapping_columns = False for field_slice in self.fields.values(): - for c in xrange(field_slice.start,field_slice.stop,field_slice.step): - n_columns += 1 - if columns_used[c]: overlapping_fields=True - columns_used[c]=True - # try to figure out if we can map all the slices into one slice: - mappable_to_one_slice = not overlapping_fields + if sum(columns_used[field_slice])>0: overlapping_columns=True + columns_used[field_slice]=True + return [i for i,used in enumerate(columns_used) if used],overlapping_columns + + def slice_of_unique_columns_used(self): + """ + Return None if the indices_of_unique_columns_used do not form a slice. If they do, + return that slice. It means that the columns used can be extracted + from the data array without making a copy. If the fields overlap + but their unique columns used form a slice, still return that slice. + """ + columns_used,overlapping_columns = self.indices_of_columns_used() + mappable_to_one_slice = True if not overlapping_fields: start=0 while start