pylearn: comparison of dataset.py @ 28:541a273bc89f
Removed the __array__ method from dataset, whose
semantics had no clear use (because of the
possibility of overlapping fields).
author    bengioy@grenat.iro.umontreal.ca
date      Fri, 11 Apr 2008 13:08:51 -0400
parents   672fe4b23032
children  46c5c90019c2
27:e6c550cb2896 | 28:541a273bc89f |
---|---|
35 values can be computed on-demand, when particular field names are used in one of the | 35 values can be computed on-demand, when particular field names are used in one of the |
36 iterators). | 36 iterators). |
37 | 37 |
38 Datasets of finite length should be sub-classes of FiniteLengthDataSet. | 38 Datasets of finite length should be sub-classes of FiniteLengthDataSet. |
39 | 39 |
40 Datasets whose elements can be indexed and sub-datasets of consecutive | 40 Datasets whose elements can be indexed and whose sub-datasets (with a subset |
41 examples (i.e. slices) can be extracted from should be sub-classes of | 41 of examples) can be extracted should be sub-classes of |
42 SliceableDataSet. | 42 SliceableDataSet. |
43 | 43 |
44 Datasets with a finite number of fields should be sub-classes of | 44 Datasets with a finite number of fields should be sub-classes of |
45 FiniteWidthDataSet. | 45 FiniteWidthDataSet. |
46 """ | 46 """ |
228 class SliceableDataSet(DataSet): | 228 class SliceableDataSet(DataSet): |
229 """ | 229 """ |
230 Virtual interface, a subclass of DataSet for datasets which are sliceable | 230 Virtual interface, a subclass of DataSet for datasets which are sliceable |
231 and whose individual elements can be accessed, generally respecting the | 231 and whose individual elements can be accessed, generally respecting the |
232 python semantics for [spec], where spec is either a non-negative integer | 232 python semantics for [spec], where spec is either a non-negative integer |
233 (for selecting one example), or a python slice (for selecting a sub-dataset | 233 (for selecting one example), a python slice(start,stop,step) for selecting a regular |
234 comprising the specified examples). This is useful for obtaining | 234 sub-dataset comprising examples start,start+step,start+2*step,...,n (with n<stop), or a |
235 sequence (e.g. a list) of integers [i1,i2,...,in] for selecting | |
236 an arbitrary subset of examples. This is useful for obtaining | |
235 sub-datasets, e.g. for splitting a dataset into training and test sets. | 237 sub-datasets, e.g. for splitting a dataset into training and test sets. |
236 """ | 238 """ |
237 def __init__(self): | 239 def __init__(self): |
238 DataSet.__init__(self) | 240 DataSet.__init__(self) |
239 | 241 |
248 # substitute the defaults: | 250 # substitute the defaults: |
249 if n_batches is None: n_batches = len(self) / minibatch_size | 251 if n_batches is None: n_batches = len(self) / minibatch_size |
250 return DataSet.Iterator(self, fieldnames, minibatch_size, n_batches) | 252 return DataSet.Iterator(self, fieldnames, minibatch_size, n_batches) |
251 | 253 |
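
The default above gives one pass over as many full minibatches as fit in the dataset; note that `/` on Python 2 integers truncates, so a trailing partial batch is silently dropped. A small arithmetic sketch (sizes and field names are assumptions):

    n_examples = 100                          # hypothetical len(self)
    minibatch_size = 20
    n_batches = n_examples // minibatch_size  # 5 full batches; 101 examples
                                              # would still give 5, dropping one
    # typical hypothetical call:
    #   dataset.minibatches(['input', 'target'], minibatch_size=20)
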
252 def __getitem__(self,i): | 254 def __getitem__(self,i): |
253 """dataset[i] returns the (i+1)-th example of the dataset.""" | 255 """ |
256 dataset[i] returns the (i+1)-th example of the dataset. | |
257 dataset[i:j] returns the subdataset with examples i,i+1,...,j-1. | |
258 dataset[i:j:s] returns the subdataset with examples i,i+s,i+2*s,... (all less than j). | 
259 dataset[[i1,i2,..,in]] returns the subdataset with examples i1,i2,...,in. | |
260 """ | |
254 raise AbstractFunction() | 261 raise AbstractFunction() |
255 | 262 |
256 def __getslice__(self,*slice_args): | 263 def __getslice__(self,*slice_args): |
257 """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.""" | 264 """ |
265 dataset[i:j] returns the subdataset with examples i,i+1,...,j-1. | |
266 dataset[i:j:s] returns the subdataset with examples i,i+s,i+2*s,... (all less than j). | 
267 """ | |
258 raise AbstractFunction() | 268 raise AbstractFunction() |
259 | 269 |
260 | 270 |
261 class FiniteWidthDataSet(DataSet): | 271 class FiniteWidthDataSet(DataSet): |
262 """ | 272 """ |
346 An ArrayDataSet behaves like a numpy array but adds the notion of named fields | 356 An ArrayDataSet behaves like a numpy array but adds the notion of named fields |
347 from DataSet (and the ability to view the values of multiple fields as an 'Example'). | 357 from DataSet (and the ability to view the values of multiple fields as an 'Example'). |
348 It is a fixed-length and fixed-width dataset | 358 It is a fixed-length and fixed-width dataset |
349 in which each element is a fixed dimension numpy array or a number, hence the whole | 359 in which each element is a fixed dimension numpy array or a number, hence the whole |
350 dataset corresponds to a numpy array. Fields | 360 dataset corresponds to a numpy array. Fields |
351 must correspond to a slice of array columns. If the dataset has fields, | 361 must correspond to a slice of array columns or to a list of column numbers. |
362 If the dataset has fields, | |
352 each 'example' is just a one-row ArrayDataSet, otherwise it is a numpy array. | 363 each 'example' is just a one-row ArrayDataSet, otherwise it is a numpy array. |
353 Any dataset can also be converted to a numpy array (losing the notion of fields) | 364 Any dataset can also be converted to a numpy array (losing the notion of fields) |
354 by the numpy.array(dataset) call. | 365 by the numpy.array(dataset) call. |
355 """ | 366 """ |
356 | 367 |
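
A hypothetical construction sketch for ArrayDataSet (field names and the column layout are assumptions; LookupList(names, values) is the pairing used later in this file, and both classes are assumed to be in scope from this module):

    import numpy

    data = numpy.random.randn(100, 4)            # 100 examples, 4 columns
    fields = LookupList(['input', 'target'],
                        [slice(0, 3), [3]])      # a column slice and a column list
    dataset = ArrayDataSet(data, fields=fields)
    example = dataset[0]      # an Example with fields 'input' and 'target'
    subset = dataset[10:20]   # a 10-example ArrayDataSet with the same fields
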
394 #check for end-of-loop | 405 #check for end-of-loop |
395 self.next_count += 1 | 406 self.next_count += 1 |
396 if self.next_count == self.next_max: | 407 if self.next_count == self.next_max: |
397 raise StopIteration | 408 raise StopIteration |
398 | 409 |
399 #determine the first and last elements of the slice we'll return | 410 #determine the first and last elements of the minibatch slice we'll return |
400 n_rows = self.dataset.data.shape[0] | 411 n_rows = self.dataset.data.shape[0] |
401 self.current = self.next_index() | 412 self.current = self.next_index() |
402 upper = self.current + self.minibatch_size | 413 upper = self.current + self.minibatch_size |
403 | 414 |
404 data = self.dataset.data | 415 data = self.dataset.data |
421 def __init__(self, data, fields=None): | 432 def __init__(self, data, fields=None): |
422 """ | 433 """ |
423 There are two ways to construct an ArrayDataSet: (1) from an | 434 There are two ways to construct an ArrayDataSet: (1) from an |
424 existing dataset (which may result in a copy of the data in a numpy array), | 435 existing dataset (which may result in a copy of the data in a numpy array), |
425 or (2) from a numpy.array (the data argument), along with an optional description | 436 or (2) from a numpy.array (the data argument), along with an optional description |
426 of the fields (a LookupList of column slices indexed by field names). | 437 of the fields (a LookupList of column slices (or column lists) indexed by field names). |
427 """ | 438 """ |
428 self.data=data | 439 self.data=data |
429 self.fields=fields | 440 self.fields=fields |
430 rows, cols = data.shape | 441 rows, cols = data.shape |
431 | 442 |
432 if fields: | 443 if fields: |
433 for fieldname,fieldslice in fields.items(): | 444 for fieldname,fieldslice in fields.items(): |
434 # make sure fieldslice.start and fieldslice.step are defined | 445 assert type(fieldslice) is int or isinstance(fieldslice,slice) or hasattr(fieldslice,"__iter__") |
435 start=fieldslice.start | 446 if hasattr(fieldslice,"__iter__"): # is a sequence |
436 step=fieldslice.step | 447 for i in fieldslice: |
437 if not start: | 448 assert type(i) is int |
438 start=0 | 449 elif isinstance(fieldslice,slice): |
439 if not step: | 450 # make sure fieldslice.start and fieldslice.step are defined |
440 step=1 | 451 start=fieldslice.start |
441 if not fieldslice.start or not fieldslice.step: | 452 step=fieldslice.step |
442 fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step) | 453 if not start: |
443 # and coherent with the data array | 454 start=0 |
444 assert fieldslice.start >= 0 and fieldslice.stop <= cols | 455 if not step: |
456 step=1 | |
457 if not fieldslice.start or not fieldslice.step: | |
458 fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step) | |
459 # and coherent with the data array | |
460 assert fieldslice.start >= 0 and fieldslice.stop <= cols | |
445 | 461 |
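
The slice branch above only fills in defaults and checks bounds; a standalone equivalent (normalize_field_slice is a hypothetical helper, not part of this file):

    def normalize_field_slice(fieldslice, n_cols):
        """Fill in the default start (0) and step (1) of a field slice,
        then check that it stays within the data columns."""
        start = fieldslice.start or 0
        step = fieldslice.step or 1
        normalized = slice(start, fieldslice.stop, step)
        assert normalized.start >= 0 and normalized.stop <= n_cols
        return normalized

    normalize_field_slice(slice(None, 3, None), 4)   # -> slice(0, 3, 1)
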
446 def minibatches(self, | 462 def minibatches(self, |
447 fieldnames = DataSet.minibatches_fieldnames, | 463 fieldnames = DataSet.minibatches_fieldnames, |
448 minibatch_size = DataSet.minibatches_minibatch_size, | 464 minibatch_size = DataSet.minibatches_minibatch_size, |
449 n_batches = DataSet.minibatches_n_batches): | 465 n_batches = DataSet.minibatches_n_batches): |
467 return self.data[0,self.fields[fieldname]] | 483 return self.data[0,self.fields[fieldname]] |
468 return self.data[:,self.fields[fieldname]] | 484 return self.data[:,self.fields[fieldname]] |
469 | 485 |
470 def __call__(self,*fieldnames): | 486 def __call__(self,*fieldnames): |
471 """Return a sub-dataset containing only the given fieldnames as fields.""" | 487 """Return a sub-dataset containing only the given fieldnames as fields.""" |
472 min_col=self.data.shape[1] | 488 return ArrayDataSet(self.data,fields=LookupList(fieldnames,[self.fields[fieldname] for fieldname in fieldnames])) |
473 max_col=0 | |
474 for field_slice in self.fields.values(): | |
475 min_col=min(min_col,field_slice.start) | |
476 max_col=max(max_col,field_slice.stop) | |
477 new_fields=LookupList() | |
478 for fieldname,fieldslice in self.fields.items(): | |
479 new_fields[fieldname]=slice(fieldslice.start-min_col,fieldslice.stop-min_col,fieldslice.step) | |
480 return ArrayDataSet(self.data[:,min_col:max_col],fields=new_fields) | |
481 | 489 |
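
The rewritten __call__ no longer re-bases columns: it returns an ArrayDataSet that shares self.data and simply carries the requested subset of field entries, which stay valid because they still index the full-width array. A hypothetical usage line (field name assumed):

    inputs_only = dataset('input')   # shares dataset.data; one field remains
    # inputs_only.fieldNames() -> ['input']
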
482 def fieldNames(self): | 490 def fieldNames(self): |
483 """Return the list of field names that are supported by getattr and hasField.""" | 491 """Return the list of field names that are supported by getattr and hasField.""" |
484 return self.fields.keys() | 492 return self.fields.keys() |
485 | 493 |
487 """len(dataset) returns the number of examples in the dataset.""" | 495 """len(dataset) returns the number of examples in the dataset.""" |
488 return len(self.data) | 496 return len(self.data) |
489 | 497 |
490 def __getitem__(self,i): | 498 def __getitem__(self,i): |
491 """ | 499 """ |
492 dataset[i] returns the (i+1)-th Example of the dataset. If there are no fields | 500 dataset[i] returns the (i+1)-th Example of the dataset. |
493 the result is just a numpy array (for the i-th row of the dataset data matrix). | 501 If there are no fields the result is just a numpy array (for the i-th row of the dataset data matrix). |
502 dataset[i:j] returns the subdataset with examples i,i+1,...,j-1. | |
503 dataset[i:j:s] returns the subdataset with examples i,i+s,i+2*s,... (all less than j). | 
504 dataset[[i1,i2,..,in]] returns the subdataset with examples i1,i2,...,in. | |
494 """ | 505 """ |
495 if self.fields: | 506 if self.fields: |
496 fieldnames,fieldslices=zip(*self.fields.items()) | 507 fieldnames,fieldslices=zip(*self.fields.items()) |
497 return Example(self.fields.keys(),[self.data[i,fieldslice] for fieldslice in self.fields.values()]) | 508 return Example(self.fields.keys(),[self.data[i,fieldslice] for fieldslice in self.fields.values()]) |
498 else: | 509 else: |
499 return self.data[i] | 510 return self.data[i] |
500 | 511 |
501 def __getslice__(self,*args): | 512 def __getslice__(self,*args): |
502 """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.""" | 513 """ |
514 dataset[i:j] returns the subdataset with examples i,i+1,...,j-1. | |
515 dataset[i:j:s] returns the subdataset with examples i,i+s,i+2*s,... (all less than j). | 
516 """ | |
503 return ArrayDataSet(self.data.__getslice__(*args), fields=self.fields) | 517 return ArrayDataSet(self.data.__getslice__(*args), fields=self.fields) |
504 | 518 |
505 def __array__(self): | 519 def indices_of_unique_columns_used(self): |
506 """Return a view of this dataset which is an numpy.ndarray (i.e. losing | 520 """ |
507 the identity and name of fields within the dataset). | 521 Return the unique indices of the columns actually used by the fields, and a boolean |
508 | 522 that signals (if True) that used columns overlap. If they do then the |
509 Numpy uses this special function name to retrieve an ndarray view for | 523 indices are not repeated in the result. |
510 function such as numpy.sum, numpy.dot, numpy.asarray, etc. | 524 """ |
511 | |
512 If this dataset has no fields, then we simply return self.data, | |
513 otherwise things are complicated. | |
514 - why do we want this behaviour when there are fields? (JB) | |
515 - for convenience and completeness (but maybe it would make | |
516 more sense to implement this through a 'field-merging' | |
517 dataset). (YB) | |
518 """ | |
519 if not self.fields: | |
520 return self.data | |
521 # else, select subsets of columns mapped by the fields | |
522 columns_used = numpy.zeros((self.data.shape[1]),dtype=bool) | 525 columns_used = numpy.zeros((self.data.shape[1]),dtype=bool) |
523 overlapping_fields = False | 526 overlapping_columns = False |
524 n_columns = 0 | |
525 for field_slice in self.fields.values(): | 527 for field_slice in self.fields.values(): |
526 for c in xrange(field_slice.start,field_slice.stop,field_slice.step): | 528 if numpy.sum(columns_used[field_slice])>0: overlapping_columns=True |
527 n_columns += 1 | 529 columns_used[field_slice]=True |
528 if columns_used[c]: overlapping_fields=True | 530 return [i for i,used in enumerate(columns_used) if used],overlapping_columns |
529 columns_used[c]=True | 531 |
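
The mask technique used by indices_of_unique_columns_used, as a self-contained sketch (unique_columns_used is a hypothetical standalone function; it assumes slice- or list-valued fields):

    import numpy

    def unique_columns_used(field_slices, n_cols):
        """Mark every column any field touches; a column marked twice
        means fields overlap, but it appears only once in the result."""
        used = numpy.zeros(n_cols, dtype=bool)
        overlap = False
        for s in field_slices:
            if used[s].sum() > 0:   # some column of s was already marked
                overlap = True
            used[s] = True
        return [i for i, u in enumerate(used) if u], overlap

    unique_columns_used([slice(0, 3), slice(2, 4)], 5)   # -> ([0, 1, 2, 3], True)
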
530 # try to figure out if we can map all the slices into one slice: | 532 def slice_of_unique_columns_used(self): |
531 mappable_to_one_slice = not overlapping_fields | 533 """ |
534 Return None if the indices returned by indices_of_unique_columns_used do not form | 
535 a slice. If they do, return that slice: it means that the columns used can be extracted | 
536 from the data array without making a copy. If the fields overlap | |
537 but their unique columns used form a slice, still return that slice. | |
538 """ | |
539 columns_used,overlapping_columns = self.indices_of_unique_columns_used() | 
540 mappable_to_one_slice = True | |
532 if not overlapping_fields: | 541 if not overlapping_columns: |
533 start=0 | 542 start=0 |
534 while start<len(columns_used) and not columns_used[start]: | 543 while start<len(columns_used) and not columns_used[start]: |
535 start+=1 | 544 start+=1 |
536 stop=len(columns_used) | 545 stop=len(columns_used) |
547 mappable_to_one_slice = False | 556 mappable_to_one_slice = False |
548 break | 557 break |
549 else: | 558 else: |
550 step = j-i | 559 step = j-i |
551 i=j | 560 i=j |
552 if mappable_to_one_slice: | 561 return slice(start,stop,step) if mappable_to_one_slice else None |
553 return self.data[:,slice(start,stop,step)] | 562 |
554 # else make contiguous copy (copying the overlapping columns) | |
555 result = numpy.zeros((len(self.data),n_columns)+self.data.shape[2:],self.data.dtype) | |
556 c=0 | |
557 for field_slice in self.fields.values(): | |
558 slice_width=(field_slice.stop-field_slice.start)/field_slice.step | |
559 # copy the field here | |
560 result[:,slice(c,c+slice_width)]=self.data[:,field_slice] | |
561 c+=slice_width | |
562 return result | |
563 | |
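
Part of slice_of_unique_columns_used is elided in this hunk; the test it describes, whether a sorted list of unique column indices forms a single arithmetic progression (and hence one slice), can be sketched self-containedly (indices_as_slice is hypothetical):

    def indices_as_slice(indices):
        """Return a slice covering exactly `indices` when they form an
        arithmetic progression, else None."""
        if not indices:
            return None
        if len(indices) == 1:
            return slice(indices[0], indices[0] + 1, 1)
        step = indices[1] - indices[0]
        for previous, current in zip(indices[:-1], indices[1:]):
            if current - previous != step:
                return None
        return slice(indices[0], indices[-1] + 1, step)

    indices_as_slice([0, 2, 4, 6])   # -> slice(0, 7, 2)
    indices_as_slice([0, 1, 5])      # -> None
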
564 class ApplyFunctionDataSet(DataSet): | 563 class ApplyFunctionDataSet(DataSet): |
565 """ | 564 """ |
566 A dataset that contains as fields the results of applying | 565 A dataset that contains as fields the results of applying |
567 a given function (example-wise) to specified input_fields of a source | 566 a given function (example-wise) to specified input_fields of a source |
568 dataset. The function should return a sequence whose elements will be stored in | 567 dataset. The function should return a sequence whose elements will be stored in |