Mercurial > pylearn
view dataset.py @ 15:88168361a5ab
comment re: ArrayDataSet.__array__
author   | bergstrj@iro.umontreal.ca
date     | Tue, 25 Mar 2008 13:38:51 -0400
parents  | de616c423dbd
children | 813723310d75
line source
class DataSet(object):
    """
    This is a virtual base class or interface for datasets.
    A dataset is basically an iterator over examples. It does not necessarily
    have a fixed length (this is useful for 'streams' which feed on-line learning).
    Datasets with fixed and known length are FiniteDataSet, a subclass of DataSet.
    Examples and datasets optionally have named fields.
    One can obtain a sub-dataset by taking dataset.field or dataset(field1,field2,field3,...).
    Fields are not mutually exclusive, i.e. two fields can overlap in their actual content.
    The content of a field can be of any type, but often will be a numpy array.
    The minibatch_size attribute, if different than 1, means that the iterator (next() method)
    returns not a single example but an array of length minibatch_size, i.e., an indexable
    object with minibatch_size examples in it.
    """

    def __init__(self,minibatch_size=1):
        assert minibatch_size>0
        self.minibatch_size=minibatch_size

    def __iter__(self):
        """
        Return an iterator, whose next() method returns the next example or the next
        minibatch in the dataset. A minibatch (of length > 1) should be something one
        can iterate on again in order to obtain the individual examples. If the dataset
        has fields, then the example or the minibatch must have the same fields
        (typically this is implemented by returning another smaller dataset, when there
        are fields).
        """
        raise NotImplementedError

    def __getattr__(self,fieldname):
        """Return a sub-dataset containing only the given fieldname as field."""
        return self(fieldname)

    def __call__(self,*fieldnames):
        """Return a sub-dataset containing only the given fieldnames as fields."""
        raise NotImplementedError

    def fieldNames(self):
        """Return the list of field names that are supported by getattr and getFields."""
        raise NotImplementedError


class FiniteDataSet(DataSet):
    """
    Virtual interface, a subclass of DataSet for datasets which have a finite, known length.
    Examples are indexed by an integer between 0 and self.length()-1,
    and a subdataset can be obtained by slicing.
    """

    def __init__(self,minibatch_size):
        DataSet.__init__(self,minibatch_size)

    def __iter__(self):
        return FiniteDataSetIterator(self)

    def __len__(self):
        """len(dataset) returns the number of examples in the dataset."""
        raise NotImplementedError

    def __getitem__(self,i):
        """dataset[i] returns the (i+1)-th example of the dataset."""
        raise NotImplementedError

    def __getslice__(self,*slice_args):
        """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1."""
        raise NotImplementedError


class FiniteDataSetIterator(object):

    def __init__(self,dataset):
        self.dataset=dataset
        self.current = -self.dataset.minibatch_size

    def next(self):
        """
        Return the next example(s) in the dataset. If self.dataset.minibatch_size>1 return that
        many examples. If the dataset has fields, the example or the minibatch of examples
        is just a minibatch_size-rows ArrayDataSet (so that the fields can be accessed),
        but that resulting mini-dataset has a minibatch_size of 1, so that one can iterate
        example-wise on it. On the other hand, if the dataset has no fields (e.g. because
        it is already the field of a bigger dataset), then the returned example or minibatch
        may be any indexable object, such as a numpy array. Following the array semantics
        of indexing and slicing, if the minibatch_size is 1 (and there are no fields), then
        the result is an array with one less dimension (e.g., a vector, if the dataset is
        a matrix), corresponding to a row. Again, if the minibatch_size is >1, one can
        iterate on the result to obtain individual examples (as rows).
        """
        self.current+=self.dataset.minibatch_size
        if self.current>=len(self.dataset):
            self.current=-self.dataset.minibatch_size
            raise StopIteration
        if self.dataset.minibatch_size==1:
            return self.dataset[self.current]
        else:
            return self.dataset[self.current:self.current+self.dataset.minibatch_size]
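
# ----------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the toy subclass
# below only demonstrates how the FiniteDataSet / FiniteDataSetIterator
# contract above is meant to be used: implementing __len__, __getitem__
# and __getslice__ is enough for (minibatch) iteration to work. The name
# ToyDataSet and the example data are hypothetical, chosen purely for
# illustration.
class ToyDataSet(FiniteDataSet):

    def __init__(self,examples,minibatch_size=1):
        FiniteDataSet.__init__(self,minibatch_size)
        self.examples=examples

    def __len__(self):
        return len(self.examples)

    def __getitem__(self,i):
        return self.examples[i]

    def __getslice__(self,*slice_args):
        return self.examples[apply(slice,slice_args)]

# With minibatch_size=2, iterating over ToyDataSet(range(6),2) yields the
# minibatches [0, 1], [2, 3] and [4, 5]:
#   for minibatch in ToyDataSet(range(6),2):
#       print minibatch
# ----------------------------------------------------------------------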
""" self.current+=self.dataset.minibatch_size if self.current>=len(self.dataset): self.current=-self.dataset.minibatch_size raise StopIteration if self.dataset.minibatch_size==1: return self.dataset[self.current] else: return self.dataset[self.current:self.current+self.dataset.minibatch_size] # we may want ArrayDataSet defined in another python file import numpy class ArrayDataSet(FiniteDataSet): """ An ArrayDataSet behaves like a numpy array but adds the notion of fields and minibatch_size from DataSet. It is a fixed-length and fixed-width dataset in which each element is a numpy array or a number, hence the whole dataset corresponds to a numpy array. Fields must correspond to a slice of array columns. If the dataset has fields, each 'example' is just a one-row ArrayDataSet, otherwise it is a numpy array. Any dataset can also be converted to a numpy array (losing the notion of fields and of minibatch_size) by the numpy.array(dataset) call. """ def __init__(self,dataset=None,data=None,fields={},minibatch_size=1): """ There are two ways to construct an ArrayDataSet: (1) from an existing dataset (which may result in a copy of the data in a numpy array), or (2) from a numpy.array (the data argument), along with an optional description of the fields (dictionary of column slices indexed by field names). """ FiniteDataSet.__init__(self,minibatch_size) if dataset!=None: assert data==None and fields=={} # convert dataset to an ArrayDataSet raise NotImplementedError if data!=None: assert dataset==None self.data=data self.fields=fields self.width = data.shape[1] for fieldname in fields: fieldslice=fields[fieldname] # make sure fieldslice.start and fieldslice.step are defined start=fieldslice.start step=fieldslice.step if not start: start=0 if not step: step=1 if not fieldslice.start or not fieldslice.step: fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step) # and coherent with the data array assert fieldslice.start>=0 and fieldslice.stop<=self.width assert minibatch_size<=len(self.data) def __getattr__(self,fieldname): """ Return a numpy array with the content associated with the given field name. If this is a one-example dataset, then a row, i.e., numpy array (of one less dimension than the dataset.data) is returned. """ if len(self.data)==1: return self.data[0,self.fields[fieldname]] return self.data[:,self.fields[fieldname]] def __call__(self,*fieldnames): """Return a sub-dataset containing only the given fieldnames as fields.""" min_col=self.data.shape[1] max_col=0 for field_slice in self.fields.values(): min_col=min(min_col,field_slice.start) max_col=max(max_col,field_slice.stop) new_fields={} for field in self.fields: new_fields[field[0]]=slice(field[1].start-min_col,field[1].stop-min_col,field[1].step) return ArrayDataSet(data=self.data[:,min_col:max_col],fields=new_fields,minibatch_size=self.minibatch_size) def fieldNames(self): """Return the list of field names that are supported by getattr and getFields.""" return self.fields.keys() def __len__(self): """len(dataset) returns the number of examples in the dataset.""" return len(self.data) def __getitem__(self,i): """ dataset[i] returns the (i+1)-th example of the dataset. If the dataset has fields then a one-example dataset is returned (to be able to handle example.field accesses). 
""" if self.fields: if isinstance(i,slice): return ArrayDataSet(data=data[slice],fields=self.fields) return ArrayDataSet(data=self.data[i:i+1],fields=self.fields) else: return self.data[i] def __getslice__(self,*slice_args): """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.""" return ArrayDataSet(data=self.data[apply(slice,slice_args)],fields=self.fields) def __array__(self): """Return an view of this dataset which is an numpy.ndarray Numpy uses this special function name to retrieve an ndarray view for function such as numpy.sum, numpy.dot, numpy.asarray, etc. If this dataset has no fields, then we simply return self.data, otherwise things are complicated. - why do we want this behaviour when there are fields? (JB) """ if not self.fields: return self.data # else, select subsets of columns mapped by the fields columns_used = numpy.zeros((self.data.shape[1]),dtype=bool) for field_slice in self.fields.values(): for c in xrange(field_slice.start,field_slice.stop,field_slice.step): columns_used[c]=True # try to figure out if we can map all the slices into one slice: mappable_to_one_slice = True start=0 while start<len(columns_used) and not columns_used[start]: start+=1 stop=len(columns_used) while stop>0 and not columns_used[stop-1]: stop-=1 step=0 i=start while i<stop: j=i+1 while j<stop and not columns_used[j]: j+=1 if step: if step!=j-i: mappable_to_one_slice = False break else: step = j-i i=j if mappable_to_one_slice: return self.data[:,slice(start,stop,step)] # else make contiguous copy n_columns = sum(columns_used) result = zeros((len(self.data),n_columns)+self.data.shape[2:],self.data.dtype) print result.shape c=0 for field_slice in self.fields.values(): slice_width=field_slice.stop-field_slice.start/field_slice.step # copy the field here result[:,slice(c,slice_width)]=self.data[:,field_slice] c+=slice_width return result