pylearn: dataset.py @ 11:be128b9127c8

Debugged (to the extent of my tests) the new version of dataset

author:   bengioy@esprit.iro.umontreal.ca
date:     Wed, 26 Mar 2008 15:01:30 -0400
parents:  de616c423dbd
children: ff4e551490f1 813723310d75
class Example(object):
    """
    An example is something that is like a tuple but whose elements can be named, so that
    the following syntactic constructions work as one would expect:
       example.x = [1, 2, 3] # set a field
       x, y, z = example
       x = example[0]
       x = example["x"]
    """
    def __init__(self,names,values):
        assert len(values)==len(names)
        self.__dict__['values']=values
        self.__dict__['fields']={}
        for i in xrange(len(values)):
            self.fields[names[i]]=i

    def __getitem__(self,i):
        if isinstance(i,int):
            return self.values[i]
        else:
            return self.values[self.fields[i]]

    def __setitem__(self,i,value):
        if isinstance(i,int):
            self.values[i]=value
        else:
            self.values[self.fields[i]]=value

    def __getattr__(self,name):
        return self.values[self.fields[name]]

    def __setattr__(self,name,value):
        self.values[self.fields[name]]=value

    def __len__(self):
        return len(self.values)


class DataSet(object):
    """
    This is a virtual base class or interface for datasets.
    A dataset is basically an iterator over Examples (or anything that behaves like
    an Example). It does not necessarily have a fixed length (this is useful for
    'streams' which feed on-line learning). Datasets with fixed and known length
    are instances of FiniteDataSet, a subclass which supports indexing (dataset[i])
    and slicing (dataset[1000:2000]).
    To iterate over a subset of the fields, one should use the
    dataset.zip(field1, field2, field3, ...) method which returns an iterator
    over only the desired fields.
    Fields are not mutually exclusive, i.e. two fields can overlap in their actual content.
    The content of a field can be of any type, but often will be a numpy array.
    If one iterates through minibatches of examples (with the minibatches() method
    or with the minibatch_size argument of the zip() method), then the fields
    returned by the iterator's next method should be iterators over the
    individual values within the minibatch (typically these will be arrays
    with minibatch_size rows).
    """

    def __init__(self):
        pass

    def __iter__(self):
        """
        Return an iterator, whose next() method returns the next example or the next
        minibatch in the dataset. A minibatch (of length > 1) is also an example, but
        whose fields should be something one can iterate on again in order to obtain
        the individual examples.
        """
        raise NotImplementedError

    def zip(self,*fieldnames):
        """
        Return an iterator which sees only the specified fields (each fieldname is a
        field key, typically a string). The value returned at each iteration
        is a tuple with one element per field. Hence it can be used like this:
           for f1, f2, f3 in dataset.zip('field1','field2','field3'):
              ... use f1, f2, and f3
        """
        raise NotImplementedError

    def minibatches(self,minibatch_size,*fieldnames):
        """
        Similar to zip but iterate over minibatches.
        Return a minibatch iterator, whose next() method returns an 'example'
        whose fields are iterable objects (which can iterate over the individual
        values of that field in the minibatch).
        """
        raise NotImplementedError

    def fieldNames(self):
        """Return the list of field names in the examples of this dataset."""
        raise NotImplementedError

    def rename(self,*new_field_specifications):
        """
        Return a new dataset that maps old fields (of self) to new fields (of the returned
        dataset). The minimal syntax that should be supported is the following:
           new_field_specifications = [new_field_spec1, new_field_spec2, ...]
           new_field_spec = ([old_field1, old_field2, ...], new_field)
        In general both old_field and new_field should be strings, but some datasets may also
        support additional indexing schemes within each field (e.g. a column slice of a
        matrix-like field).
        """
        raise NotImplementedError
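
# Usage sketch for Example (hypothetical values, for illustration only):
#
#   e = Example(['x','y'], [[1,2,3], 4])
#   e.x       # attribute access by field name -> [1, 2, 3]
#   e['y']    # string indexing  -> 4
#   e[0]      # integer indexing -> [1, 2, 3]
#   x, y = e  # tuple-like unpacking (via __len__ and __getitem__)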
class FiniteDataSet(DataSet):
    """
    Virtual interface, a subclass of DataSet for datasets which have a finite, known length.
    Examples are indexed by an integer between 0 and self.length()-1,
    and a subdataset can be obtained by slicing. This may not be appropriate in general
    but only for datasets which can be thought of like ones that access rows AND fields
    in an efficient random access way. Users are encouraged to expect only the generic
    dataset interface in general. A FiniteDataSet is mainly useful when one has to obtain
    a subset of examples (e.g. for splitting a dataset into training and test sets).
    """

    def __init__(self):
        pass

    def __iter__(self):
        return FiniteDataSetIterator(self)

    def zip(self,*fieldnames):
        return FiniteDataSetIterator(self,1,fieldnames)

    def minibatches(self,minibatch_size,*fieldnames):
        return FiniteDataSetIterator(self,minibatch_size,fieldnames)

    def __getattr__(self,fieldname):
        """Return an object that can iterate over the values of the field in this dataset."""
        return self(fieldname)

    def __call__(self,*fieldnames):
        """Return a sub-dataset containing only the given fieldnames as fields."""
        raise NotImplementedError

    def __len__(self):
        """len(dataset) returns the number of examples in the dataset."""
        raise NotImplementedError

    def __getitem__(self,i):
        """dataset[i] returns the (i+1)-th example of the dataset."""
        raise NotImplementedError

    def __getslice__(self,*slice_args):
        """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1."""
        raise NotImplementedError


class FiniteDataSetIterator(object):
    """
    If the fieldnames list is empty, it means that we want to see ALL the fields.
    """
    def __init__(self,dataset,minibatch_size=1,fieldnames=[]):
        self.dataset=dataset
        self.minibatch_size=minibatch_size
        assert minibatch_size>=1 and minibatch_size<=len(dataset)
        self.current = -self.minibatch_size
        self.fieldnames = fieldnames

    def __iter__(self):
        return self

    def next(self):
        self.current+=self.minibatch_size
        if self.current>=len(self.dataset):
            self.current=-self.minibatch_size
            raise StopIteration
        if self.minibatch_size==1:
            complete_example=self.dataset[self.current]
        else:
            complete_example=self.dataset[self.current:self.current+self.minibatch_size]
        if self.fieldnames:
            # keep only the requested fields, in the requested order (field values
            # are exposed as attributes both on a single Example and on a
            # minibatch sub-dataset)
            return Example(self.fieldnames,
                           [getattr(complete_example,field) for field in self.fieldnames])
        else:
            return complete_example
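
# Intended iteration patterns (sketch; 'dataset' stands for any concrete
# FiniteDataSet, e.g. the ArrayDataSet defined below, and the field names
# are hypothetical):
#
#   for example in dataset:                        # all fields
#       ...
#   for x, y in dataset.zip('x','y'):              # only fields 'x' and 'y'
#       ...
#   for batch in dataset.minibatches(20,'x','y'):  # batch.x and batch.y
#       ...                                        # hold 20 values each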
# we may want ArrayDataSet defined in another python file

import numpy

class ArrayDataSet(FiniteDataSet):
    """
    An ArrayDataSet behaves like a numpy array but adds the notion of named fields
    from DataSet (and the ability to view multiple field values as an 'Example').
    It is a fixed-length and fixed-width dataset in which each element is a numpy
    array or a number, hence the whole dataset corresponds to a numpy array. Fields
    must correspond to a slice of array columns. If the dataset has fields,
    each 'example' is just a one-row ArrayDataSet, otherwise it is a numpy array.
    Any dataset can also be converted to a numpy array (losing the notion of fields)
    by the numpy.array(dataset) call.
    """

    def __init__(self,dataset=None,data=None,fields={}):
        """
        There are two ways to construct an ArrayDataSet: (1) from an
        existing dataset (which may result in a copy of the data in a numpy array),
        or (2) from a numpy.array (the data argument), along with an optional
        description of the fields (dictionary of column slices indexed by field names).
        """
        if dataset is not None:
            assert data is None and fields=={}
            # Make ONE big minibatch with all the examples, to separate the fields.
            n_examples=len(dataset)
            batch = dataset.minibatches(n_examples).next()
            # Each field of the underlying dataset must be convertible to a numpy
            # array of the same type (currently just double, but we should use the
            # smallest compatible dtype).
            n_fields = len(batch)
            fieldnames = batch.fields.keys()
            total_width = 0
            type = None
            for i in xrange(n_fields):
                field = numpy.array(batch[i])
                assert field.shape[0]==n_examples
                width = field.shape[1]
                start=total_width
                total_width += width
                fields[fieldnames[i]]=slice(start,total_width,1)
            # many complicated things remain to be done:
            #  - find common dtype
            #  - decide what to do with extra dimensions if not the same in all fields
            #  - try to see if we can avoid the copy?
            raise NotImplementedError
        if data is not None:
            assert dataset is None
            self.data=data
            self.fields=fields
            self.width = data.shape[1]
            for fieldname in fields:
                fieldslice=fields[fieldname]
                # make sure fieldslice.start and fieldslice.step are defined
                start=fieldslice.start
                step=fieldslice.step
                if not start:
                    start=0
                if not step:
                    step=1
                if not fieldslice.start or not fieldslice.step:
                    fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step)
                # and coherent with the data array
                assert fieldslice.start>=0 and fieldslice.stop<=self.width

    def __getattr__(self,fieldname):
        """
        Return a numpy array with the content associated with the given field name.
        If this is a one-example dataset, then a row, i.e., a numpy array (of one less
        dimension than the dataset itself) is returned.
        """
        if len(self.data)==1:
            return self.data[0,self.fields[fieldname]]
        return self.data[:,self.fields[fieldname]]

    def __call__(self,*fieldnames):
        """Return a sub-dataset containing only the given fieldnames as fields."""
        # find the range of columns spanned by the requested fields
        min_col=self.data.shape[1]
        max_col=0
        for fieldname in fieldnames:
            field_slice=self.fields[fieldname]
            min_col=min(min_col,field_slice.start)
            max_col=max(max_col,field_slice.stop)
        # re-express the requested field slices relative to the narrowed data block
        new_fields={}
        for fieldname in fieldnames:
            field_slice=self.fields[fieldname]
            new_fields[fieldname]=slice(field_slice.start-min_col,
                                        field_slice.stop-min_col,
                                        field_slice.step)
        return ArrayDataSet(data=self.data[:,min_col:max_col],fields=new_fields)

    def fieldNames(self):
        """Return the list of field names that are supported by getattr and getFields."""
        return self.fields.keys()

    def __len__(self):
        """len(dataset) returns the number of examples in the dataset."""
        return len(self.data)

    def __getitem__(self,i):
        """
        dataset[i] returns the (i+1)-th Example of the dataset. If there are no
        fields, the result is just a numpy array (for the i-th row of the dataset
        data matrix).
        """
        if self.fields:
            fieldnames,fieldslices=zip(*self.fields.items())
            return Example(fieldnames,[self.data[i,fieldslice] for fieldslice in fieldslices])
        else:
            return self.data[i]

    def __getslice__(self,*slice_args):
        """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1."""
        return ArrayDataSet(data=self.data[apply(slice,slice_args)],fields=self.fields)

    def __array__(self):
        if not self.fields:
            return self.data
        # else, select subsets of columns mapped by the fields
        columns_used = numpy.zeros((self.data.shape[1]),dtype=bool)
        for field_slice in self.fields.values():
            for c in xrange(field_slice.start,field_slice.stop,field_slice.step):
                columns_used[c]=True
        # try to figure out if we can map all the slices into one slice:
        mappable_to_one_slice = True
        start=0
        while start<len(columns_used) and not columns_used[start]:
            start+=1
        stop=len(columns_used)
        while stop>0 and not columns_used[stop-1]:
            stop-=1
        step=0
        i=start
        while i<stop:
            j=i+1
            while j<stop and not columns_used[j]:
                j+=1
            if step:
                if step!=j-i:
                    mappable_to_one_slice = False
                    break
            else:
                step = j-i
            i=j
        if mappable_to_one_slice:
            return self.data[:,slice(start,stop,step)]
        # else make a contiguous copy of the used columns
        n_columns = sum(columns_used)
        result = numpy.zeros((len(self.data),n_columns)+self.data.shape[2:],self.data.dtype)
        c=0
        for field_slice in self.fields.values():
            # number of columns spanned by this field (accounting for the step)
            slice_width=(field_slice.stop-field_slice.start)/field_slice.step
            # copy the field into the contiguous block starting at column c
            result[:,c:c+slice_width]=self.data[:,field_slice]
            c+=slice_width
        return result