Mercurial > pylearn
diff dataset.py @ 17:759d17112b23
more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
author | bergstrj@iro.umontreal.ca |
---|---|
date | Wed, 26 Mar 2008 21:05:14 -0400 |
parents | 813723310d75 ff4e551490f1 |
children | 57f4015e2e09 |
line wrap: on
line diff
--- a/dataset.py Wed Mar 26 18:23:44 2008 -0400 +++ b/dataset.py Wed Mar 26 21:05:14 2008 -0400 @@ -1,42 +1,9 @@ -class Example(object): - """ - An example is something that is like a tuple but whose elements can be named, to that - following syntactic constructions work as one would expect: - example.x = [1, 2, 3] # set a field - x, y, z = example - x = example[0] - x = example["x"] - """ - def __init__(self,names,values): - assert len(values)==len(names) - self.__dict__['values']=values - self.__dict__['fields']={} - for i in xrange(len(values)): - self.fields[names[i]]=i - - def __getitem__(self,i): - if isinstance(i,int): - return self.values[i] - else: - return self.values[self.fields[i]] - - def __setitem__(self,i,value): - if isinstance(i,int): - self.values[i]=value - else: - self.values[self.fields[i]]=value +from lookup_list import LookupList +Example = LookupList - def __getattr__(self,name): - return self.values[self.fields[name]] - - def __setattr__(self,name,value): - self.values[self.fields[name]]=value - - def __len__(self): - return len(self.values) - - +class AbstractFunction (Exception): """Derived class must override this function""" + class DataSet(object): """A virtual base class for datasets. @@ -73,7 +40,8 @@ i[identifier], but the derived class is free to accept any type of identifier, and add extra functionality to the iterator. """ - raise NotImplementedError + for i in self.minibatches( minibatch_size = 1): + yield Example(i.keys(), [v[0] for v in i.values()]) def zip(self, *fieldnames): """ @@ -93,55 +61,66 @@ The derived class may accept fieldname arguments of any type. """ - raise NotImplementedError + for i in self.minibatches(fieldnames, minibatch_size = 1): + yield [f[0] for f in i] - def minibatches(self,minibatch_size,*fieldnames): + minibatches_fieldnames = None + minibatches_minibatch_size = 1 + minibatches_n_batches = None + def minibatches(self, + fieldnames = minibatches_fieldnames, + minibatch_size = minibatches_minibatch_size, + n_batches = minibatches_n_batches): """ Supports two forms of syntax: - for i in dataset.zip(f1, f2, f3): ... + for i in dataset.minibatches([f1, f2, f3],**kwargs): ... - for i1, i2, i3 in dataset.zip(f1, f2, f3): ... + for i1, i2, i3 in dataset.minibatches([f1, f2, f3],**kwargs): ... Using the first syntax, "i" will be an indexable object, such as a list, - tuple, or Example instance, such that on every iteration, i[0] is the f1 - field of the current example, i[1] is the f2 field, and so on. - - Using the second syntax, i1, i2, i3 will contain the the contents of the - f1, f2, and f3 fields of a single example on each loop iteration. - - The derived class may accept fieldname arguments of any type. + tuple, or Example instance, such that on every iteration, i[0] is a + list-like container of the f1 field of a batch current examples, i[1] is + a list-like container of the f2 field, etc. - Return an iterator, whose next() method returns the next example or the next - minibatch in the dataset. A minibatch (of length > 1) is also an example, but - whose fields should be something one can iterate on again in order to obtain - the individual examples. + Using the second syntax, i1, i2, i3 will be list-like containers of the + f1, f2, and f3 fields of a batch of examples on each loop iteration. - DataSet.zip returns an iterator over only the desired fields, and each field - of the iterator contains one example. + PARAMETERS + - fieldnames (list of any type, default None): + The loop variables i1, i2, i3 (in the example above) should contain the + f1, f2, and f3 fields of the current batch of examples. If None, the + derived class can choose a default, e.g. all fields. - Return an iterator which sees only the specified fields (each fieldname is a - field key, typically a string). The value returned at each iteration - is a tuple with one element per field. Hence it can be used like this: - for f1, f2, f3 in dataset.zip('field1','field2','field3'): - ... use f1, f2, and f3 - If one iterates through minibatches of examples (with the minibatches() method - or with the minibatch_size argument of the zip() method), then the fields - returned by the iterator's next method should be iterators over the - individual values within the minibatch (typically these will be arrays - with minibatch_size rows). - Similar to zip but iterate over minibatches. - Return a minibatch iterator, whose next() method returns an 'example' - whose fields are iteratable objects (which can iterate over the individual - values of that field in the minibatch). + - minibatch_size (integer, default 1) + On every iteration, the variables i1, i2, i3 will have + exactly minibatch_size elements. e.g. len(i1) == minibatch_size + + - n_batches (integer, default None) + The iterator will loop exactly this many times, and then stop. If None, + the derived class can choose a default. If (-1), then the returned + iterator should support looping indefinitely. + + Note: A list-like container is something like a tuple, list, numpy.ndarray or + any other object that supports integer indexing and slicing. + """ - raise NotImplementedError + raise AbstractFunction() def fieldNames(self): + #Yoshua- + # This list may not be finite; what would make sense in the use you have + # in mind? + # -JB """Return the list of field names in the examples of this dataset.""" - raise NotImplementedError + raise AbstractFunction() def rename(*new_field_specifications): + #Yoshua- + # Do you mean for this to be a virtual method? + # Wouldn't this functionality be easier to provide via a + # RenamingDataSet, such as the one I've written below? + # -JB """ Return a new dataset that maps old fields (of self) to new fields (of the returned dataset). The minimal syntax that should be supported is the following: @@ -151,7 +130,31 @@ support additional indexing schemes within each field (e.g. column slice of a matrix-like field). """ - raise NotImplementedError + raise AbstractFunction() + +class RenamingDataSet(DataSet): + """A DataSet that wraps another one, and makes it look like the field names + are different + + Renaming is done by a dictionary that maps new names to the old ones used in + self.src. + """ + def __init__(self, src, rename_dct): + DataSet.__init__(self) + self.src = src + self.rename_dct = copy.copy(rename_dct) + + def minibatches(self, + fieldnames = DataSet.minibatches_fieldnames, + minibatch_size = DataSet.minibatches_minibatch_size, + n_batches = DataSet.minibatches_n_batches): + dct = self.rename_dct + new_fieldnames = [dct.get(f, f) for f in fieldnames] + return self.src.minibatches(new_fieldnames, minibatches_size, n_batches) + + def fieldNames(self): + return [dct.get(f, f) for f in self.src.fieldNames()] + class FiniteDataSet(DataSet): """ @@ -164,17 +167,51 @@ a subset of examples (e.g. for splitting a dataset into training and test sets). """ + class FiniteDataSetIterator(object): + """ + If the fieldnames list is empty, it means that we want to see ALL the fields. + """ + def __init__(self,dataset,minibatch_size=1,fieldnames=[]): + self.dataset=dataset + self.minibatch_size=minibatch_size + assert minibatch_size>=1 and minibatch_size<=len(dataset) + self.current = -self.minibatch_size + self.fieldnames = fieldnames + + def __iter__(self): + return self + + def next(self): + self.current+=self.minibatch_size + if self.current>=len(self.dataset): + self.current=-self.minibatch_size + raise StopIteration + if self.minibatch_size==1: + complete_example=self.dataset[self.current] + else: + complete_example=self.dataset[self.current:self.current+self.minibatch_size] + if self.fieldnames: + return Example(self.fieldnames,list(complete_example)) + else: + return complete_example + def __init__(self): pass - def __iter__(self): - return FiniteDataSetIterator(self) - - def zip(self,*fieldnames): - return FiniteDataSetIterator(self,1,fieldnames) + def minibatches(self, + fieldnames = DataSet.minibatches_fieldnames, + minibatch_size = DataSet.minibatches_minibatch_size, + n_batches = DataSet.minibatches_n_batches): + """ + If the fieldnames list is empty, it means that we want to see ALL the fields. - def minibatches(self,minibatch_size,*fieldnames): - return FiniteDataSetIterator(self,minibatch_size,fieldnames) + If the n_batches is empty, we want to see all the examples possible + for the give minibatch_size. + """ + # substitute the defaults: + if fieldnames is None: fieldnames = self.fieldNames() + if n_batches is None: n_batches = len(self) / minibatch_size + return DataSet.Iterator(self, fieldnames, minibatch_size, n_batches) def __getattr__(self,fieldname): """Return an that can iterate over the values of the field in this dataset.""" @@ -186,53 +223,57 @@ The return value's default iterator will iterate only over the given fields. """ - raise NotImplementedError + raise AbstractFunction() def __len__(self): """len(dataset) returns the number of examples in the dataset.""" - raise NotImplementedError + raise AbstractFunction() def __getitem__(self,i): """dataset[i] returns the (i+1)-th example of the dataset.""" - raise NotImplementedError + raise AbstractFunction() def __getslice__(self,*slice_args): """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.""" - raise NotImplementedError - -class FiniteDataSetIterator(object): - """ - If the fieldnames list is empty, it means that we want to see ALL the fields. - """ - def __init__(self,dataset,minibatch_size=1,fieldnames=[]): - self.dataset=dataset - self.minibatch_size=minibatch_size - assert minibatch_size>=1 and minibatch_size<=len(dataset) - self.current = -self.minibatch_size - self.fieldnames = fieldnames - - def __iter__(self): - return self - - def next(self): - self.current+=self.minibatch_size - if self.current>=len(self.dataset): - self.current=-self.minibatch_size - raise StopIteration - if self.minibatch_size==1: - complete_example=self.dataset[self.current] - else: - complete_example=self.dataset[self.current:self.current+self.minibatch_size] - if self.fieldnames: - return Example(self.fieldnames,list(complete_example)) - else: - return complete_example - + raise AbstractFunction() # we may want ArrayDataSet defined in another python file import numpy +def as_array_dataset(dataset): + # Generally datasets can be efficient by making data fields overlap, but + # this function doesn't know which fields overlap. So, it should check if + # dataset supports an as_array_dataset member function, and return that if + # possible. + if hasattr(dataset, 'as_array_dataset'): + return dataset.as_array_dataset() + + raise NotImplementedError() + + # Make ONE big minibatch with all the examples, to separate the fields. + n_examples = len(dataset) + batch = dataset.minibatches( minibatch_size = len(dataset)).next() + + # Each field of the underlying dataset must be convertible to a numpy array of the same type + # currently just double, but should use the smallest compatible dtype + n_fields = len(batch) + fieldnames = batch.fields.keys() + total_width = 0 + type = None + fields = LookupList() + for i in xrange(n_fields): + field = array(batch[i]) + assert field.shape[0]==n_examples + width = field.shape[1] + start=total_width + total_width += width + fields[fieldnames[i]]=slice(start,total_width,1) + # many complicated things remain to be done: + # - find common dtype + # - decide what to do with extra dimensions if not the same in all fields + # - try to see if we can avoid the copy? + class ArrayDataSet(FiniteDataSet): """ An ArrayDataSet behaves like a numpy array but adds the notion of named fields @@ -246,43 +287,79 @@ by the numpy.array(dataset) call. """ - def __init__(self,dataset=None,data=None,fields={}): + class Iterator(object): + """An iterator over a finite dataset that implements wrap-around""" + def __init__(self, dataset, fieldnames, minibatch_size, next_max): + self.dataset=dataset + self.fieldnames = fieldnames + self.minibatch_size=minibatch_size + self.next_count = 0 + self.next_max = next_max + self.current = -self.minibatch_size + assert minibatch_size > 0 + if minibatch_size >= len(dataset): + raise NotImplementedError() + + def __iter__(self): + #Why do we do this? -JB + return self + + @staticmethod + def matcat(a, b): + a0, a1 = a.shape + b0, b1 = b.shape + assert a1 == b1 + assert a.dtype is b.dtype + rval = numpy.empty( (a0 + b0, a1), dtype=a.dtype) + rval[:a0,:] = a + rval[a0:,:] = b + return rval + + def next(self): + + #check for end-of-loop + self.next_count += 1 + if self.next_count == self.next_max: + raise StopIteration + + #determine the first and last elements of the slice we'll return + self.current += self.minibatch_size + if self.current >= len(self.dataset): + self.current -= len(self.dataset) + upper = self.current + self.minibatch_size + + if upper <= len(self.dataset): + #this is the easy case, we only need once slice + dataview = self.dataset.data[self.current:upper] + else: + # the minibatch wraps around the end of the dataset + dataview = self.dataset.data[self.current:] + upper -= len(self.dataset) + assert upper > 0 + dataview = self.matcat(dataview, self.dataset.data[:upper]) + + + rval = [dataview[:, self.dataset.fields[f]] for f in self.fieldnames] + + if self.fieldnames: + rval = Example(self.fieldnames, rval) + + return rval + + + def __init__(self, data, fields=None): """ There are two ways to construct an ArrayDataSet: (1) from an existing dataset (which may result in a copy of the data in a numpy array), or (2) from a numpy.array (the data argument), along with an optional description - of the fields (dictionary of column slices indexed by field names). + of the fields (a LookupList of column slices indexed by field names). """ - if dataset!=None: - assert data==None and fields=={} - # Make ONE big minibatch with all the examples, to separate the fields. - n_examples=len(dataset) - batch = dataset.minibatches(n_examples).next() - # Each field of the underlying dataset must be convertible to a numpy array of the same type - # currently just double, but should use the smallest compatible dtype - n_fields = len(batch) - fieldnames = batch.fields.keys() - total_width = 0 - type = None - for i in xrange(n_fields): - field = array(batch[i]) - assert field.shape[0]==n_examples - width = field.shape[1] - start=total_width - total_width += width - fields[fieldnames[i]]=slice(start,total_width,1) - # many complicated things remain to be done: - # - find common dtype - # - decide what to do with extra dimensions if not the same in all fields - # - try to see if we can avoid the copy? - raise NotImplementedError - if data!=None: - assert dataset==None - self.data=data - self.fields=fields - self.width = data.shape[1] - for fieldname in fields: - fieldslice=fields[fieldname] + self.data=data + self.fields=fields + rows, cols = data.shape + + if fields: + for fieldname,fieldslice in fields.items(): # make sure fieldslice.start and fieldslice.step are defined start=fieldslice.start step=fieldslice.step @@ -293,7 +370,22 @@ if not fieldslice.start or not fieldslice.step: fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step) # and coherent with the data array - assert fieldslice.start>=0 and fieldslice.stop<=self.width + assert fieldslice.start >= 0 and fieldslice.stop <= cols + + def minibatches(self, + fieldnames = DataSet.minibatches_fieldnames, + minibatch_size = DataSet.minibatches_minibatch_size, + n_batches = DataSet.minibatches_n_batches): + """ + If the fieldnames list is empty, it means that we want to see ALL the fields. + + If the n_batches is empty, we want to see all the examples possible + for the give minibatch_size. + """ + # substitute the defaults: + if fieldnames is None: fieldnames = self.fieldNames() + if n_batches is None: n_batches = len(self) / minibatch_size + return ArrayDataSet.Iterator(self, fieldnames, minibatch_size, n_batches) def __getattr__(self,fieldname): """ @@ -312,10 +404,10 @@ for field_slice in self.fields.values(): min_col=min(min_col,field_slice.start) max_col=max(max_col,field_slice.stop) - new_fields={} - for field in self.fields: - new_fields[field[0]]=slice(field[1].start-min_col,field[1].stop-min_col,field[1].step) - return ArrayDataSet(data=self.data[:,min_col:max_col],fields=new_fields) + new_fields=LookupList() + for fieldname,fieldslice in self.fields.items(): + new_fields[fieldname]=slice(fieldslice.start-min_col,fieldslice.stop-min_col,fieldslice.step) + return ArrayDataSet(self.data[:,min_col:max_col],fields=new_fields) def fieldNames(self): """Return the list of field names that are supported by getattr and getFields.""" @@ -332,13 +424,13 @@ """ if self.fields: fieldnames,fieldslices=zip(*self.fields.items()) - return Example(fieldnames,[self.data[i,fieldslice] for fieldslice in fieldslices]) + return Example(self.fields.keys(),[self.data[i,fieldslice] for fieldslice in self.fields.values()]) else: return self.data[i] - def __getslice__(self,*slice_args): + def __getslice__(self,*args): """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.""" - return ArrayDataSet(data=self.data[apply(slice,slice_args)],fields=self.fields) + return ArrayDataSet(self.data.__getslice__(*args), fields=self.fields) def __array__(self): """Return an view of this dataset which is an numpy.ndarray