pylearn: diff dataset.py @ 22:b6b36f65664f
Created virtual sub-classes of DataSet: {Finite{Length,Width},Sliceable}DataSet;
removed the .field ability from LookupList (because of setattr problems); removed
fieldNames() from DataSet (it remains in FiniteWidthDataSet, where it makes sense)
and added hasFields() instead. Fixed problems in asarray, and tested the
previous functionality in _test_dataset.py; the new functionality is not yet tested.
author: bengioy@esprit.iro.umontreal.ca
date: Mon, 07 Apr 2008 20:44:37 -0400
parents: 266c68cb6136
children: 526e192b0699
line diff
--- a/dataset.py	Mon Apr 07 19:32:52 2008 -0400
+++ b/dataset.py	Mon Apr 07 20:44:37 2008 -0400
@@ -10,7 +10,7 @@
     A DataSet is a generator of iterators; these iterators can run through the
     examples in a variety of ways.  A DataSet need not necessarily have a finite
     or known length, so this class can be used to interface to a 'stream' which
-    feeds on-line learning.
+    feeds on-line learning.
 
     To iterate over examples, there are several possibilities:
     - for example in dataset.zip([field1, field2,field3, ...])
@@ -19,29 +19,49 @@
     - for example in dataset
     Each of these is documented below.
-    Note: For a dataset of fixed and known length, which can implement item
-    random-access efficiently (e.g. indexing and slicing), and which can profit
-    from the FiniteDataSetIterator, consider using base class FiniteDataSet.
-
     Note: Fields are not mutually exclusive, i.e. two fields can
     overlap in their actual content.
 
     Note: The content of a field can be of any type.
 
+    Note: A dataset can recognize a potentially infinite number of field names (i.e. the field
+    values can be computed on-demand, when particular field names are used in one of the
+    iterators).
+
+    Datasets of finite length should be sub-classes of FiniteLengthDataSet.
+
+    Datasets whose elements can be indexed and sub-datasets of consecutive
+    examples (i.e. slices) can be extracted from should be sub-classes of
+    SliceableDataSet.
+
+    Datasets with a finite number of fields should be sub-classes of
+    FiniteWidthDataSet.
     """
 
     def __init__(self):
         pass
 
+    class Iter(LookupList):
+        def __init__(self, ll):
+            LookupList.__init__(self, ll.keys(), ll.values())
+            self.ll = ll
+        def __iter__(self): #makes for loop work
+            return self
+        def next(self):
+            self.ll.next()
+            self._values = [v[0] for v in self.ll._values]
+            return self
+
     def __iter__(self):
         """Supports the syntax "for i in dataset: ..."
 
         Using this syntax, "i" will be an Example instance (or equivalent) with
         all the fields of DataSet self.  Every field of "i" will give access to
         a field of a single example.  Fields should be accessible via
-        i["fielname"] or i[3] (in the fieldNames() order), but the derived class is free
+        i["fielname"] or i[3] (in the order defined by the elements of the
+        Example returned by this iterator), but the derived class is free
         to accept any type of identifier, and add extra functionality to the iterator.
         """
-        return self.zip(*self.fieldNames())
+        return DataSet.Iter(self.minibatches(None, minibatch_size = 1))
 
     def zip(self, *fieldnames):
         """
@@ -61,17 +81,7 @@
 
         The derived class may accept fieldname arguments of any type.
         """
-        class Iter(LookupList):
-            def __init__(self, ll):
-                LookupList.__init__(self, ll.keys(), ll.values())
-                self.ll = ll
-            def __iter__(self): #makes for loop work
-                return self
-            def next(self):
-                self.ll.next()
-                self._values = [v[0] for v in self.ll._values]
-                return self
-        return Iter(self.minibatches(fieldnames, minibatch_size = 1))
+        return DataSet.Iter(self.minibatches(fieldnames, minibatch_size = 1))
 
     minibatches_fieldnames = None
     minibatches_minibatch_size = 1
@@ -81,18 +91,25 @@
                     minibatch_size = minibatches_minibatch_size,
                     n_batches = minibatches_n_batches):
         """
-        Supports two forms of syntax:
+        Supports three forms of syntax:
+
+            for i in dataset.minibatches(None,**kwargs): ...
 
             for i in dataset.minibatches([f1, f2, f3],**kwargs): ...
 
            for i1, i2, i3 in dataset.minibatches([f1, f2, f3],**kwargs): ...
 
-        Using the first syntax, "i" will be an indexable object, such as a list,
-        tuple, or Example instance, such that on every iteration, i[0] is a
+        Using the first two syntaxes, "i" will be an indexable object, such as a list,
+        tuple, or Example instance. In both cases, i[k] is a list-like container
+        of a batch of current examples. In the second case, i[0] is
         list-like container of the f1 field of a batch current examples, i[1] is
         a list-like container of the f2 field, etc.
 
-        Using the second syntax, i1, i2, i3 will be list-like containers of the
+        Using the first syntax, all the fields will be returned in "i".
+        Beware that some datasets may not support this syntax, if the number
+        of fields is infinite (i.e. field values may be computed "on demand").
+
+        Using the third syntax, i1, i2, i3 will be list-like containers of the
         f1, f2, and f3 fields of a batch of examples on each loop iteration.
 
         PARAMETERS
@@ -115,35 +132,15 @@
         """
         raise AbstractFunction()
-
-    def fieldNames(self):
-        #Yoshua-
-        # This list may not be finite; what would make sense in the use you have
-        # in mind?
-        # -JB
-        #James-
-        # You are right. I had put this to be able to iterate over the fields
-        # but maybe an iterator mechanism (over fields rather than examples)
-        # would be more appropriate. Fieldnames are needed in general
-        # by the iterators over examples or minibatches, to construct
-        # examples or minibatches with the corresponding names as attributes.
-        # -YB
+
+    def hasFields(*fieldnames):
         """
-        Return an iterator (an object with an __iter__ method) that
-        iterates over the names of the fields. As a special cases,
-        a list or a tuple of field names can be returned.
-        """"
-        # Note that some datasets
-        # may have virtual fields and support a virtually infinite number
-        # of possible field names. In that case, fieldNames() should
-        # either raise an error or iterate over a particular set of
-        # names as appropriate. Another option would be to iterate
-        # over the sub-datasets comprising a single field at a time.
-        # I am not sure yet what is most appropriate.
-        # -YB
+        Return true if the given field name (or field names, if multiple arguments are
+        given) is recognized by the DataSet (i.e. can be used as a field name in one
+        of the iterators).
         """
         raise AbstractFunction()
-
+
     def rename(*new_field_specifications):
         #Yoshua-
         # Do you mean for this to be a virtual method?
@@ -165,7 +162,7 @@
 
         raise AbstractFunction()
 
-    def apply_function(function, input_fields, output_fields, copy_inputs=True, accept_minibatches=True, cache=True):
+    def applyFunction(function, input_fields, output_fields, copy_inputs=True, accept_minibatches=True, cache=True):
         """
         Return a dataset that contains as fields the results of applying
        the given function (example-wise) to the specified input_fields. The
@@ -202,85 +199,43 @@
             new_fieldnames = [dct.get(f, f) for f in fieldnames]
             return self.src.minibatches(new_fieldnames, minibatches_size, n_batches)
 
-        def fieldNames(self):
-            return [dct.get(f, f) for f in self.src.fieldNames()]
-
-
-class FiniteDataSet(DataSet):
+class FiniteLengthDataSet(DataSet):
     """
-    Virtual interface, a subclass of DataSet for datasets which have a finite, known length.
-    Examples are indexed by an integer between 0 and self.length()-1,
-    and a subdataset can be obtained by slicing. This may not be appropriate in general
-    but only for datasets which can be thought of like ones that access rows AND fields
-    in an efficient random access way. Users are encouraged to expect only the generic dataset
-    interface in general. A FiniteDataSet is mainly useful when one has to obtain
-    a subset of examples (e.g. for splitting a dataset into training and test sets).
+    Virtual interface for datasets that have a finite length (number of examples),
+    and thus recognize a len(dataset) call.
     """
+    def __init__(self):
+        DataSet.__init__(self)
 
-    class FiniteDataSetIterator(object):
-        """
-        If the fieldnames list is empty, it means that we want to see ALL the fields.
-        """
-        def __init__(self,dataset,minibatch_size=1,fieldnames=[]):
-            self.dataset=dataset
-            self.minibatch_size=minibatch_size
-            assert minibatch_size>=1 and minibatch_size<=len(dataset)
-            self.current = -self.minibatch_size
-            self.fieldnames = fieldnames
-            if len(dataset) % minibatch_size:
-                raise NotImplementedError()
-
-        def __iter__(self):
-            return self
+    def __len__(self):
+        """len(dataset) returns the number of examples in the dataset."""
+        raise AbstractFunction()
+
+
+class SliceableDataSet(DataSet):
+    """
+    Virtual interface, a subclass of DataSet for datasets which are sliceable
+    and whose individual elements can be accessed, generally respecting the
+    python semantics for [spec], where spec is either a non-negative integer
+    (for selecting one example), or a python slice (for selecting a sub-dataset
+    comprising the specified examples). This is useful for obtaining
+    sub-datasets, e.g. for splitting a dataset into training and test sets.
+    """
+    def __init__(self):
+        DataSet.__init__(self)
 
-        def next(self):
-            self.current+=self.minibatch_size
-            if self.current>=len(self.dataset):
-                self.current=-self.minibatch_size
-                raise StopIteration
-            if self.minibatch_size==1:
-                complete_example=self.dataset[self.current]
-            else:
-                complete_example=self.dataset[self.current:self.current+self.minibatch_size]
-            if self.fieldnames:
-                return Example(self.fieldnames,list(complete_example))
-            else:
-                return complete_example
-
-    def __init__(self):
-        pass
-
     def minibatches(self,
                     fieldnames = DataSet.minibatches_fieldnames,
                     minibatch_size = DataSet.minibatches_minibatch_size,
                     n_batches = DataSet.minibatches_n_batches):
         """
-        If the fieldnames list is empty, it means that we want to see ALL the fields.
-
         If the n_batches is empty, we want to see all the examples possible
-        for the give minibatch_size.
+        for the given minibatch_size (possibly missing a few at the end of the dataset).
         """
         # substitute the defaults:
-        if fieldnames is None: fieldnames = self.fieldNames()
         if n_batches is None: n_batches = len(self) / minibatch_size
         return DataSet.Iterator(self, fieldnames, minibatch_size, n_batches)
 
-    def __getattr__(self,fieldname):
-        """Return an that can iterate over the values of the field in this dataset."""
-        return self(fieldname)
-
-    def __call__(self,*fieldnames):
-        """Return a sub-dataset containing only the given fieldnames as fields.
-
-        The return value's default iterator will iterate only over the given
-        fields.
-        """
-        raise AbstractFunction()
-
-    def __len__(self):
-        """len(dataset) returns the number of examples in the dataset."""
-        raise AbstractFunction()
-
     def __getitem__(self,i):
         """dataset[i] returns the (i+1)-th example of the dataset."""
         raise AbstractFunction()
@@ -289,6 +244,28 @@
         """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1."""
         raise AbstractFunction()
 
+
+class FiniteWidthDataSet(DataSet):
+    """
+    Virtual interface for datasets that have a finite width (number of fields),
+    and thus return a list of fieldNames.
+    """
+    def __init__(self):
+        DataSet.__init__(self)
+
+    def hasFields(*fieldnames):
+        has_fields=True
+        for fieldname in fieldnames:
+            if fieldname not in self.fields.keys():
+                has_fields=False
+        return has_fields
+
+    def fieldNames(self):
+        """Return the list of field names that are supported by the iterators,
+        and for which hasFields(fieldname) would return True."""
+        raise AbstractFunction()
+
+
 # we may want ArrayDataSet defined in another python file
 
 import numpy
@@ -326,7 +303,7 @@
 # - decide what to do with extra dimensions if not the same in all fields
 # - try to see if we can avoid the copy?
 
-class ArrayDataSet(FiniteDataSet):
+class ArrayDataSet(FiniteLengthDataSet,FiniteWidthDataSet,SliceableDataSet):
     """
     An ArrayDataSet behaves like a numpy array but adds the notion of named fields
     from DataSet (and the ability to view the values of multiple fields as an 'Example').
@@ -342,7 +319,8 @@
     class Iterator(LookupList):
         """An iterator over a finite dataset that implements wrap-around"""
         def __init__(self, dataset, fieldnames, minibatch_size, next_max):
-            LookupList.__init__(self, fieldnames, [0] * len(fieldnames))
+            if fieldnames is None: fieldnames = dataset.fieldNames()
+            LookupList.__init__(self, fieldnames, [0]*len(fieldnames))
             self.dataset=dataset
             self.minibatch_size=minibatch_size
             self.next_count = 0
@@ -392,10 +370,8 @@
                 assert upper > 0
                 dataview = self.matcat(dataview, data[:upper])
-
             self._values = [dataview[:, self.dataset.fields[f]]\
                             for f in self._names]
-
             return self
 
@@ -429,13 +405,12 @@
                     minibatch_size = DataSet.minibatches_minibatch_size,
                     n_batches = DataSet.minibatches_n_batches):
         """
-        If the fieldnames list is empty, it means that we want to see ALL the fields.
+        If the fieldnames list is None, it means that we want to see ALL the fields.
 
-        If the n_batches is empty, we want to see all the examples possible
-        for the give minibatch_size.
+        If the n_batches is None, we want to see all the examples possible
+        for the given minibatch_size (possibly missing some near the end).
         """
         # substitute the defaults:
-        if fieldnames is None: fieldnames = self.fieldNames()
         if n_batches is None: n_batches = len(self) / minibatch_size
         return ArrayDataSet.Iterator(self, fieldnames, minibatch_size, n_batches)
@@ -462,7 +437,7 @@
         return ArrayDataSet(self.data[:,min_col:max_col],fields=new_fields)
 
     def fieldNames(self):
-        """Return the list of field names that are supported by getattr and getFields."""
+        """Return the list of field names that are supported by getattr and hasField."""
         return self.fields.keys()
 
     def __len__(self):
@@ -502,45 +477,48 @@
             return self.data
         # else, select subsets of columns mapped by the fields
         columns_used = numpy.zeros((self.data.shape[1]),dtype=bool)
+        overlapping_fields = False
+        n_columns = 0
         for field_slice in self.fields.values():
             for c in xrange(field_slice.start,field_slice.stop,field_slice.step):
+                n_columns += 1
+                if columns_used[c]: overlapping_fields=True
                 columns_used[c]=True
         # try to figure out if we can map all the slices into one slice:
-        mappable_to_one_slice = True
-        start=0
-        while start<len(columns_used) and not columns_used[start]:
-            start+=1
-        stop=len(columns_used)
-        while stop>0 and not columns_used[stop-1]:
-            stop-=1
-        step=0
-        i=start
-        while i<stop:
-            j=i+1
-            while j<stop and not columns_used[j]:
-                j+=1
-            if step:
-                if step!=j-i:
-                    mappable_to_one_slice = False
-                    break
-            else:
-                step = j-i
-            i=j
+        mappable_to_one_slice = not overlapping_fields
+        if not overlapping_fields:
+            start=0
+            while start<len(columns_used) and not columns_used[start]:
+                start+=1
+            stop=len(columns_used)
+            while stop>0 and not columns_used[stop-1]:
+                stop-=1
+            step=0
+            i=start
+            while i<stop:
+                j=i+1
+                while j<stop and not columns_used[j]:
+                    j+=1
+                if step:
+                    if step!=j-i:
+                        mappable_to_one_slice = False
+                        break
+                else:
+                    step = j-i
+                i=j
         if mappable_to_one_slice:
             return self.data[:,slice(start,stop,step)]
-        # else make contiguous copy
-        n_columns = sum(columns_used)
-        result = zeros((len(self.data),n_columns)+self.data.shape[2:],self.data.dtype)
-        print result.shape
+        # else make contiguous copy (copying the overlapping columns)
+        result = numpy.zeros((len(self.data),n_columns)+self.data.shape[2:],self.data.dtype)
        c=0
         for field_slice in self.fields.values():
-            slice_width=field_slice.stop-field_slice.start/field_slice.step
+            slice_width=(field_slice.stop-field_slice.start)/field_slice.step
             # copy the field here
-            result[:,slice(c,slice_width)]=self.data[:,field_slice]
+            result[:,slice(c,c+slice_width)]=self.data[:,field_slice]
             c+=slice_width
         return result
 
-class ApplyFunctionDataset(DataSet):
+class ApplyFunctionDataSet(DataSet):
     """
     A dataset that contains as fields the results of applying a given function
     (example-wise) to specified input_fields of a source
@@ -583,17 +561,17 @@
                     fieldnames = DataSet.minibatches_fieldnames,
                     minibatch_size = DataSet.minibatches_minibatch_size,
                     n_batches = DataSet.minibatches_n_batches):
-
+
         class Iterator(LookupList):
 
            def __init__(self,dataset):
-                LookupList.__init__(self, fieldnames, [0]*len(fieldnames))
+                if fieldnames is None:
+                    LookupList.__init__(self, [],[])
+                else:
+                    LookupList.__init__(self, fieldnames, [0]*len(fieldnames))
                 self.dataset=dataset
-                if dataset.copy_inputs:
-                    src_fields=dataset.fieldNames()
-                else:
-                    src_fields=dataset.input_fields
-                self.src_iterator=self.src.minibatches(src_fields,minibatch_size,n_batches)
+                self.src_iterator=self.src.minibatches(list(set.union(set(fieldnames),set(self.dataset.input_fields))),
+                                                       minibatch_size,n_batches)
 
             def __iter__(self):
                 return self
@@ -603,12 +581,15 @@
                 if self.dataset.copy_inputs:
                     function_inputs = src_examples
                 else:
-                    function_inputs = 
-                        [src_examples[field_name] for field_name in self.dataset.input_fields])
-                return self.dataset.function(*function_inputs)
+                    function_inputs = [src_examples[field_name] for field_name in self.dataset.input_fields]
+                outputs = Example(self.dataset.output_fields,self.dataset.function(*function_inputs))
+                if self.dataset.copy_inputs:
+                    return src_examples + outputs
+                else:
+                    return outputs
 
         for fieldname in fieldnames:
-            assert fieldname in self.input_fields
+            assert fieldname in self.output_fields or self.src.hasFields(fieldname)
         return Iterator(self)
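
The hunks above redefine the iteration contract: minibatches(None, ...) yields all fields, minibatches([f1, f2], ...) yields an object that is both indexable (by position or field name) and unpackable into per-field containers, and hasFields() replaces fieldNames() on the base class. The following standalone sketch (hypothetical code written for present-day Python, not part of the repository; ToyDataSet and the simplified Example are illustrative stand-ins for pylearn's classes) shows that contract in action:

    class Example(object):
        def __init__(self, names, values):
            self._names, self._values = list(names), list(values)
        def __getitem__(self, key):            # supports i[0] and i["fieldname"]
            if isinstance(key, int):
                return self._values[key]
            return self._values[self._names.index(key)]
        def __iter__(self):                    # supports "for i1, i2 in ..."
            return iter(self._values)

    class ToyDataSet(object):
        def __init__(self, fields):            # fields: dict of name -> list
            self.fields = fields
        def hasFields(self, *names):
            return all(name in self.fields for name in names)
        def fieldNames(self):
            return list(self.fields.keys())
        def minibatches(self, fieldnames=None, minibatch_size=1):
            if fieldnames is None:             # first syntax: all fields
                fieldnames = self.fieldNames()
            n = len(next(iter(self.fields.values())))
            for start in range(0, n, minibatch_size):
                yield Example(fieldnames,
                              [self.fields[f][start:start + minibatch_size]
                               for f in fieldnames])

    ds = ToyDataSet({'x': [1, 2, 3, 4], 'y': [5, 6, 7, 8]})
    assert ds.hasFields('x', 'y') and not ds.hasFields('z')
    for example in ds.minibatches(None, minibatch_size=2):       # first syntax
        print(example['x'], example['y'])
    for batch in ds.minibatches(['x', 'y'], minibatch_size=2):   # second syntax
        print(batch[0], batch['y'])            # indexable by position or name
    for bx, by in ds.minibatches(['x', 'y'], minibatch_size=2):  # third syntax
        print(bx, by)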
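
The large asarray() hunk fixes the choice between returning a single numpy view and making a contiguous copy: a view is possible only when no two fields share a column and the used column indices form an arithmetic progression. A minimal re-statement of that decision (a hypothetical helper for illustration, assuming non-empty, bounded field slices; not the repository's code):

    def columns_as_one_slice(n_total_columns, field_slices):
        """Return a slice covering exactly the used columns, or None if a copy is needed."""
        used = [False] * n_total_columns
        overlapping = False
        for s in field_slices:
            for c in range(s.start, s.stop, s.step or 1):
                if used[c]:
                    overlapping = True     # fields share a column
                used[c] = True
        if overlapping:
            return None                    # must copy, duplicating shared columns
        cols = [i for i, u in enumerate(used) if u]
        if not cols:
            return slice(0, 0, 1)          # degenerate case: no columns used
        step = cols[1] - cols[0] if len(cols) >= 2 else 1
        if any(b - a != step for a, b in zip(cols, cols[1:])):
            return None                    # irregular gaps: must copy
        return slice(cols[0], cols[-1] + 1, step)

    assert columns_as_one_slice(6, [slice(0, 2, 1), slice(2, 4, 1)]) == slice(0, 4, 1)
    assert columns_as_one_slice(6, [slice(0, 1, 1), slice(3, 4, 1)]) == slice(0, 4, 3)
    assert columns_as_one_slice(6, [slice(0, 2, 1), slice(1, 3, 1)]) is None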