# HG changeset patch # User bengioy@esprit.iro.umontreal.ca # Date 1206558090 14400 # Node ID be128b9127c8cb09bd2fa80845cb1e765006c63f # Parent 80bf5492e571d48cc7cfc5c0b914fb4d9fd9ea2a Debugged (to the extent of my tests) the new version of dataset diff -r 80bf5492e571 -r be128b9127c8 _test_dataset.py --- a/_test_dataset.py Tue Mar 25 11:39:02 2008 -0400 +++ b/_test_dataset.py Wed Mar 26 15:01:30 2008 -0400 @@ -13,7 +13,7 @@ numpy.random.seed(123456) def test0(self): - a=ArrayDataSet(data=numpy.random.rand(8,3),fields={"x":slice(2),"y":slice(1,3)},minibatch_size=1) + a=ArrayDataSet(data=numpy.random.rand(8,3),fields={"x":slice(2),"y":slice(1,3)}) s=0 for example in a: s+=_sum_all(example.x) @@ -21,13 +21,12 @@ self.failUnless(abs(s-7.25967597)<1e-6) def test1(self): - a=ArrayDataSet(data=numpy.random.rand(10,4),fields={"x":slice(2),"y":slice(1,4)},minibatch_size=1) - a.minibatch_size=2 + a=ArrayDataSet(data=numpy.random.rand(10,4),fields={"x":slice(2),"y":slice(1,4)}) s=0 - for mb in a: + for mb in a.minibatches(2): s+=_sum_all(numpy.array(mb)) s+=a[3:6].x[1,1] - for mb in ArrayDataSet(data=a.y,minibatch_size=2): + for mb in ArrayDataSet(data=a.y).minibatches(2): for e in mb: s+=sum(e) #print numpy.array(a) diff -r 80bf5492e571 -r be128b9127c8 dataset.py --- a/dataset.py Tue Mar 25 11:39:02 2008 -0400 +++ b/dataset.py Wed Mar 26 15:01:30 2008 -0400 @@ -1,60 +1,139 @@ + +class Example(object): + """ + An example is something that is like a tuple but whose elements can be named, so that the + following syntactic constructions work as one would expect: + example.x = [1, 2, 3] # set a field + x, y, z = example + x = example[0] + x = example["x"] + """ + def __init__(self,names,values): + assert len(values)==len(names) + self.__dict__['values']=values + self.__dict__['fields']={} + for i in xrange(len(values)): + self.fields[names[i]]=i + + def __getitem__(self,i): + if isinstance(i,int): + return self.values[i] + else: + return self.values[self.fields[i]] + + def 
__setitem__(self,i,value): + if isinstance(i,int): + self.values[i]=value + else: + self.values[self.fields[i]]=value + + def __getattr__(self,name): + return self.values[self.fields[name]] + + def __setattr__(self,name,value): + self.values[self.fields[name]]=value + + def __len__(self): + return len(self.values) class DataSet(object): """ This is a virtual base class or interface for datasets. - A dataset is basically an iterator over examples. It does not necessarily + A dataset is basically an iterator over Examples (or anything that + behaves like an Example). It does not necessarily have a fixed length (this is useful for 'streams' which feed on-line learning). - Datasets with fixed and known length are FiniteDataSet, a subclass of DataSet. - Examples and datasets optionally have named fields. - One can obtain a sub-dataset by taking dataset.field or dataset(field1,field2,field3,...). + Datasets with fixed and known length are instances of FiniteDataSet, a subclass + which supports indexing (dataset[i]) and slicing (dataset[1000:2000]). + To iterate over a subset of the fields, one should use the dataset.zip(field1, field2,field3, ...) + method which returns an iterator over only the desired fields. Fields are not mutually exclusive, i.e. two fields can overlap in their actual content. The content of a field can be of any type, but often will be a numpy array. - The minibatch_size attribute, if different than 1, means that the iterator (next() method) - returns not a single example but an array of length minibatch_size, i.e., an indexable - object with minibatch_size examples in it. + If one iterates through minibatches of examples (with the minibatches() method + or with the minibatch_size argument of the zip() method), then the fields + returned by the iterator's next method should be iterators over the + individual values within the minibatch (typically these will be arrays + with minibatch_size rows). 
""" - def __init__(self,minibatch_size=1): - assert minibatch_size>0 - self.minibatch_size=minibatch_size - + def __init__(self): + pass + def __iter__(self): """ Return an iterator, whose next() method returns the next example or the next - minibatch in the dataset. A minibatch (of length > 1) should be something one - can iterate on again in order to obtain the individual examples. If the dataset - has fields, then the example or the minibatch must have the same fields - (typically this is implemented by returning another smaller dataset, when - there are fields). + minibatch in the dataset. A minibatch (of length > 1) is also an example, but + whose fields should be something one can iterate on again in order to obtain + the individual examples. + """ + raise NotImplementedError + + def zip(self,*fieldnames): + """ + Return an iterator which sees only the specified fields (each fieldname is a + field key, typically a string). The value returned at each iteration + is a tuple with one element per field. Hence it can be used like this: + for f1, f2, f3 in dataset.zip('field1','field2','field3'): + ... use f1, f2, and f3 """ raise NotImplementedError + def minibatches(self,minibatch_size,*fieldnames): + """ + Similar to zip but iterate over minibatches. + Return a minibatch iterator, whose next() method returns an 'example' + whose fields are iteratable objects (which can iterate over the individual + values of that field in the minibatch). + """ + raise NotImplementedError + + def fieldNames(self): + """Return the list of field names in the examples of this dataset.""" + raise NotImplementedError + + def rename(*new_field_specifications): + """ + Return a new dataset that maps old fields (of self) to new fields (of the returned + dataset). The minimal syntax that should be supported is the following: + new_field_specifications = [new_field_spec1, new_field_spec2, ...] 
+ new_field_spec = ([old_field1, old_field2, ...], new_field) + In general both old_field and new_field should be strings, but some datasets may also + support additional indexing schemes within each field (e.g. column slice + of a matrix-like field). + """ + raise NotImplementedError + +class FiniteDataSet(DataSet): + """ + Virtual interface, a subclass of DataSet for datasets which have a finite, known length. + Examples are indexed by an integer between 0 and self.length()-1, + and a subdataset can be obtained by slicing. This may not be appropriate in general + but only for datasets which can be thought of like ones that access rows AND fields + in an efficient random access way. Users are encouraged to expect only the generic dataset + interface in general. A FiniteDataSet is mainly useful when one has to obtain + a subset of examples (e.g. for splitting a dataset into training and test sets). + """ + + def __init__(self): + pass + + def __iter__(self): + return FiniteDataSetIterator(self) + + def zip(self,*fieldnames): + return FiniteDataSetIterator(self,1,fieldnames) + + def minibatches(self,minibatch_size,*fieldnames): + return FiniteDataSetIterator(self,minibatch_size,fieldnames) + def __getattr__(self,fieldname): - """Return a sub-dataset containing only the given fieldname as field.""" + """Return an object that can iterate over the values of the field in this dataset.""" return self(fieldname) def __call__(self,*fieldnames): """Return a sub-dataset containing only the given fieldnames as fields.""" raise NotImplementedError - def fieldNames(self): - """Return the list of field names that are supported by getattr and getFields.""" - raise NotImplementedError - -class FiniteDataSet(DataSet): - """ - Virtual interface, a subclass of DataSet for datasets which have a finite, known length. - Examples are indexed by an integer between 0 and self.length()-1, - and a subdataset can be obtained by slicing. 
- """ - - def __init__(self,minibatch_size): - DataSet.__init__(self,minibatch_size) - - def __iter__(self): - return FiniteDataSetIterator(self) - def __len__(self): """len(dataset) returns the number of examples in the dataset.""" raise NotImplementedError @@ -68,32 +147,32 @@ raise NotImplementedError class FiniteDataSetIterator(object): - def __init__(self,dataset): + """ + If the fieldnames list is empty, it means that we want to see ALL the fields. + """ + def __init__(self,dataset,minibatch_size=1,fieldnames=[]): self.dataset=dataset - self.current = -self.dataset.minibatch_size - + self.minibatch_size=minibatch_size + assert minibatch_size>=1 and minibatch_size<=len(dataset) + self.current = -self.minibatch_size + self.fieldnames = fieldnames + + def __iter__(self): + return self + def next(self): - """ - Return the next example(s) in the dataset. If self.dataset.minibatch_size>1 return that - many examples. If the dataset has fields, the example or the minibatch of examples - is just a minibatch_size-rows ArrayDataSet (so that the fields can be accessed), - but that resulting mini-dataset has a minibatch_size of 1, so that one can iterate - example-wise on it. On the other hand, if the dataset has no fields (e.g. because - it is already the field of a bigger dataset), then the returned example or minibatch - may be any indexable object, such as a numpy array. Following the array semantics of indexing - and slicing, if the minibatch_size is 1 (and there are no fields), then the result is an array - with one less dimension (e.g., a vector, if the dataset is a matrix), corresponding - to a row. Again, if the minibatch_size is >1, one can iterate on the result to - obtain individual examples (as rows). 
- """ - self.current+=self.dataset.minibatch_size + self.current+=self.minibatch_size if self.current>=len(self.dataset): - self.current=-self.dataset.minibatch_size + self.current=-self.minibatch_size raise StopIteration - if self.dataset.minibatch_size==1: - return self.dataset[self.current] + if self.minibatch_size==1: + complete_example=self.dataset[self.current] else: - return self.dataset[self.current:self.current+self.dataset.minibatch_size] + complete_example=self.dataset[self.current:self.current+self.minibatch_size] + if self.fieldnames: + return Example(self.fieldnames,list(complete_example)) + else: + return complete_example # we may want ArrayDataSet defined in another python file @@ -102,27 +181,46 @@ class ArrayDataSet(FiniteDataSet): """ - An ArrayDataSet behaves like a numpy array but adds the notion of fields - and minibatch_size from DataSet. It is a fixed-length and fixed-width dataset + An ArrayDataSet behaves like a numpy array but adds the notion of named fields + from DataSet (and the ability to view multiple field values as an 'Example'). + It is a fixed-length and fixed-width dataset in which each element is a numpy array or a number, hence the whole dataset corresponds to a numpy array. Fields must correspond to a slice of array columns. If the dataset has fields, each 'example' is just a one-row ArrayDataSet, otherwise it is a numpy array. Any dataset can also be converted to a numpy array (losing the notion of fields - and of minibatch_size) by the numpy.array(dataset) call. + ) by the numpy.array(dataset) call. """ - def __init__(self,dataset=None,data=None,fields={},minibatch_size=1): + def __init__(self,dataset=None,data=None,fields={}): """ - There are two ways to construct an ArrayDataSet: (1) from an - existing dataset (which may result in a copy of the data in a numpy array), - or (2) from a numpy.array (the data argument), along with an optional description - of the fields (dictionary of column slices indexed by field names). 
+ There are two ways to construct an ArrayDataSet: (1) from an + existing dataset (which may result in a copy of the data in a numpy array), + or (2) from a numpy.array (the data argument), along with an optional description + of the fields (dictionary of column slices indexed by field names). """ - FiniteDataSet.__init__(self,minibatch_size) if dataset!=None: assert data==None and fields=={} - # convert dataset to an ArrayDataSet + # Make ONE big minibatch with all the examples, to separate the fields. + n_examples=len(dataset) + batch = dataset.minibatches(n_examples).next() + # Each field of the underlying dataset must be convertible to a numpy array of the same type + # currently just double, but should use the smallest compatible dtype + n_fields = len(batch) + fieldnames = batch.fields.keys() + total_width = 0 + type = None + for i in xrange(n_fields): + field = array(batch[i]) + assert field.shape[0]==n_examples + width = field.shape[1] + start=total_width + total_width += width + fields[fieldnames[i]]=slice(start,total_width,1) + # many complicated things remain to be done: + # - find common dtype + # - decide what to do with extra dimensions if not the same in all fields + # - try to see if we can avoid the copy? raise NotImplementedError if data!=None: assert dataset==None @@ -142,13 +240,12 @@ fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step) # and coherent with the data array assert fieldslice.start>=0 and fieldslice.stop<=self.width - assert minibatch_size<=len(self.data) def __getattr__(self,fieldname): """ Return a numpy array with the content associated with the given field name. If this is a one-example dataset, then a row, i.e., numpy array (of one less dimension - than the dataset.data) is returned. + than the dataset itself) is returned. 
""" if len(self.data)==1: return self.data[0,self.fields[fieldname]] @@ -164,7 +261,7 @@ new_fields={} for field in self.fields: new_fields[field[0]]=slice(field[1].start-min_col,field[1].stop-min_col,field[1].step) - return ArrayDataSet(data=self.data[:,min_col:max_col],fields=new_fields,minibatch_size=self.minibatch_size) + return ArrayDataSet(data=self.data[:,min_col:max_col],fields=new_fields) def fieldNames(self): """Return the list of field names that are supported by getattr and getFields.""" @@ -176,13 +273,12 @@ def __getitem__(self,i): """ - dataset[i] returns the (i+1)-th example of the dataset. If the dataset has fields - then a one-example dataset is returned (to be able to handle example.field accesses). + dataset[i] returns the (i+1)-th Example of the dataset. If there are no fields + the result is just a numpy array (for the i-th row of the dataset data matrix). """ if self.fields: - if isinstance(i,slice): - return ArrayDataSet(data=data[slice],fields=self.fields) - return ArrayDataSet(data=self.data[i:i+1],fields=self.fields) + fieldnames,fieldslices=zip(*self.fields.items()) + return Example(fieldnames,[self.data[i,fieldslice] for fieldslice in fieldslices]) else: return self.data[i] @@ -232,3 +328,5 @@ result[:,slice(c,slice_width)]=self.data[:,field_slice] c+=slice_width return result + +