# HG changeset patch
# User bergstrj@iro.umontreal.ca
# Date 1206570224 14400
# Node ID 813723310d75d13eb012090cd64bf98c2ce6a7a6
# Parent 88168361a5ab12a2cf52598c187283d562ba6e04
# Parent be128b9127c8cb09bd2fa80845cb1e765006c63f
commenting

diff -r 88168361a5ab -r 813723310d75 _test_dataset.py
--- a/_test_dataset.py	Tue Mar 25 13:38:51 2008 -0400
+++ b/_test_dataset.py	Wed Mar 26 18:23:44 2008 -0400
@@ -13,7 +13,7 @@
     numpy.random.seed(123456)
 
     def test0(self):
-        a=ArrayDataSet(data=numpy.random.rand(8,3),fields={"x":slice(2),"y":slice(1,3)},minibatch_size=1)
+        a=ArrayDataSet(data=numpy.random.rand(8,3),fields={"x":slice(2),"y":slice(1,3)})
         s=0
         for example in a:
             s+=_sum_all(example.x)
@@ -21,13 +21,12 @@
         self.failUnless(abs(s-7.25967597)<1e-6)
 
     def test1(self):
-        a=ArrayDataSet(data=numpy.random.rand(10,4),fields={"x":slice(2),"y":slice(1,4)},minibatch_size=1)
-        a.minibatch_size=2
+        a=ArrayDataSet(data=numpy.random.rand(10,4),fields={"x":slice(2),"y":slice(1,4)})
         s=0
-        for mb in a:
+        for mb in a.minibatches(2):
            s+=_sum_all(numpy.array(mb))
         s+=a[3:6].x[1,1]
-        for mb in ArrayDataSet(data=a.y,minibatch_size=2):
+        for mb in ArrayDataSet(data=a.y).minibatches(2):
            for e in mb:
                s+=sum(e)
         #print numpy.array(a)
diff -r 88168361a5ab -r 813723310d75 dataset.py
--- a/dataset.py	Tue Mar 25 13:38:51 2008 -0400
+++ b/dataset.py	Wed Mar 26 18:23:44 2008 -0400
@@ -1,60 +1,193 @@
+
+class Example(object):
+    """
+    An example is something that is like a tuple, but whose elements can be named, so that
+    the following syntactic constructions work as one would expect:
+       example.x = [1, 2, 3] # set a field
+       x, y, z = example
+       x = example[0]
+       x = example["x"]
+    """
+    def __init__(self,names,values):
+        assert len(values)==len(names)
+        self.__dict__['values']=values
+        self.__dict__['fields']={}
+        for i in xrange(len(values)):
+            self.fields[names[i]]=i
+
+    def __getitem__(self,i):
+        if isinstance(i,int):
+            return self.values[i]
+        else:
+            return self.values[self.fields[i]]
+
+    def __setitem__(self,i,value):
+        if isinstance(i,int):
+            self.values[i]=value
+        else:
+            self.values[self.fields[i]]=value
+
+    def __getattr__(self,name):
+        return self.values[self.fields[name]]
+
+    def __setattr__(self,name,value):
+        self.values[self.fields[name]]=value
+
+    def __len__(self):
+        return len(self.values)
 
 class DataSet(object):
-    """
-    This is a virtual base class or interface for datasets.
-    A dataset is basically an iterator over examples. It does not necessarily
-    have a fixed length (this is useful for 'streams' which feed on-line learning).
-    Datasets with fixed and known length are FiniteDataSet, a subclass of DataSet.
-    Examples and datasets optionally have named fields.
-    One can obtain a sub-dataset by taking dataset.field or dataset(field1,field2,field3,...).
-    Fields are not mutually exclusive, i.e. two fields can overlap in their actual content.
-    The content of a field can be of any type, but often will be a numpy array.
-    The minibatch_size attribute, if different than 1, means that the iterator (next() method)
-    returns not a single example but an array of length minibatch_size, i.e., an indexable
-    object with minibatch_size examples in it.
+    """A virtual base class for datasets.
+
+    A DataSet is a generator of iterators; these iterators can run through the
+    examples in a variety of ways. A DataSet need not necessarily have a finite
+    or known length, so this class can be used to interface to a 'stream' which
+    feeds on-line learning.
+
+    To iterate over examples, there are several possibilities:
+    - for i in dataset.zip(field1, field2, field3, ...)
+    - for i in dataset.minibatches(N, field1, field2, ...)
+    - for i in dataset
+    Each of these is documented below.
+
+    Note: For a dataset of fixed and known length, which can implement item
+    random-access efficiently (e.g. indexing and slicing) and which can profit
+    from the FiniteDataSetIterator, consider using the base class FiniteDataSet.
+
+    Note: Fields are not mutually exclusive, i.e. two fields can overlap in their actual content.
+
+    Note: The content of a field can be of any type.
+    """
 
-    def __init__(self,minibatch_size=1):
-        assert minibatch_size>0
-        self.minibatch_size=minibatch_size
+    def __init__(self):
+        pass
+
+    def __iter__(self):
+        """Supports the syntax "for i in dataset: ..."
 
-    def __iter__(self):
+        Using this syntax, "i" will be an Example instance (or equivalent) with
+        all the fields of DataSet self. Every field of "i" will give access to
+        the corresponding field of a single example. Fields should be accessible
+        via i[identifier], but the derived class is free to accept any type of
+        identifier, and add extra functionality to the iterator.
+        """
+        raise NotImplementedError
+
+    def zip(self, *fieldnames):
         """
-        Return an iterator, whose next() method returns the next example or the next
-        minibatch in the dataset. A minibatch (of length > 1) should be something one
-        can iterate on again in order to obtain the individual examples. If the dataset
-        has fields, then the example or the minibatch must have the same fields
-        (typically this is implemented by returning another smaller dataset, when
-        there are fields).
+        Supports two forms of syntax:
+
+            for i in dataset.zip(f1, f2, f3): ...
+
+            for i1, i2, i3 in dataset.zip(f1, f2, f3): ...
+
+        Using the first syntax, "i" will be an indexable object, such as a list,
+        tuple, or Example instance, such that on every iteration, i[0] is the f1
+        field of the current example, i[1] is the f2 field, and so on.
+
+        Using the second syntax, i1, i2, i3 will contain the contents of the
+        f1, f2, and f3 fields of a single example on each loop iteration.
+
+        The derived class may accept fieldname arguments of any type.
         """
         raise NotImplementedError
 
-    def __getattr__(self,fieldname):
-        """Return a sub-dataset containing only the given fieldname as field."""
-        return self(fieldname)
+    def minibatches(self,minibatch_size,*fieldnames):
+        """
+        Similar to zip, but iterate over minibatches rather than over single
+        examples. Supports two forms of syntax:
+
+            for mb in dataset.minibatches(N, f1, f2, f3): ...
+
+            for mb1, mb2, mb3 in dataset.minibatches(N, f1, f2, f3): ...
 
-    def __call__(self,*fieldnames):
-        """Return a sub-dataset containing only the given fieldnames as fields."""
+        Using the first syntax, "mb" will be an indexable object such that on
+        every iteration, mb[0] contains the f1 fields of N consecutive examples,
+        mb[1] their f2 fields, and so on. Using the second syntax, mb1, mb2, mb3
+        will contain the contents of the f1, f2, and f3 fields for a minibatch
+        of N examples on each loop iteration.
+
+        Return a minibatch iterator, whose next() method returns an 'example'
+        whose fields are iterable objects: iterating over such a field yields
+        the individual values of that field within the minibatch (typically
+        these will be arrays with minibatch_size rows).
+
+        The derived class may accept fieldname arguments of any type.
+        """
+        raise NotImplementedError
+
+    def fieldNames(self):
+        """Return the list of field names in the examples of this dataset."""
+        raise NotImplementedError
 
-    def fieldNames(self):
-        """Return the list of field names that are supported by getattr and getFields."""
+    def rename(self,*new_field_specifications):
+        """
+        Return a new dataset that maps old fields (of self) to new fields (of the returned
+        dataset). The minimal syntax that should be supported is the following:
+           new_field_specifications = [new_field_spec1, new_field_spec2, ...]
+           new_field_spec = ([old_field1, old_field2, ...], new_field)
+        In general both old_field and new_field should be strings, but some datasets may also
+        support additional indexing schemes within each field (e.g. column slice
+        of a matrix-like field).
+        """
         raise NotImplementedError
 
 class FiniteDataSet(DataSet):
     """
     Virtual interface, a subclass of DataSet for datasets which have a finite, known length.
     Examples are indexed by an integer between 0 and self.length()-1,
-    and a subdataset can be obtained by slicing.
+    and a subdataset can be obtained by slicing. This is appropriate only for
+    datasets which can access both rows AND fields in an efficient random-access
+    way; users are encouraged to expect only the generic DataSet interface in
+    general. A FiniteDataSet is mainly useful when one has to obtain a subset of
+    examples (e.g. for splitting a dataset into training and test sets).
     """
 
-    def __init__(self,minibatch_size):
-        DataSet.__init__(self,minibatch_size)
+    def __init__(self):
+        pass
 
     def __iter__(self):
         return FiniteDataSetIterator(self)
 
+    def zip(self,*fieldnames):
+        return FiniteDataSetIterator(self,1,fieldnames)
+
+    def minibatches(self,minibatch_size,*fieldnames):
+        return FiniteDataSetIterator(self,minibatch_size,fieldnames)
+
+    def __getattr__(self,fieldname):
+        """Return an object that can iterate over the values of the field in this dataset."""
+        return self(fieldname)
+
+    def __call__(self,*fieldnames):
+        """Return a sub-dataset containing only the given fieldnames as fields.
+
+        The return value's default iterator will iterate only over the given
+        fields.
+        """
+        raise NotImplementedError
+
     def __len__(self):
         """len(dataset) returns the number of examples in the dataset."""
         raise NotImplementedError
@@ -68,32 +201,32 @@
         raise NotImplementedError
 
 class FiniteDataSetIterator(object):
-    def __init__(self,dataset):
+    """
+    If the fieldnames list is empty, it means that we want to see ALL the fields.
+    """
+    def __init__(self,dataset,minibatch_size=1,fieldnames=[]):
         self.dataset=dataset
-        self.current = -self.dataset.minibatch_size
-
+        self.minibatch_size=minibatch_size
+        assert minibatch_size>=1 and minibatch_size<=len(dataset)
+        self.current = -self.minibatch_size
+        self.fieldnames = fieldnames
+
+    def __iter__(self):
+        return self
+
     def next(self):
-        """
-        Return the next example(s) in the dataset. If self.dataset.minibatch_size>1 return that
-        many examples. If the dataset has fields, the example or the minibatch of examples
-        is just a minibatch_size-rows ArrayDataSet (so that the fields can be accessed),
-        but that resulting mini-dataset has a minibatch_size of 1, so that one can iterate
-        example-wise on it. On the other hand, if the dataset has no fields (e.g. because
-        it is already the field of a bigger dataset), then the returned example or minibatch
-        may be any indexable object, such as a numpy array. Following the array semantics of indexing
-        and slicing, if the minibatch_size is 1 (and there are no fields), then the result is an array
-        with one less dimension (e.g., a vector, if the dataset is a matrix), corresponding
-        to a row. Again, if the minibatch_size is >1, one can iterate on the result to
-        obtain individual examples (as rows).
-        """
-        self.current+=self.dataset.minibatch_size
+        self.current+=self.minibatch_size
         if self.current>=len(self.dataset):
-            self.current=-self.dataset.minibatch_size
+            self.current=-self.minibatch_size
             raise StopIteration
-        if self.dataset.minibatch_size==1:
-            return self.dataset[self.current]
+        if self.minibatch_size==1:
+            complete_example=self.dataset[self.current]
         else:
-            return self.dataset[self.current:self.current+self.dataset.minibatch_size]
+            complete_example=self.dataset[self.current:self.current+self.minibatch_size]
+        if self.fieldnames:
+            return Example(self.fieldnames,list(complete_example))
+        else:
+            return complete_example
 
 
 # we may want ArrayDataSet defined in another python file
 
@@ -102,27 +235,46 @@
 
 class ArrayDataSet(FiniteDataSet):
     """
-    An ArrayDataSet behaves like a numpy array but adds the notion of fields
-    and minibatch_size from DataSet. It is a fixed-length and fixed-width dataset
+    An ArrayDataSet behaves like a numpy array but adds the notion of named fields
+    from DataSet (and the ability to view multiple field values as an 'Example').
+    It is a fixed-length and fixed-width dataset
     in which each element is a numpy array or a number, hence the whole
    dataset corresponds to a numpy array. Fields
     must correspond to a slice of array columns. If the dataset has fields,
     each 'example' is just a one-row ArrayDataSet, otherwise it is a numpy array.
-    Any dataset can also be converted to a numpy array (losing the notion of fields
-    and of minibatch_size) by the numpy.array(dataset) call.
+    Any dataset can also be converted to a numpy array (losing the notion of
+    fields) by the numpy.array(dataset) call.
     """
-    def __init__(self,dataset=None,data=None,fields={},minibatch_size=1):
+    def __init__(self,dataset=None,data=None,fields={}):
         """
-        There are two ways to construct an ArrayDataSet: (1) from an
-        existing dataset (which may result in a copy of the data in a numpy array),
-        or (2) from a numpy.array (the data argument), along with an optional description
-        of the fields (dictionary of column slices indexed by field names).
+        There are two ways to construct an ArrayDataSet: (1) from an
+        existing dataset (which may result in a copy of the data in a numpy array),
+        or (2) from a numpy.array (the data argument), along with an optional description
+        of the fields (dictionary of column slices indexed by field names).
         """
-        FiniteDataSet.__init__(self,minibatch_size)
         if dataset!=None:
             assert data==None and fields=={}
-            # convert dataset to an ArrayDataSet
+            # Make ONE big minibatch with all the examples, to separate the fields.
+            n_examples=len(dataset)
+            batch = dataset.minibatches(n_examples).next()
+            # Each field of the underlying dataset must be convertible to a numpy array
+            # of the same type (currently just double, but we should use the smallest
+            # compatible dtype).
+            n_fields = len(batch)
+            fieldnames = batch.fields.keys()
+            total_width = 0
+            common_dtype = None  # placeholder: smallest dtype compatible with all fields
+            for i in xrange(n_fields):
+                field = numpy.array(batch[i])
+                assert field.shape[0]==n_examples
+                width = field.shape[1]
+                start = total_width
+                total_width += width
+                fields[fieldnames[i]] = slice(start,total_width,1)
+            # many complicated things remain to be done:
+            #  - find the common dtype
+            #  - decide what to do with extra dimensions if they are not the same in all fields
+            #  - try to see if we can avoid the copy
             raise NotImplementedError
         if data!=None:
             assert dataset==None
@@ -142,13 +294,12 @@
             fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step)
             # and coherent with the data array
             assert fieldslice.start>=0 and fieldslice.stop<=self.width
-            assert minibatch_size<=len(self.data)
 
     def __getattr__(self,fieldname):
         """
         Return a numpy array with the content associated with the given field name.
         If this is a one-example dataset, then a row, i.e., numpy array (of one less dimension
-        than the dataset.data) is returned.
+        than the dataset itself) is returned.
         """
         if len(self.data)==1:
             return self.data[0,self.fields[fieldname]]
@@ -164,7 +315,7 @@
         new_fields={}
         for field in self.fields:
             new_fields[field[0]]=slice(field[1].start-min_col,field[1].stop-min_col,field[1].step)
-        return ArrayDataSet(data=self.data[:,min_col:max_col],fields=new_fields,minibatch_size=self.minibatch_size)
+        return ArrayDataSet(data=self.data[:,min_col:max_col],fields=new_fields)
 
     def fieldNames(self):
         """Return the list of field names that are supported by getattr and getFields."""
@@ -176,13 +327,12 @@
 
     def __getitem__(self,i):
         """
-        dataset[i] returns the (i+1)-th example of the dataset. If the dataset has fields
-        then a one-example dataset is returned (to be able to handle example.field accesses).
+        dataset[i] returns the (i+1)-th Example of the dataset. If there are no fields,
+        the result is just a numpy array (the i-th row of the dataset's data matrix).
         """
         if self.fields:
-            if isinstance(i,slice):
-                return ArrayDataSet(data=data[slice],fields=self.fields)
-            return ArrayDataSet(data=self.data[i:i+1],fields=self.fields)
+            fieldnames,fieldslices=zip(*self.fields.items())
+            return Example(fieldnames,[self.data[i,fieldslice] for fieldslice in fieldslices])
         else:
             return self.data[i]
@@ -241,3 +391,5 @@
             result[:,slice(c,slice_width)]=self.data[:,field_slice]
             c+=slice_width
         return result
+
+
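
For reference, the snippet below is a hypothetical usage sketch, not part of the changeset: it exercises the interface the new docstrings describe, mirroring the updated tests in _test_dataset.py. It assumes Python 2 (the iterators define next() and the module uses xrange), that the patched dataset.py is importable as the module dataset, and that numpy is installed.

    import numpy
    from dataset import ArrayDataSet

    numpy.random.seed(123456)

    # 8 examples with 4 columns each; fields are column slices and may
    # overlap: "x" covers columns 0-1, "y" covers columns 1-3.
    a = ArrayDataSet(data=numpy.random.rand(8,4),
                     fields={"x":slice(2),"y":slice(1,4)})

    # Example-wise iteration: each item is an Example, so a field is
    # reachable by attribute or by name.
    for example in a:
        print example.x, example["y"]

    # Minibatch iteration: each field of the returned 'example' holds
    # minibatch_size consecutive rows.
    for mb in a.minibatches(2):
        print mb.x.shape, mb.y.shape   # (2, 2) and (2, 3)

The sketch deliberately avoids passing explicit fieldnames to minibatches(): as written in the patch, the iterator pairs the caller's names with values taken in the dataset's own field order, so named selection is only reliable once that ordering question is settled.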