comparison dataset.py @ 11:be128b9127c8

Debugged (to the extent of my tests) the new version of dataset
author bengioy@esprit.iro.umontreal.ca
date Wed, 26 Mar 2008 15:01:30 -0400
parents de616c423dbd
children ff4e551490f1 813723310d75
--- a/dataset.py	(10:80bf5492e571)
+++ b/dataset.py	(11:be128b9127c8)
@@ -1,62 +1,141 @@
+
+class Example(object):
+    """
+    An example is something that is like a tuple but whose elements can be named, so that
+    the following syntactic constructions work as one would expect:
+        example.x = [1, 2, 3] # set a field
+        x, y, z = example
+        x = example[0]
+        x = example["x"]
+    """
+    def __init__(self,names,values):
+        assert len(values)==len(names)
+        self.__dict__['values']=values
+        self.__dict__['fields']={}
+        for i in xrange(len(values)):
+            self.fields[names[i]]=i
+
+    def __getitem__(self,i):
+        if isinstance(i,int):
+            return self.values[i]
+        else:
+            return self.values[self.fields[i]]
+
+    def __setitem__(self,i,value):
+        if isinstance(i,int):
+            self.values[i]=value
+        else:
+            self.values[self.fields[i]]=value
+
+    def __getattr__(self,name):
+        return self.values[self.fields[name]]
+
+    def __setattr__(self,name,value):
+        self.values[self.fields[name]]=value
+
+    def __len__(self):
+        return len(self.values)
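
A quick usage sketch of the Example class above (not part of the changeset; the field names 'x' and 'y' and all values are invented):

    # Hypothetical usage of Example: tuple-like behaviour plus named fields.
    e = Example(['x','y'], [[1,2,3], 0])
    e.x = [4,5,6]        # set a field by name (goes through __setattr__)
    print e[0], e['y']   # index by position or by field name (__getitem__)
    x, y = e             # unpacks like a tuple (__getitem__ + __len__)
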
 
 
 class DataSet(object):
     """
     This is a virtual base class or interface for datasets.
-    A dataset is basically an iterator over examples. It does not necessarily
+    A dataset is basically an iterator over Examples (or anything that
+    behaves like an Example). It does not necessarily
     have a fixed length (this is useful for 'streams' which feed on-line learning).
-    Datasets with fixed and known length are FiniteDataSet, a subclass of DataSet.
-    Examples and datasets optionally have named fields.
-    One can obtain a sub-dataset by taking dataset.field or dataset(field1,field2,field3,...).
+    Datasets with fixed and known length are instances of FiniteDataSet, a subclass
+    which supports indexing (dataset[i]) and slicing (dataset[1000:2000]).
+    To iterate over a subset of the fields, one should use the dataset.zip(field1, field2, field3, ...)
+    method which returns an iterator over only the desired fields.
     Fields are not mutually exclusive, i.e. two fields can overlap in their actual content.
     The content of a field can be of any type, but often will be a numpy array.
-    The minibatch_size attribute, if different than 1, means that the iterator (next() method)
-    returns not a single example but an array of length minibatch_size, i.e., an indexable
-    object with minibatch_size examples in it.
+    If one iterates through minibatches of examples (with the minibatches() method
+    or with the minibatch_size argument of the zip() method), then the fields
+    returned by the iterator's next() method should be iterators over the
+    individual values within the minibatch (typically these will be arrays
+    with minibatch_size rows).
     """
 
-    def __init__(self,minibatch_size=1):
-        assert minibatch_size>0
-        self.minibatch_size=minibatch_size
+    def __init__(self):
+        pass
 
     def __iter__(self):
         """
         Return an iterator, whose next() method returns the next example or the next
-        minibatch in the dataset. A minibatch (of length > 1) should be something one
-        can iterate on again in order to obtain the individual examples. If the dataset
-        has fields, then the example or the minibatch must have the same fields
-        (typically this is implemented by returning another smaller dataset, when
-        there are fields).
+        minibatch in the dataset. A minibatch (of length > 1) is also an example, but
+        whose fields should be something one can iterate on again in order to obtain
+        the individual examples.
         """
         raise NotImplementedError
+
+    def zip(self,*fieldnames):
+        """
+        Return an iterator which sees only the specified fields (each fieldname is a
+        field key, typically a string). The value returned at each iteration
+        is a tuple with one element per field. Hence it can be used like this:
+            for f1, f2, f3 in dataset.zip('field1','field2','field3'):
+                ... use f1, f2, and f3
+        """
+        raise NotImplementedError
+
+    def minibatches(self,minibatch_size,*fieldnames):
+        """
+        Similar to zip but iterate over minibatches.
+        Return a minibatch iterator, whose next() method returns an 'example'
+        whose fields are iterable objects (which can iterate over the individual
+        values of that field in the minibatch).
+        """
+        raise NotImplementedError
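
A sketch of the intended use of minibatches(), not part of the changeset; d and its fields 'x' and 'y' are hypothetical, and each value yielded for a field is itself iterable over the minibatch:

    # Hypothetical: sweep a dataset in minibatches of 32 examples.
    for x_batch, y_batch in d.minibatches(32, 'x', 'y'):
        for x, y in zip(x_batch, y_batch):   # builtin zip, not DataSet.zip
            pass                             # use one (x,y) pair at a time
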
+
+    def fieldNames(self):
+        """Return the list of field names in the examples of this dataset."""
+        raise NotImplementedError
+
+    def rename(self,*new_field_specifications):
+        """
+        Return a new dataset that maps old fields (of self) to new fields (of the returned
+        dataset). The minimal syntax that should be supported is the following:
+            new_field_specifications = [new_field_spec1, new_field_spec2, ...]
+            new_field_spec = ([old_field1, old_field2, ...], new_field)
+        In general both old_field and new_field should be strings, but some datasets may also
+        support additional indexing schemes within each field (e.g. column slice
+        of a matrix-like field).
+        """
+        raise NotImplementedError
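
For instance, a hypothetical call following that spec and the varargs signature, grouping two old fields under one new name:

    # Hypothetical: view 'x1' and 'x2' together as 'inputs', 'y' as 'target'.
    new_dataset = dataset.rename((['x1','x2'],'inputs'), (['y'],'target'))
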
+
+class FiniteDataSet(DataSet):
+    """
+    Virtual interface, a subclass of DataSet for datasets which have a finite, known length.
+    Examples are indexed by an integer between 0 and self.length()-1,
+    and a subdataset can be obtained by slicing. This may not be appropriate in general
+    but only for datasets which can be thought of as ones that access rows AND fields
+    in an efficient random access way. Users are encouraged to expect only the generic dataset
+    interface in general. A FiniteDataSet is mainly useful when one has to obtain
+    a subset of examples (e.g. for splitting a dataset into training and test sets).
+    """
+
+    def __init__(self):
+        pass
+
+    def __iter__(self):
+        return FiniteDataSetIterator(self)
+
+    def zip(self,*fieldnames):
+        return FiniteDataSetIterator(self,1,fieldnames)
+
+    def minibatches(self,minibatch_size,*fieldnames):
+        return FiniteDataSetIterator(self,minibatch_size,fieldnames)
 
     def __getattr__(self,fieldname):
-        """Return a sub-dataset containing only the given fieldname as field."""
+        """Return an iterator that can iterate over the values of the field in this dataset."""
         return self(fieldname)
 
     def __call__(self,*fieldnames):
         """Return a sub-dataset containing only the given fieldnames as fields."""
         raise NotImplementedError
 
-    def fieldNames(self):
-        """Return the list of field names that are supported by getattr and getFields."""
-        raise NotImplementedError
-
-class FiniteDataSet(DataSet):
-    """
-    Virtual interface, a subclass of DataSet for datasets which have a finite, known length.
-    Examples are indexed by an integer between 0 and self.length()-1,
-    and a subdataset can be obtained by slicing.
-    """
-
-    def __init__(self,minibatch_size):
-        DataSet.__init__(self,minibatch_size)
-
-    def __iter__(self):
-        return FiniteDataSetIterator(self)
-
     def __len__(self):
         """len(dataset) returns the number of examples in the dataset."""
         raise NotImplementedError
 
     def __getitem__(self,i):
@@ -66,65 +145,84 @@
     def __getslice__(self,*slice_args):
         """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1."""
         raise NotImplementedError
 
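As the FiniteDataSet docstring suggests, slicing gives train/test splits directly; a sketch (d is a hypothetical FiniteDataSet subclass with 1000 examples):

    # Hypothetical split using __getslice__ and __len__.
    train_set = d[0:800]
    test_set  = d[800:1000]
    assert len(train_set)==800
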
 class FiniteDataSetIterator(object):
-    def __init__(self,dataset):
+    """
+    If the fieldnames list is empty, it means that we want to see ALL the fields.
+    """
+    def __init__(self,dataset,minibatch_size=1,fieldnames=[]):
         self.dataset=dataset
-        self.current = -self.dataset.minibatch_size
+        self.minibatch_size=minibatch_size
+        assert minibatch_size>=1 and minibatch_size<=len(dataset)
+        self.current = -self.minibatch_size
+        self.fieldnames = fieldnames
 
+    def __iter__(self):
+        return self
+
     def next(self):
-        """
-        Return the next example(s) in the dataset. If self.dataset.minibatch_size>1 return that
-        many examples. If the dataset has fields, the example or the minibatch of examples
-        is just a minibatch_size-rows ArrayDataSet (so that the fields can be accessed),
-        but that resulting mini-dataset has a minibatch_size of 1, so that one can iterate
-        example-wise on it. On the other hand, if the dataset has no fields (e.g. because
-        it is already the field of a bigger dataset), then the returned example or minibatch
-        may be any indexable object, such as a numpy array. Following the array semantics of indexing
-        and slicing, if the minibatch_size is 1 (and there are no fields), then the result is an array
-        with one less dimension (e.g., a vector, if the dataset is a matrix), corresponding
-        to a row. Again, if the minibatch_size is >1, one can iterate on the result to
-        obtain individual examples (as rows).
-        """
-        self.current+=self.dataset.minibatch_size
+        self.current+=self.minibatch_size
         if self.current>=len(self.dataset):
-            self.current=-self.dataset.minibatch_size
+            self.current=-self.minibatch_size
             raise StopIteration
-        if self.dataset.minibatch_size==1:
-            return self.dataset[self.current]
+        if self.minibatch_size==1:
+            complete_example=self.dataset[self.current]
         else:
-            return self.dataset[self.current:self.current+self.dataset.minibatch_size]
+            complete_example=self.dataset[self.current:self.current+self.minibatch_size]
+        if self.fieldnames:
+            return Example(self.fieldnames,list(complete_example))
+        else:
+            return complete_example
 
 
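Note the design choice in next() above: self.current is reset before StopIteration is raised, so an iterator obtained from zip() or minibatches() rewinds itself and can be swept again (d, 'x' and 'y' are hypothetical):

    # Hypothetical: two epochs with the same FiniteDataSetIterator object.
    it = d.zip('x','y')
    for x, y in it:
        pass   # first pass
    for x, y in it:
        pass   # second pass works: the iterator rewound itself
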
 # we may want ArrayDataSet defined in another python file
 
 import numpy
 
 class ArrayDataSet(FiniteDataSet):
     """
-    An ArrayDataSet behaves like a numpy array but adds the notion of fields
-    and minibatch_size from DataSet. It is a fixed-length and fixed-width dataset
+    An ArrayDataSet behaves like a numpy array but adds the notion of named fields
+    from DataSet (and the ability to view multiple field values as an 'Example').
+    It is a fixed-length and fixed-width dataset
     in which each element is a numpy array or a number, hence the whole
     dataset corresponds to a numpy array. Fields
     must correspond to a slice of array columns. If the dataset has fields,
     each 'example' is just a one-row ArrayDataSet, otherwise it is a numpy array.
-    Any dataset can also be converted to a numpy array (losing the notion of fields
-    and of minibatch_size) by the numpy.array(dataset) call.
+    Any dataset can also be converted to a numpy array (losing the notion of fields)
+    by the numpy.array(dataset) call.
     """
 
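A construction sketch for the second path described below, from a raw numpy array (the shape and field names are invented):

    # Hypothetical: 100 examples; columns 0-2 are 'input', column 3 is 'target'.
    data = numpy.zeros((100,4))
    d = ArrayDataSet(data=data,
                     fields={'input':slice(0,3),'target':slice(3,4)})
    print d.input.shape    # (100, 3), via __getattr__ below
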
-    def __init__(self,dataset=None,data=None,fields={},minibatch_size=1):
+    def __init__(self,dataset=None,data=None,fields={}):
         """
         There are two ways to construct an ArrayDataSet: (1) from an
         existing dataset (which may result in a copy of the data in a numpy array),
         or (2) from a numpy.array (the data argument), along with an optional description
         of the fields (dictionary of column slices indexed by field names).
         """
-        FiniteDataSet.__init__(self,minibatch_size)
         if dataset!=None:
             assert data==None and fields=={}
-            # convert dataset to an ArrayDataSet
+            # Make ONE big minibatch with all the examples, to separate the fields.
+            n_examples=len(dataset)
+            batch = dataset.minibatches(n_examples).next()
+            # Each field of the underlying dataset must be convertible to a numpy array of the same type
+            # (currently just double, but should use the smallest compatible dtype).
+            n_fields = len(batch)
+            fieldnames = batch.fields.keys()
+            total_width = 0
+            type = None
+            for i in xrange(n_fields):
+                field = numpy.array(batch[i])
+                assert field.shape[0]==n_examples
+                width = field.shape[1]
+                start=total_width
+                total_width += width
+                fields[fieldnames[i]]=slice(start,total_width,1)
+            # many complicated things remain to be done:
+            #  - find common dtype
+            #  - decide what to do with extra dimensions if not the same in all fields
+            #  - try to see if we can avoid the copy
             raise NotImplementedError
         if data!=None:
             assert dataset==None
             self.data=data
             self.fields=fields
@@ -140,17 +238,16 @@
                 step=1
             if not fieldslice.start or not fieldslice.step:
                 fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step)
             # and coherent with the data array
             assert fieldslice.start>=0 and fieldslice.stop<=self.width
-        assert minibatch_size<=len(self.data)
 
     def __getattr__(self,fieldname):
         """
         Return a numpy array with the content associated with the given field name.
         If this is a one-example dataset, then a row, i.e., numpy array (of one less dimension
-        than the dataset.data) is returned.
+        than the dataset itself) is returned.
         """
         if len(self.data)==1:
             return self.data[0,self.fields[fieldname]]
         return self.data[:,self.fields[fieldname]]
 
@@ -162,29 +259,28 @@
             min_col=min(min_col,field_slice.start)
             max_col=max(max_col,field_slice.stop)
         new_fields={}
         for field in self.fields.items():
             new_fields[field[0]]=slice(field[1].start-min_col,field[1].stop-min_col,field[1].step)
-        return ArrayDataSet(data=self.data[:,min_col:max_col],fields=new_fields,minibatch_size=self.minibatch_size)
+        return ArrayDataSet(data=self.data[:,min_col:max_col],fields=new_fields)
 
     def fieldNames(self):
         """Return the list of field names that are supported by getattr and getFields."""
         return self.fields.keys()
 
     def __len__(self):
         """len(dataset) returns the number of examples in the dataset."""
         return len(self.data)
 
     def __getitem__(self,i):
         """
-        dataset[i] returns the (i+1)-th example of the dataset. If the dataset has fields
-        then a one-example dataset is returned (to be able to handle example.field accesses).
+        dataset[i] returns the (i+1)-th Example of the dataset. If there are no fields
+        the result is just a numpy array (for the i-th row of the dataset data matrix).
        """
         if self.fields:
-            if isinstance(i,slice):
-                return ArrayDataSet(data=data[slice],fields=self.fields)
-            return ArrayDataSet(data=self.data[i:i+1],fields=self.fields)
+            fieldnames,fieldslices=zip(*self.fields.items())
+            return Example(fieldnames,[self.data[i,fieldslice] for fieldslice in fieldslices])
         else:
             return self.data[i]
 
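Continuing the hypothetical d above, indexing a dataset that has fields yields an Example, so values are reachable by name:

    # Hypothetical: one example, with per-field access by name.
    ex = d[0]
    print ex.input, ex['target']
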
     def __getslice__(self,*slice_args):
         """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1."""
@@ -230,5 +326,7 @@
             slice_width=(field_slice.stop-field_slice.start)/field_slice.step
             # copy the field here
             result[:,slice(c,c+slice_width)]=self.data[:,field_slice]
             c+=slice_width
         return result
+
+
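
Finally, per the ArrayDataSet docstring, a dataset can be flattened back into a plain array; a sketch that assumes the method ending above implements the numpy conversion (e.g. __array__):

    # Hypothetical: convert the whole dataset to one numpy matrix.
    a = numpy.array(d)         # field structure is lost in the copy
    assert a.shape[0]==len(d)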