comparison: dataset.py @ 16:813723310d75

description: commenting
author:      bergstrj@iro.umontreal.ca
date:        Wed, 26 Mar 2008 18:23:44 -0400
parents:     88168361a5ab be128b9127c8
children:    759d17112b23
comparing: 15:88168361a5ab (old) with 16:813723310d75 (new)

+
+class Example(object):
+    """
+    An example is something that is like a tuple but whose elements can be named, so that
+    the following syntactic constructions work as one would expect:
+        example.x = [1, 2, 3] # set a field
+        x, y, z = example
+        x = example[0]
+        x = example["x"]
+    """
+    def __init__(self,names,values):
+        assert len(values)==len(names)
+        self.__dict__['values']=values
+        self.__dict__['fields']={}
+        for i in xrange(len(values)):
+            self.fields[names[i]]=i
+
+    def __getitem__(self,i):
+        if isinstance(i,int):
+            return self.values[i]
+        else:
+            return self.values[self.fields[i]]
+
+    def __setitem__(self,i,value):
+        if isinstance(i,int):
+            self.values[i]=value
+        else:
+            self.values[self.fields[i]]=value
+
+    def __getattr__(self,name):
+        return self.values[self.fields[name]]
+
+    def __setattr__(self,name,value):
+        self.values[self.fields[name]]=value
+
+    def __len__(self):
+        return len(self.values)

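A minimal usage sketch for the Example class added above; the field names and
values here are illustrative, not part of the revision:

    e = Example(['x', 'y'], [1.5, 2.5])
    assert e[0] == e['x'] == e.x
    e.y = 3.0          # equivalent to e['y'] = 3.0 or e[1] = 3.0
    x, y = e           # tuple-like unpacking works through __getitem__/__len__
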
 class DataSet(object):
-    """
-    This is a virtual base class or interface for datasets.
-    A dataset is basically an iterator over examples. It does not necessarily
-    have a fixed length (this is useful for 'streams' which feed on-line learning).
-    Datasets with fixed and known length are FiniteDataSet, a subclass of DataSet.
-    Examples and datasets optionally have named fields.
-    One can obtain a sub-dataset by taking dataset.field or dataset(field1,field2,field3,...).
-    Fields are not mutually exclusive, i.e. two fields can overlap in their actual content.
-    The content of a field can be of any type, but often will be a numpy array.
-    The minibatch_size attribute, if different than 1, means that the iterator (next() method)
-    returns not a single example but an array of length minibatch_size, i.e., an indexable
-    object with minibatch_size examples in it.
-    """
-
-    def __init__(self,minibatch_size=1):
-        assert minibatch_size>0
-        self.minibatch_size=minibatch_size
-
+    """A virtual base class for datasets.
+
+    A DataSet is a generator of iterators; these iterators can run through the
+    examples in a variety of ways. A DataSet need not necessarily have a finite
+    or known length, so this class can be used to interface to a 'stream' which
+    feeds on-line learning.
+
+    To iterate over examples, there are several possibilities:
+    - for i in dataset.zip(field1, field2, field3, ...)
+    - for i in dataset.minibatches(N, field1, field2, ...)
+    - for i in dataset
+    Each of these is documented below.
+
+    Note: For a dataset of fixed and known length, which can implement item
+    random-access efficiently (e.g. indexing and slicing), and which can profit
+    from the FiniteDataSetIterator, consider using the base class FiniteDataSet.
+
+    Note: Fields are not mutually exclusive, i.e. two fields can overlap in their actual content.
+
+    Note: The content of a field can be of any type.
+    """
+
+    def __init__(self):
+        pass
+
     def __iter__(self):
-        """
-        Return an iterator, whose next() method returns the next example or the next
-        minibatch in the dataset. A minibatch (of length > 1) should be something one
-        can iterate on again in order to obtain the individual examples. If the dataset
-        has fields, then the example or the minibatch must have the same fields
-        (typically this is implemented by returning another smaller dataset, when
-        there are fields).
-        """
+        """Supports the syntax "for i in dataset: ..."
+
+        Using this syntax, "i" will be an Example instance (or equivalent) with
+        all the fields of DataSet self. Every field of "i" will give access to
+        the field of a single example. Fields should be accessible via
+        i[identifier], but the derived class is free to accept any type of
+        identifier, and add extra functionality to the iterator.
+        """
         raise NotImplementedError

-    def __getattr__(self,fieldname):
-        """Return a sub-dataset containing only the given fieldname as field."""
-        return self(fieldname)
-
-    def __call__(self,*fieldnames):
-        """Return a sub-dataset containing only the given fieldnames as fields."""
-        raise NotImplementedError
-
+    def zip(self, *fieldnames):
+        """
+        Supports two forms of syntax:
+
+            for i in dataset.zip(f1, f2, f3): ...
+
+            for i1, i2, i3 in dataset.zip(f1, f2, f3): ...
+
+        Using the first syntax, "i" will be an indexable object, such as a list,
+        tuple, or Example instance, such that on every iteration, i[0] is the f1
+        field of the current example, i[1] is the f2 field, and so on. Each
+        fieldname is a field key, typically a string.
+
+        Using the second syntax, i1, i2, i3 will contain the contents of the
+        f1, f2, and f3 fields of a single example on each loop iteration.
+
+        The derived class may accept fieldname arguments of any type.
+        """
+        raise NotImplementedError
+
+    def minibatches(self,minibatch_size,*fieldnames):
+        """
+        Supports two forms of syntax:
+
+            for i in dataset.minibatches(N, f1, f2, f3): ...
+
+            for i1, i2, i3 in dataset.minibatches(N, f1, f2, f3): ...
+
+        Using the first syntax, "i" will be an indexable object, such as a list,
+        tuple, or Example instance, such that on every iteration, i[0] is a
+        minibatch of the f1 field, i[1] a minibatch of the f2 field, and so on.
+
+        Using the second syntax, i1, i2, i3 will contain minibatches of the
+        f1, f2, and f3 fields of N examples on each loop iteration.
+
+        The derived class may accept fieldname arguments of any type.
+
+        Similar to zip, but iterating over minibatches: a minibatch (of length
+        > 1) is also an example, but its fields are iterable objects, each of
+        which can be iterated over again to obtain the individual values within
+        the minibatch (typically these will be arrays with minibatch_size rows).
+        """
+        raise NotImplementedError
+
     def fieldNames(self):
-        """Return the list of field names that are supported by getattr and getFields."""
+        """Return the list of field names in the examples of this dataset."""
         raise NotImplementedError

+    def rename(self,*new_field_specifications):
+        """
+        Return a new dataset that maps old fields (of self) to new fields (of the returned
+        dataset). The minimal syntax that should be supported is the following:
+            new_field_specifications = [new_field_spec1, new_field_spec2, ...]
+            new_field_spec = ([old_field1, old_field2, ...], new_field)
+        In general both old_field and new_field should be strings, but some datasets may also
+        support additional indexing schemes within each field (e.g. column slice
+        of a matrix-like field).
+        """
+        raise NotImplementedError

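A hedged sketch of the access styles introduced on DataSet above (zip,
minibatches, rename); here 'dataset' stands for some hypothetical concrete
subclass instance with fields 'x' and 'y', and all names are illustrative:

    for x, y in dataset.zip('x', 'y'):                 # one example at a time
        print x, y
    for xs, ys in dataset.minibatches(16, 'x', 'y'):   # 16 examples at a time
        print len(xs), len(ys)
    renamed = dataset.rename((['x'], 'input'), (['y'], 'target'))
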
 class FiniteDataSet(DataSet):
     """
     Virtual interface, a subclass of DataSet for datasets which have a finite, known length.
     Examples are indexed by an integer between 0 and len(self)-1,
-    and a subdataset can be obtained by slicing.
+    and a subdataset can be obtained by slicing. This is not appropriate in
+    general, but only for datasets that support efficient random access to both
+    rows AND fields; users are encouraged to expect only the generic DataSet
+    interface in general. A FiniteDataSet is mainly useful when one has to obtain
+    a subset of examples (e.g. for splitting a dataset into training and test sets).
     """

-    def __init__(self,minibatch_size):
-        DataSet.__init__(self,minibatch_size)
+    def __init__(self):
+        pass

     def __iter__(self):
         return FiniteDataSetIterator(self)

+    def zip(self,*fieldnames):
+        return FiniteDataSetIterator(self,1,fieldnames)
+
+    def minibatches(self,minibatch_size,*fieldnames):
+        return FiniteDataSetIterator(self,minibatch_size,fieldnames)
+
+    def __getattr__(self,fieldname):
+        """Return an object that can iterate over the values of the field in this dataset."""
+        return self(fieldname)
+
+    def __call__(self,*fieldnames):
+        """Return a sub-dataset containing only the given fieldnames as fields.
+
+        The return value's default iterator will iterate only over the given
+        fields.
+        """
+        raise NotImplementedError
+
     def __len__(self):
         """len(dataset) returns the number of examples in the dataset."""
         raise NotImplementedError

     def __getitem__(self,i):
[...]
     def __getslice__(self,*slice_args):
         """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1."""
         raise NotImplementedError

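Since the FiniteDataSet docstring above names train/test splitting as the main
use case, here is a minimal split sketch; 'd' stands for a hypothetical
concrete FiniteDataSet of 100 examples:

    n_train = 80
    train_set = d[0:n_train]          # __getslice__ returns a sub-dataset
    test_set = d[n_train:len(d)]
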
 class FiniteDataSetIterator(object):
-    def __init__(self,dataset):
+    """
+    If the fieldnames list is empty, it means that we want to see ALL the fields.
+    """
+    def __init__(self,dataset,minibatch_size=1,fieldnames=[]):
         self.dataset=dataset
-        self.current = -self.dataset.minibatch_size
+        self.minibatch_size=minibatch_size
+        assert minibatch_size>=1 and minibatch_size<=len(dataset)
+        self.current = -self.minibatch_size
+        self.fieldnames = fieldnames
+
+    def __iter__(self):
+        return self

     def next(self):
-        """
-        Return the next example(s) in the dataset. If self.dataset.minibatch_size>1 return that
-        many examples. If the dataset has fields, the example or the minibatch of examples
-        is just a minibatch_size-rows ArrayDataSet (so that the fields can be accessed),
-        but that resulting mini-dataset has a minibatch_size of 1, so that one can iterate
-        example-wise on it. On the other hand, if the dataset has no fields (e.g. because
-        it is already the field of a bigger dataset), then the returned example or minibatch
-        may be any indexable object, such as a numpy array. Following the array semantics of indexing
-        and slicing, if the minibatch_size is 1 (and there are no fields), then the result is an array
-        with one less dimension (e.g., a vector, if the dataset is a matrix), corresponding
-        to a row. Again, if the minibatch_size is >1, one can iterate on the result to
-        obtain individual examples (as rows).
-        """
-        self.current+=self.dataset.minibatch_size
+        self.current+=self.minibatch_size
         if self.current>=len(self.dataset):
-            self.current=-self.dataset.minibatch_size
+            self.current=-self.minibatch_size
             raise StopIteration
-        if self.dataset.minibatch_size==1:
-            return self.dataset[self.current]
+        if self.minibatch_size==1:
+            complete_example=self.dataset[self.current]
         else:
-            return self.dataset[self.current:self.current+self.dataset.minibatch_size]
+            complete_example=self.dataset[self.current:self.current+self.minibatch_size]
+        if self.fieldnames:
+            return Example(self.fieldnames,[getattr(complete_example,name) for name in self.fieldnames])
+        else:
+            return complete_example


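A behavioral sketch of the iterator above; 'd' is a hypothetical concrete
FiniteDataSet with fields 'x' and 'y', and this is what d.minibatches(2,'x','y')
hands back:

    it = FiniteDataSetIterator(d, 2, ('x', 'y'))
    for batch in it:               # works because __iter__ returns self
        print batch.x, batch.y     # each field holds the values of 2 examples
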
 # we may want ArrayDataSet defined in another python file

 import numpy

 class ArrayDataSet(FiniteDataSet):
     """
-    An ArrayDataSet behaves like a numpy array but adds the notion of fields
-    and minibatch_size from DataSet. It is a fixed-length and fixed-width dataset
+    An ArrayDataSet behaves like a numpy array but adds the notion of named fields
+    from DataSet (and the ability to view multiple field values as an 'Example').
+    It is a fixed-length and fixed-width dataset
     in which each element is a numpy array or a number, hence the whole
     dataset corresponds to a numpy array. Fields
     must correspond to a slice of array columns. If the dataset has fields,
-    each 'example' is just a one-row ArrayDataSet, otherwise it is a numpy array.
-    Any dataset can also be converted to a numpy array (losing the notion of fields
-    and of minibatch_size) by the numpy.array(dataset) call.
+    each 'example' is an Example instance, otherwise it is a numpy array row.
+    Any dataset can also be converted to a numpy array (losing the notion of
+    fields) by the numpy.array(dataset) call.
     """

-    def __init__(self,dataset=None,data=None,fields={},minibatch_size=1):
+    def __init__(self,dataset=None,data=None,fields={}):
         """
         There are two ways to construct an ArrayDataSet: (1) from an
         existing dataset (which may result in a copy of the data in a numpy array),
         or (2) from a numpy.array (the data argument), along with an optional description
         of the fields (dictionary of column slices indexed by field names).
         """
-        FiniteDataSet.__init__(self,minibatch_size)
         if dataset!=None:
             assert data==None and fields=={}
-            # convert dataset to an ArrayDataSet
+            # Make ONE big minibatch with all the examples, to separate the fields.
+            n_examples=len(dataset)
+            batch = dataset.minibatches(n_examples).next()
+            # Each field of the underlying dataset must be convertible to a numpy
+            # array of the same type (currently just double, but the smallest
+            # compatible dtype should be used).
+            n_fields = len(batch)
+            fieldnames = batch.fields.keys()
+            total_width = 0
+            type = None
+            for i in xrange(n_fields):
+                field = numpy.array(batch[i])
+                assert field.shape[0]==n_examples
+                width = field.shape[1]
+                start=total_width
+                total_width += width
+                fields[fieldnames[i]]=slice(start,total_width,1)
+            # many complicated things remain to be done:
+            #  - find the common dtype
+            #  - decide what to do with extra dimensions if not the same in all fields
+            #  - try to see if we can avoid the copy
             raise NotImplementedError
         if data!=None:
             assert dataset==None
             self.data=data
             self.fields=fields
[...]
                 step=1
             if not fieldslice.start or not fieldslice.step:
                 fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step)
             # and coherent with the data array
             assert fieldslice.start>=0 and fieldslice.stop<=self.width
-        assert minibatch_size<=len(self.data)

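A minimal construction sketch for the data= path of the constructor above; the
array contents and field names are illustrative:

    a = numpy.array([[1.0, 2.0, 3.0],
                     [4.0, 5.0, 6.0]])
    d = ArrayDataSet(data=a,
                     fields={'input': slice(0, 2, 1), 'target': slice(2, 3, 1)})
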
     def __getattr__(self,fieldname):
         """
         Return a numpy array with the content associated with the given field name.
         If this is a one-example dataset, then a row, i.e., a numpy array (of one less dimension
-        than the dataset.data) is returned.
+        than the dataset itself) is returned.
         """
         if len(self.data)==1:
             return self.data[0,self.fields[fieldname]]
         return self.data[:,self.fields[fieldname]]

[...]
             min_col=min(min_col,field_slice.start)
             max_col=max(max_col,field_slice.stop)
         new_fields={}
-        for fieldname,field_slice in self.fields.items():
-            new_fields[fieldname]=slice(field_slice.start-min_col,field_slice.stop-min_col,field_slice.step)
-        return ArrayDataSet(data=self.data[:,min_col:max_col],fields=new_fields,minibatch_size=self.minibatch_size)
+        for fieldname,field_slice in self.fields.items():
+            new_fields[fieldname]=slice(field_slice.start-min_col,field_slice.stop-min_col,field_slice.step)
+        return ArrayDataSet(data=self.data[:,min_col:max_col],fields=new_fields)

     def fieldNames(self):
         """Return the list of field names that are supported by getattr and getFields."""
         return self.fields.keys()

174 """len(dataset) returns the number of examples in the dataset.""" 325 """len(dataset) returns the number of examples in the dataset."""
175 return len(self.data) 326 return len(self.data)
176 327
     def __getitem__(self,i):
         """
-        dataset[i] returns the (i+1)-th example of the dataset. If the dataset has fields
-        then a one-example dataset is returned (to be able to handle example.field accesses).
+        dataset[i] returns the (i+1)-th Example of the dataset. If there are no fields,
+        the result is just a numpy array (the i-th row of the dataset's data matrix).
         """
         if self.fields:
-            if isinstance(i,slice):
-                return ArrayDataSet(data=data[slice],fields=self.fields)
-            return ArrayDataSet(data=self.data[i:i+1],fields=self.fields)
+            fieldnames,fieldslices=zip(*self.fields.items())
+            return Example(fieldnames,[self.data[i,fieldslice] for fieldslice in fieldslices])
         else:
             return self.data[i]

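Continuing the hypothetical 'd' from the construction sketch, field and example
access per __getattr__ and __getitem__ above:

    print d.input      # columns 0..1 of the whole data matrix
    e = d[0]           # an Example with fields 'input' and 'target'
    print e.target     # the 'target' values of the first example
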
     def __getslice__(self,*slice_args):
         """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1."""
[...]
             slice_width=(field_slice.stop-field_slice.start)/field_slice.step
             # copy the field here
             result[:,c:c+slice_width]=self.data[:,field_slice]
             c+=slice_width
         return result
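
The class docstring above promises numpy.array(dataset) conversion; assuming
the lines elided by the comparison view implement it (the copy loop above lays
the fields out side by side in 'result'), the hypothetical 'd' flattens back
to a plain array:

    flat = numpy.array(d)    # 2x3 array, fields 'input' and 'target' side by side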