comparison dataset.py @ 17:759d17112b23
more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
author | bergstrj@iro.umontreal.ca
date | Wed, 26 Mar 2008 21:05:14 -0400
parents | 813723310d75 ff4e551490f1
children | 57f4015e2e09
16:813723310d75 (old) | 17:759d17112b23 (new) |
1 | 1 |
2 class Example(object): | 2 from lookup_list import LookupList |
3 """ | 3 Example = LookupList |
4 An example is something that is like a tuple but whose elements can be named, so that the | 4 |
5 following syntactic constructions work as one would expect: | 5 class AbstractFunction (Exception): """Derived class must override this function""" |
6 example.x = [1, 2, 3] # set a field | 6 |
7 x, y, z = example | |
8 x = example[0] | |
9 x = example["x"] | |
10 """ | |
11 def __init__(self,names,values): | |
12 assert len(values)==len(names) | |
13 self.__dict__['values']=values | |
14 self.__dict__['fields']={} | |
15 for i in xrange(len(values)): | |
16 self.fields[names[i]]=i | |
17 | |
18 def __getitem__(self,i): | |
19 if isinstance(i,int): | |
20 return self.values[i] | |
21 else: | |
22 return self.values[self.fields[i]] | |
23 | |
24 def __setitem__(self,i,value): | |
25 if isinstance(i,int): | |
26 self.values[i]=value | |
27 else: | |
28 self.values[self.fields[i]]=value | |
29 | |
30 def __getattr__(self,name): | |
31 return self.values[self.fields[name]] | |
32 | |
33 def __setattr__(self,name,value): | |
34 self.values[self.fields[name]]=value | |
35 | |
36 def __len__(self): | |
37 return len(self.values) | |
38 | |
39 | |
40 class DataSet(object): | 7 class DataSet(object): |
41 """A virtual base class for datasets. | 8 """A virtual base class for datasets. |
42 | 9 |
43 A DataSet is a generator of iterators; these iterators can run through the | 10 A DataSet is a generator of iterators; these iterators can run through the |
44 examples in a variety of ways. A DataSet need not necessarily have a finite | 11 examples in a variety of ways. A DataSet need not necessarily have a finite |
71 all the fields of DataSet self. Every field of "i" will give access to | 38 all the fields of DataSet self. Every field of "i" will give access to |
72 the field of a single example. Fields should be accessible via | 39 the field of a single example. Fields should be accessible via |
73 i[identifier], but the derived class is free to accept any type of | 40 i[identifier], but the derived class is free to accept any type of |
74 identifier, and add extra functionality to the iterator. | 41 identifier, and add extra functionality to the iterator. |
75 """ | 42 """ |
76 raise NotImplementedError | 43 for i in self.minibatches( minibatch_size = 1): |
44 yield Example(i.keys(), [v[0] for v in i.values()]) | |
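The new default iterator is a thin wrapper around minibatches() with minibatch_size=1. A minimal usage sketch, assuming a concrete DataSet subclass instance d with hypothetical fields 'x' and 'y', and assuming LookupList supports indexing by field name as the old Example class did:

    for example in d:
        x = example['x']  # one value per field; the batch dimension is stripped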
77 | 45 |
78 def zip(self, *fieldnames): | 46 def zip(self, *fieldnames): |
79 """ | 47 """ |
80 Supports two forms of syntax: | 48 Supports two forms of syntax: |
81 | 49 |
91 f1, f2, and f3 fields of a single example on each loop iteration. | 59 f1, f2, and f3 fields of a single example on each loop iteration. |
92 | 60 |
93 The derived class may accept fieldname arguments of any type. | 61 The derived class may accept fieldname arguments of any type. |
94 | 62 |
95 """ | 63 """ |
96 raise NotImplementedError | 64 for i in self.minibatches(fieldnames, minibatch_size = 1): |
97 | 65 yield [f[0] for f in i] |
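A usage sketch of zip() under the same assumptions (hypothetical dataset d with fields 'x' and 'y'); each iteration yields a list with one value per requested field, so it unpacks directly:

    for x, y in d.zip('x', 'y'):
        pass  # x and y are the two field values of a single example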
98 def minibatches(self,minibatch_size,*fieldnames): | 66 |
67 minibatches_fieldnames = None | |
68 minibatches_minibatch_size = 1 | |
69 minibatches_n_batches = None | |
70 def minibatches(self, | |
71 fieldnames = minibatches_fieldnames, | |
72 minibatch_size = minibatches_minibatch_size, | |
73 n_batches = minibatches_n_batches): | |
99 """ | 74 """ |
100 Supports two forms of syntax: | 75 Supports two forms of syntax: |
101 | 76 |
102 for i in dataset.zip(f1, f2, f3): ... | 77 for i in dataset.minibatches([f1, f2, f3],**kwargs): ... |
103 | 78 |
104 for i1, i2, i3 in dataset.zip(f1, f2, f3): ... | 79 for i1, i2, i3 in dataset.minibatches([f1, f2, f3],**kwargs): ... |
105 | 80 |
106 Using the first syntax, "i" will be an indexable object, such as a list, | 81 Using the first syntax, "i" will be an indexable object, such as a list, |
107 tuple, or Example instance, such that on every iteration, i[0] is the f1 | 82 tuple, or Example instance, such that on every iteration, i[0] is a |
108 field of the current example, i[1] is the f2 field, and so on. | 83 list-like container of the f1 field of a batch current examples, i[1] is |
109 | 84 a list-like container of the f2 field, etc. |
110 Using the second syntax, i1, i2, i3 will contain the contents of the | 85 |
111 f1, f2, and f3 fields of a single example on each loop iteration. | 86 Using the second syntax, i1, i2, i3 will be list-like containers of the |
112 | 87 f1, f2, and f3 fields of a batch of examples on each loop iteration. |
113 The derived class may accept fieldname arguments of any type. | 88 |
114 | 89 PARAMETERS |
115 Return an iterator, whose next() method returns the next example or the next | 90 - fieldnames (list of any type, default None): |
116 minibatch in the dataset. A minibatch (of length > 1) is also an example, but | 91 The loop variables i1, i2, i3 (in the example above) should contain the |
117 whose fields should be something one can iterate on again in order to obtain | 92 f1, f2, and f3 fields of the current batch of examples. If None, the |
118 the individual examples. | 93 derived class can choose a default, e.g. all fields. |
119 | 94 |
120 DataSet.zip returns an iterator over only the desired fields, and each field | 95 - minibatch_size (integer, default 1) |
121 of the iterator contains one example. | 96 On every iteration, the variables i1, i2, i3 will have |
122 | 97 exactly minibatch_size elements. e.g. len(i1) == minibatch_size |
123 Return an iterator which sees only the specified fields (each fieldname is a | 98 |
124 field key, typically a string). The value returned at each iteration | 99 - n_batches (integer, default None) |
125 is a tuple with one element per field. Hence it can be used like this: | 100 The iterator will loop exactly this many times, and then stop. If None, |
126 for f1, f2, f3 in dataset.zip('field1','field2','field3'): | 101 the derived class can choose a default. If (-1), then the returned |
127 ... use f1, f2, and f3 | 102 iterator should support looping indefinitely. |
128 If one iterates through minibatches of examples (with the minibatches() method | 103 |
129 or with the minibatch_size argument of the zip() method), then the fields | 104 Note: A list-like container is something like a tuple, list, numpy.ndarray or |
130 returned by the iterator's next method should be iterators over the | 105 any other object that supports integer indexing and slicing. |
131 individual values within the minibatch (typically these will be arrays | 106 |
132 with minibatch_size rows). | 107 """ |
133 Similar to zip but iterates over minibatches. | 108 raise AbstractFunction() |
134 Return a minibatch iterator, whose next() method returns an 'example' | |
135 whose fields are iterable objects (which can iterate over the individual | |
136 values of that field in the minibatch). | |
137 """ | |
138 raise NotImplementedError | |
139 | 109 |
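A usage sketch of both calling forms documented above, with a hypothetical dataset d and field names:

    for batch in d.minibatches(['x', 'y'], minibatch_size=10, n_batches=5):
        xs = batch[0]  # list-like container, len(xs) == 10
    for xs, ys in d.minibatches(['x', 'y'], minibatch_size=10, n_batches=5):
        pass           # xs and ys each hold ten field values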
140 def fieldNames(self): | 110 def fieldNames(self): |
111 #Yoshua- | |
112 # This list may not be finite; what would make sense in the use you have | |
113 # in mind? | |
114 # -JB | |
141 """Return the list of field names in the examples of this dataset.""" | 115 """Return the list of field names in the examples of this dataset.""" |
142 raise NotImplementedError | 116 raise AbstractFunction() |
143 | 117 |
144 def rename(*new_field_specifications): | 118 def rename(*new_field_specifications): |
119 #Yoshua- | |
120 # Do you mean for this to be a virtual method? | |
121 # Wouldn't this functionality be easier to provide via a | |
122 # RenamingDataSet, such as the one I've written below? | |
123 # -JB | |
145 """ | 124 """ |
146 Return a new dataset that maps old fields (of self) to new fields (of the returned | 125 Return a new dataset that maps old fields (of self) to new fields (of the returned |
147 dataset). The minimal syntax that should be supported is the following: | 126 dataset). The minimal syntax that should be supported is the following: |
148 new_field_specifications = [new_field_spec1, new_field_spec2, ...] | 127 new_field_specifications = [new_field_spec1, new_field_spec2, ...] |
149 new_field_spec = ([old_field1, old_field2, ...], new_field) | 128 new_field_spec = ([old_field1, old_field2, ...], new_field) |
150 In general both old_field and new_field should be strings, but some datasets may also | 129 In general both old_field and new_field should be strings, but some datasets may also |
151 support additional indexing schemes within each field (e.g. column slice | 130 support additional indexing schemes within each field (e.g. column slice |
152 of a matrix-like field). | 131 of a matrix-like field). |
153 """ | 132 """ |
154 raise NotImplementedError | 133 raise AbstractFunction() |
134 | |
135 class RenamingDataSet(DataSet): | |
136 """A DataSet that wraps another one, and makes it look like the field names | |
137 are different | |
138 | |
139 Renaming is done by a dictionary that maps new names to the old ones used in | |
140 self.src. | |
141 """ | |
142 def __init__(self, src, rename_dct): | |
143 DataSet.__init__(self) | |
144 self.src = src | |
144 self.rename_dct = dict(rename_dct) | |
146 | |
147 def minibatches(self, | |
148 fieldnames = DataSet.minibatches_fieldnames, | |
149 minibatch_size = DataSet.minibatches_minibatch_size, | |
150 n_batches = DataSet.minibatches_n_batches): | |
151 dct = self.rename_dct | |
152 new_fieldnames = [dct.get(f, f) for f in fieldnames] | |
153 return self.src.minibatches(new_fieldnames, minibatch_size, n_batches) | |
154 | |
155 def fieldNames(self): | |
156 return [self.rename_dct.get(f, f) for f in self.src.fieldNames()] | |
157 | |
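A usage sketch of RenamingDataSet, assuming a source dataset src with a field named 'input'; the dictionary maps each new name to the old one:

    renamed = RenamingDataSet(src, {'x': 'input'})
    for batch in renamed.minibatches(['x'], minibatch_size=2):
        pass  # fetched internally as the 'input' field of src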
155 | 158 |
156 class FiniteDataSet(DataSet): | 159 class FiniteDataSet(DataSet): |
157 """ | 160 """ |
158 Virtual interface, a subclass of DataSet for datasets which have a finite, known length. | 161 Virtual interface, a subclass of DataSet for datasets which have a finite, known length. |
159 Examples are indexed by an integer between 0 and self.length()-1, | 162 Examples are indexed by an integer between 0 and self.length()-1, |
162 in an efficient random access way. Users are encouraged to expect only the generic dataset | 165 in an efficient random access way. Users are encouraged to expect only the generic dataset |
163 interface in general. A FiniteDataSet is mainly useful when one has to obtain | 166 interface in general. A FiniteDataSet is mainly useful when one has to obtain |
164 a subset of examples (e.g. for splitting a dataset into training and test sets). | 167 a subset of examples (e.g. for splitting a dataset into training and test sets). |
165 """ | 168 """ |
166 | 169 |
170 class FiniteDataSetIterator(object): | |
171 """ | |
172 If the fieldnames list is empty, it means that we want to see ALL the fields. | |
173 """ | |
174 def __init__(self,dataset,minibatch_size=1,fieldnames=[]): | |
175 self.dataset=dataset | |
176 self.minibatch_size=minibatch_size | |
177 assert minibatch_size>=1 and minibatch_size<=len(dataset) | |
178 self.current = -self.minibatch_size | |
179 self.fieldnames = fieldnames | |
180 | |
181 def __iter__(self): | |
182 return self | |
183 | |
184 def next(self): | |
185 self.current+=self.minibatch_size | |
186 if self.current>=len(self.dataset): | |
187 self.current=-self.minibatch_size | |
188 raise StopIteration | |
189 if self.minibatch_size==1: | |
190 complete_example=self.dataset[self.current] | |
191 else: | |
192 complete_example=self.dataset[self.current:self.current+self.minibatch_size] | |
193 if self.fieldnames: | |
194 return Example(self.fieldnames,list(complete_example)) | |
195 else: | |
196 return complete_example | |
197 | |
167 def __init__(self): | 198 def __init__(self): |
168 pass | 199 pass |
169 | 200 |
170 def __iter__(self): | 201 def minibatches(self, |
171 return FiniteDataSetIterator(self) | 202 fieldnames = DataSet.minibatches_fieldnames, |
172 | 203 minibatch_size = DataSet.minibatches_minibatch_size, |
173 def zip(self,*fieldnames): | 204 n_batches = DataSet.minibatches_n_batches): |
174 return FiniteDataSetIterator(self,1,fieldnames) | 205 """ |
175 | 206 If fieldnames is None, it means that we want to see ALL the fields. |
176 def minibatches(self,minibatch_size,*fieldnames): | 207 |
177 return FiniteDataSetIterator(self,minibatch_size,fieldnames) | 208 If n_batches is None, we want to see all the examples possible |
209 for the given minibatch_size. | |
210 """ | |
211 # substitute the defaults: | |
212 if fieldnames is None: fieldnames = self.fieldNames() | |
213 if n_batches is None: n_batches = len(self) / minibatch_size | |
214 return self.__class__.Iterator(self, fieldnames, minibatch_size, n_batches) | |
178 | 215 |
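A worked instance of the default substitution, with hypothetical sizes:

    # with len(self) == 100 and minibatch_size == 10:
    # n_batches = 100 / 10 = 10 (Python 2 integer division), one full epoch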
179 def __getattr__(self,fieldname): | 216 def __getattr__(self,fieldname): |
180 """Return an that can iterate over the values of the field in this dataset.""" | 217 """Return an that can iterate over the values of the field in this dataset.""" |
181 return self(fieldname) | 218 return self(fieldname) |
182 | 219 |
184 """Return a sub-dataset containing only the given fieldnames as fields. | 221 """Return a sub-dataset containing only the given fieldnames as fields. |
185 | 222 |
186 The return value's default iterator will iterate only over the given | 223 The return value's default iterator will iterate only over the given |
187 fields. | 224 fields. |
188 """ | 225 """ |
189 raise NotImplementedError | 226 raise AbstractFunction() |
190 | 227 |
191 def __len__(self): | 228 def __len__(self): |
192 """len(dataset) returns the number of examples in the dataset.""" | 229 """len(dataset) returns the number of examples in the dataset.""" |
193 raise NotImplementedError | 230 raise AbstractFunction() |
194 | 231 |
195 def __getitem__(self,i): | 232 def __getitem__(self,i): |
196 """dataset[i] returns the (i+1)-th example of the dataset.""" | 233 """dataset[i] returns the (i+1)-th example of the dataset.""" |
197 raise NotImplementedError | 234 raise AbstractFunction() |
198 | 235 |
199 def __getslice__(self,*slice_args): | 236 def __getslice__(self,*slice_args): |
200 """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.""" | 237 """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.""" |
201 raise NotImplementedError | 238 raise AbstractFunction() |
202 | |
203 class FiniteDataSetIterator(object): | |
204 """ | |
205 If the fieldnames list is empty, it means that we want to see ALL the fields. | |
206 """ | |
207 def __init__(self,dataset,minibatch_size=1,fieldnames=[]): | |
208 self.dataset=dataset | |
209 self.minibatch_size=minibatch_size | |
210 assert minibatch_size>=1 and minibatch_size<=len(dataset) | |
211 self.current = -self.minibatch_size | |
212 self.fieldnames = fieldnames | |
213 | |
214 def __iter__(self): | |
215 return self | |
216 | |
217 def next(self): | |
218 self.current+=self.minibatch_size | |
219 if self.current>=len(self.dataset): | |
220 self.current=-self.minibatch_size | |
221 raise StopIteration | |
222 if self.minibatch_size==1: | |
223 complete_example=self.dataset[self.current] | |
224 else: | |
225 complete_example=self.dataset[self.current:self.current+self.minibatch_size] | |
226 if self.fieldnames: | |
227 return Example(self.fieldnames,list(complete_example)) | |
228 else: | |
229 return complete_example | |
230 | |
231 | 239 |
232 # we may want ArrayDataSet defined in another python file | 240 # we may want ArrayDataSet defined in another python file |
233 | 241 |
234 import numpy | 242 import numpy |
243 | |
244 def as_array_dataset(dataset): | |
245 # Generally datasets can be efficient by making data fields overlap, but | |
246 # this function doesn't know which fields overlap. So, it should check if | |
247 # dataset supports an as_array_dataset member function, and return that if | |
248 # possible. | |
249 if hasattr(dataset, 'as_array_dataset'): | |
250 return dataset.as_array_dataset() | |
251 | |
252 raise NotImplementedError() | |
253 | |
254 # Make ONE big minibatch with all the examples, to separate the fields. | |
255 n_examples = len(dataset) | |
256 batch = dataset.minibatches( minibatch_size = len(dataset)).next() | |
257 | |
258 # Each field of the underlying dataset must be convertible to a numpy array of the same type | |
259 # currently just double, but should use the smallest compatible dtype | |
260 n_fields = len(batch) | |
261 fieldnames = batch.fields.keys() | |
262 total_width = 0 | |
263 type = None | |
264 fields = LookupList() | |
265 for i in xrange(n_fields): | |
266 field = numpy.array(batch[i]) | |
267 assert field.shape[0]==n_examples | |
268 width = field.shape[1] | |
269 start=total_width | |
270 total_width += width | |
271 fields[fieldnames[i]]=slice(start,total_width,1) | |
272 # many complicated things remain to be done: | |
273 # - find common dtype | |
274 # - decide what to do with extra dimensions if not the same in all fields | |
275 # - try to see if we can avoid the copy? | |
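The loop above (unreachable as written, since it follows the raise) sketches how fields would be packed side by side into one matrix. A self-contained illustration of the intended column layout, with hypothetical field names and widths:

    total_width = 0
    fields = {}
    for name, width in [('x', 3), ('y', 2)]:
        fields[name] = slice(total_width, total_width + width, 1)
        total_width += width
    # fields == {'x': slice(0, 3, 1), 'y': slice(3, 5, 1)};
    # the packed array would have n_examples rows and 5 columns.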
235 | 276 |
236 class ArrayDataSet(FiniteDataSet): | 277 class ArrayDataSet(FiniteDataSet): |
237 """ | 278 """ |
238 An ArrayDataSet behaves like a numpy array but adds the notion of named fields | 279 An ArrayDataSet behaves like a numpy array but adds the notion of named fields |
239 from DataSet (and the ability to view multiple field values as an 'Example'). | 280 from DataSet (and the ability to view multiple field values as an 'Example'). |
244 each 'example' is just a one-row ArrayDataSet, otherwise it is a numpy array. | 285 each 'example' is just a one-row ArrayDataSet, otherwise it is a numpy array. |
245 Any dataset can also be converted to a numpy array (losing the notion of fields) | 286 Any dataset can also be converted to a numpy array (losing the notion of fields) |
246 by the numpy.array(dataset) call. | 287 by the numpy.array(dataset) call. |
247 """ | 288 """ |
248 | 289 |
249 def __init__(self,dataset=None,data=None,fields={}): | 290 class Iterator(object): |
291 """An iterator over a finite dataset that implements wrap-around""" | |
292 def __init__(self, dataset, fieldnames, minibatch_size, next_max): | |
293 self.dataset=dataset | |
294 self.fieldnames = fieldnames | |
295 self.minibatch_size=minibatch_size | |
296 self.next_count = 0 | |
297 self.next_max = next_max | |
298 self.current = -self.minibatch_size | |
299 assert minibatch_size > 0 | |
300 if minibatch_size >= len(dataset): | |
301 raise NotImplementedError() | |
302 | |
303 def __iter__(self): | |
304 #Why do we do this? -JB | |
305 return self | |
306 | |
307 @staticmethod | |
308 def matcat(a, b): | |
309 a0, a1 = a.shape | |
310 b0, b1 = b.shape | |
311 assert a1 == b1 | |
312 assert a.dtype == b.dtype | |
313 rval = numpy.empty( (a0 + b0, a1), dtype=a.dtype) | |
314 rval[:a0,:] = a | |
315 rval[a0:,:] = b | |
316 return rval | |
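matcat stacks two matrices with matching column counts and dtypes, used below when a minibatch wraps past the end of the data. A quick sketch:

    a = numpy.zeros((2, 3))
    b = numpy.ones((1, 3))
    ab = ArrayDataSet.Iterator.matcat(a, b)  # shape (3, 3): rows of a, then rows of b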
317 | |
318 def next(self): | |
319 | |
320 #check for end-of-loop | |
321 if self.next_count == self.next_max: | |
322 raise StopIteration | |
323 self.next_count += 1 | |
324 | |
325 #determine the first and last elements of the slice we'll return | |
326 self.current += self.minibatch_size | |
327 if self.current >= len(self.dataset): | |
328 self.current -= len(self.dataset) | |
329 upper = self.current + self.minibatch_size | |
330 | |
331 if upper <= len(self.dataset): | |
332 #this is the easy case, we only need one slice | |
333 dataview = self.dataset.data[self.current:upper] | |
334 else: | |
335 # the minibatch wraps around the end of the dataset | |
336 dataview = self.dataset.data[self.current:] | |
337 upper -= len(self.dataset) | |
338 assert upper > 0 | |
339 dataview = self.matcat(dataview, self.dataset.data[:upper]) | |
340 | |
341 | |
342 rval = [dataview[:, self.dataset.fields[f]] for f in self.fieldnames] | |
343 | |
344 if self.fieldnames: | |
345 rval = Example(self.fieldnames, rval) | |
346 | |
347 return rval | |
348 | |
349 | |
350 def __init__(self, data, fields=None): | |
250 """ | 351 """ |
251 There are two ways to construct an ArrayDataSet: (1) from an | 352 There are two ways to construct an ArrayDataSet: (1) from an |
252 existing dataset (which may result in a copy of the data in a numpy array), | 353 existing dataset (which may result in a copy of the data in a numpy array), |
253 or (2) from a numpy.array (the data argument), along with an optional description | 354 or (2) from a numpy.array (the data argument), along with an optional description |
254 of the fields (dictionary of column slices indexed by field names). | 355 of the fields (a LookupList of column slices indexed by field names). |
255 """ | 356 """ |
256 if dataset!=None: | 357 self.data=data |
257 assert data==None and fields=={} | 358 self.fields=fields |
258 # Make ONE big minibatch with all the examples, to separate the fields. | 359 rows, cols = data.shape |
259 n_examples=len(dataset) | 360 |
260 batch = dataset.minibatches(n_examples).next() | 361 if fields: |
261 # Each field of the underlying dataset must be convertible to a numpy array of the same type | 362 for fieldname,fieldslice in fields.items(): |
262 # currently just double, but should use the smallest compatible dtype | |
263 n_fields = len(batch) | |
264 fieldnames = batch.fields.keys() | |
265 total_width = 0 | |
266 type = None | |
267 for i in xrange(n_fields): | |
268 field = array(batch[i]) | |
269 assert field.shape[0]==n_examples | |
270 width = field.shape[1] | |
271 start=total_width | |
272 total_width += width | |
273 fields[fieldnames[i]]=slice(start,total_width,1) | |
274 # many complicated things remain to be done: | |
275 # - find common dtype | |
276 # - decide what to do with extra dimensions if not the same in all fields | |
277 # - try to see if we can avoid the copy? | |
278 raise NotImplementedError | |
279 if data!=None: | |
280 assert dataset==None | |
281 self.data=data | |
282 self.fields=fields | |
283 self.width = data.shape[1] | |
284 for fieldname in fields: | |
285 fieldslice=fields[fieldname] | |
286 # make sure fieldslice.start and fieldslice.step are defined | 363 # make sure fieldslice.start and fieldslice.step are defined |
287 start=fieldslice.start | 364 start=fieldslice.start |
288 step=fieldslice.step | 365 step=fieldslice.step |
289 if not start: | 366 if not start: |
290 start=0 | 367 start=0 |
291 if not step: | 368 if not step: |
292 step=1 | 369 step=1 |
293 if not fieldslice.start or not fieldslice.step: | 370 if not fieldslice.start or not fieldslice.step: |
294 fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step) | 371 fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step) |
295 # and coherent with the data array | 372 # and coherent with the data array |
296 assert fieldslice.start>=0 and fieldslice.stop<=self.width | 373 assert fieldslice.start >= 0 and fieldslice.stop <= cols |
374 | |
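A construction sketch for the second form, using a plain dict of slices for brevity (hypothetical field names and widths):

    data = numpy.zeros((4, 5))  # 4 examples, 5 columns
    d = ArrayDataSet(data, fields={'x': slice(0, 3), 'y': slice(3, 5)})
    # __init__ normalizes the slices to slice(0, 3, 1) and slice(3, 5, 1)
    # and asserts that they fall within the 5 data columns.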
375 def minibatches(self, | |
376 fieldnames = DataSet.minibatches_fieldnames, | |
377 minibatch_size = DataSet.minibatches_minibatch_size, | |
378 n_batches = DataSet.minibatches_n_batches): | |
379 """ | |
380 If fieldnames is None, it means that we want to see ALL the fields. | |
381 | |
382 If n_batches is None, we want to see all the examples possible | |
383 for the given minibatch_size. | |
384 """ | |
385 # substitute the defaults: | |
386 if fieldnames is None: fieldnames = self.fieldNames() | |
387 if n_batches is None: n_batches = len(self) / minibatch_size | |
388 return ArrayDataSet.Iterator(self, fieldnames, minibatch_size, n_batches) | |
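Continuing the construction sketch above: asking for more batches than one pass provides exercises the wrap-around Iterator:

    for xs, ys in d.minibatches(['x', 'y'], minibatch_size=2, n_batches=6):
        assert xs.shape == (2, 3) and ys.shape == (2, 2)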
297 | 389 |
298 def __getattr__(self,fieldname): | 390 def __getattr__(self,fieldname): |
299 """ | 391 """ |
300 Return a numpy array with the content associated with the given field name. | 392 Return a numpy array with the content associated with the given field name. |
301 If this is a one-example dataset, then a row, i.e., numpy array (of one less dimension | 393 If this is a one-example dataset, then a row, i.e., numpy array (of one less dimension |
310 min_col=self.data.shape[1] | 402 min_col=self.data.shape[1] |
311 max_col=0 | 403 max_col=0 |
312 for field_slice in self.fields.values(): | 404 for field_slice in self.fields.values(): |
313 min_col=min(min_col,field_slice.start) | 405 min_col=min(min_col,field_slice.start) |
314 max_col=max(max_col,field_slice.stop) | 406 max_col=max(max_col,field_slice.stop) |
315 new_fields={} | 407 new_fields=LookupList() |
316 for field in self.fields: | 408 for fieldname,fieldslice in self.fields.items(): |
317 new_fields[field[0]]=slice(field[1].start-min_col,field[1].stop-min_col,field[1].step) | 409 new_fields[fieldname]=slice(fieldslice.start-min_col,fieldslice.stop-min_col,fieldslice.step) |
318 return ArrayDataSet(data=self.data[:,min_col:max_col],fields=new_fields) | 410 return ArrayDataSet(self.data[:,min_col:max_col],fields=new_fields) |
319 | 411 |
320 def fieldNames(self): | 412 def fieldNames(self): |
321 """Return the list of field names that are supported by getattr and getFields.""" | 413 """Return the list of field names that are supported by getattr and getFields.""" |
322 return self.fields.keys() | 414 return self.fields.keys() |
323 | 415 |
330 dataset[i] returns the (i+1)-th Example of the dataset. If there are no fields | 422 dataset[i] returns the (i+1)-th Example of the dataset. If there are no fields |
331 the result is just a numpy array (for the i-th row of the dataset data matrix). | 423 the result is just a numpy array (for the i-th row of the dataset data matrix). |
332 """ | 424 """ |
333 if self.fields: | 425 if self.fields: |
334 fieldnames,fieldslices=zip(*self.fields.items()) | 426 fieldnames,fieldslices=zip(*self.fields.items()) |
335 return Example(fieldnames,[self.data[i,fieldslice] for fieldslice in fieldslices]) | 427 return Example(self.fields.keys(),[self.data[i,fieldslice] for fieldslice in self.fields.values()]) |
336 else: | 428 else: |
337 return self.data[i] | 429 return self.data[i] |
338 | 430 |
339 def __getslice__(self,*slice_args): | 431 def __getslice__(self,*args): |
340 """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.""" | 432 """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.""" |
341 return ArrayDataSet(data=self.data[apply(slice,slice_args)],fields=self.fields) | 433 return ArrayDataSet(self.data.__getslice__(*args), fields=self.fields) |
342 | 434 |
343 def __array__(self): | 435 def __array__(self): |
344 """Return an view of this dataset which is an numpy.ndarray | 436 """Return an view of this dataset which is an numpy.ndarray |
345 | 437 |
346 Numpy uses this special function name to retrieve an ndarray view for | 438 Numpy uses this special function name to retrieve an ndarray view for |