comparison dataset.py @ 16:813723310d75
"commenting"

author   | bergstrj@iro.umontreal.ca
date     | Wed, 26 Mar 2008 18:23:44 -0400
parents  | 88168361a5ab be128b9127c8
children | 759d17112b23
15:88168361a5ab | 16:813723310d75
---|---
1 | |
2 class Example(object): | |
3 """ | |
4 An example is something that is like a tuple but whose elements can be named, so that | |
5 the following syntactic constructions work as one would expect: | |
6 example.x = [1, 2, 3] # set a field | |
7 x, y, z = example | |
8 x = example[0] | |
9 x = example["x"] | |
10 """ | |
11 def __init__(self,names,values): | |
12 assert len(values)==len(names) | |
13 self.__dict__['values']=values | |
14 self.__dict__['fields']={} | |
15 for i in xrange(len(values)): | |
16 self.fields[names[i]]=i | |
17 | |
18 def __getitem__(self,i): | |
19 if isinstance(i,int): | |
20 return self.values[i] | |
21 else: | |
22 return self.values[self.fields[i]] | |
23 | |
24 def __setitem__(self,i,value): | |
25 if isinstance(i,int): | |
26 self.values[i]=value | |
27 else: | |
28 self.values[self.fields[i]]=value | |
29 | |
30 def __getattr__(self,name): | |
31 return self.values[self.fields[name]] | |
32 | |
33 def __setattr__(self,name,value): | |
34 self.values[self.fields[name]]=value | |
35 | |
36 def __len__(self): | |
37 return len(self.values) | |
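
A minimal usage sketch of the Example class above (the field names and values
here are made up for illustration):

    e = Example(['x', 'y', 'z'], [[1, 2, 3], 4, 5])
    assert e.x == [1, 2, 3]        # attribute access via __getattr__
    assert e['y'] == 4             # lookup by field name via __getitem__
    assert e[2] == 5               # lookup by position via __getitem__
    x, y, z = e                    # unpacking works via __getitem__/__len__
    e.x = [6, 7, 8]                # assignment via __setattr__
    assert e[0] == [6, 7, 8]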
1 | 38 |
2 | 39 |
3 class DataSet(object): | 40 class DataSet(object): |
4 """ | 41 """A virtual base class for datasets. |
5 This is a virtual base class or interface for datasets. | 42 |
6 A dataset is basically an iterator over examples. It does not necessarily | 43 A DataSet is a generator of iterators; these iterators can run through the |
7 have a fixed length (this is useful for 'streams' which feed on-line learning). | 44 examples in a variety of ways. A DataSet need not necessarily have a finite |
8 Datasets with fixed and known length are FiniteDataSet, a subclass of DataSet. | 45 or known length, so this class can be used to interface to a 'stream' which |
9 Examples and datasets optionally have named fields. | 46 feeds on-line learning. |
10 One can obtain a sub-dataset by taking dataset.field or dataset(field1,field2,field3,...). | 47 |
11 Fields are not mutually exclusive, i.e. two fields can overlap in their actual content. | 48 To iterate over examples, there are several possibilities: |
12 The content of a field can be of any type, but often will be a numpy array. | 49 - for i in dataset.zip(field1, field2, field3, ...) |
13 The minibatch_size attribute, if different than 1, means that the iterator (next() method) | 50 - for i in dataset.minibatches(N, field1, field2, ...) |
14 returns not a single example but an array of length minibatch_size, i.e., an indexable | 51 - for i in dataset |
15 object with minibatch_size examples in it. | 52 Each of these is documented below. |
16 """ | 53 |
17 | 54 Note: For a dataset of fixed and known length, which can implement item |
18 def __init__(self,minibatch_size=1): | 55 random-access efficiently (e.g. indexing and slicing), and which can profit |
19 assert minibatch_size>0 | 56 from the FiniteDataSetIterator, consider using the base class FiniteDataSet. |
20 self.minibatch_size=minibatch_size | 57 |
21 | 58 Note: Fields are not mutually exclusive, i.e. two fields can overlap in their actual content. |
59 | |
60 Note: The content of a field can be of any type. | |
61 | |
62 """ | |
63 | |
64 def __init__(self): | |
65 pass | |
66 | |
22 def __iter__(self): | 67 def __iter__(self): |
23 """ | 68 """Supports the syntax "for i in dataset: ..." |
69 | |
70 Using this syntax, "i" will be an Example instance (or equivalent) with | |
71 all the fields of DataSet self. Every field of "i" will give access to | |
72 the corresponding field of a single example. Fields should be accessible via | |
73 i[identifier], but the derived class is free to accept any type of | |
74 identifier, and add extra functionality to the iterator. | |
75 """ | |
76 raise NotImplementedError | |
77 | |
78 def zip(self, *fieldnames): | |
79 """ | |
80 Supports two forms of syntax: | |
81 | |
82 for i in dataset.zip(f1, f2, f3): ... | |
83 | |
84 for i1, i2, i3 in dataset.zip(f1, f2, f3): ... | |
85 | |
86 Using the first syntax, "i" will be an indexable object, such as a list, | |
87 tuple, or Example instance, such that on every iteration, i[0] is the f1 | |
88 field of the current example, i[1] is the f2 field, and so on. | |
89 | |
90 Using the second syntax, i1, i2, i3 will contain the contents of the | |
91 f1, f2, and f3 fields of a single example on each loop iteration. | |
92 | |
93 The derived class may accept fieldname arguments of any type. | |
94 | |
95 """ | |
96 raise NotImplementedError | |
97 | |
98 def minibatches(self,minibatch_size,*fieldnames): | |
99 """ | |
100 Supports two forms of syntax: | |
101 | |
102 for i in dataset.minibatches(N, f1, f2, f3): ... | |
103 | |
104 for i1, i2, i3 in dataset.minibatches(N, f1, f2, f3): ... | |
105 | |
106 Using the first syntax, "i" will be an indexable object, such as a list, | |
107 tuple, or Example instance, such that on every iteration, i[0] holds the f1 | |
108 fields of a minibatch of N examples, i[1] the f2 fields, and so on. | |
109 | |
110 Using the second syntax, i1, i2, i3 will contain the contents of the | |
111 f1, f2, and f3 fields of a minibatch of N examples on each loop iteration. | |
112 | |
113 The derived class may accept fieldname arguments of any type. | |
114 | |
24 Return an iterator, whose next() method returns the next example or the next | 115 Return a minibatch iterator, whose next() method returns the next |
25 minibatch in the dataset. A minibatch (of length > 1) should be something one | 116 minibatch in the dataset. A minibatch (of length > 1) is also an example, |
26 can iterate on again in order to obtain the individual examples. If the dataset | 117 but one whose fields are iterable objects, which can be iterated on again |
27 has fields, then the example or the minibatch must have the same fields | 118 in order to obtain the individual examples' values for that field |
28 (typically this is implemented by returning another smaller dataset, when | 119 (typically these will be arrays with minibatch_size rows). |
29 there are fields). | 120 |
30 """ | 121 Whereas DataSet.zip returns an iterator over only the desired fields |
31 raise NotImplementedError | 122 with one example per iteration, the iterator returned by minibatches() |
32 | 123 groups minibatch_size examples together on each iteration. |
33 def __getattr__(self,fieldname): | 124 |
34 """Return a sub-dataset containing only the given fieldname as field.""" | 125 Like zip, this iterator sees only the specified fields (each fieldname |
35 return self(fieldname) | 126 is a field key, typically a string), and the value returned at each |
36 | 127 iteration is a tuple with one element per field. Hence it can be used |
37 def __call__(self,*fieldnames): | 128 like this: |
38 """Return a sub-dataset containing only the given fieldnames as fields.""" | 129 |
39 raise NotImplementedError | 130 for f1, f2, f3 in dataset.minibatches(N,'field1','field2','field3'): |
40 | 131 ... use f1, f2, and f3, each holding the values of N examples |
132 | |
133 If no fieldnames are given, the minibatches contain all the fields of | |
134 the dataset. | |
135 | |
136 | |
137 """ | |
138 raise NotImplementedError | |
139 | |
41 def fieldNames(self): | 140 def fieldNames(self): |
42 """Return the list of field names that are supported by getattr and getFields.""" | 141 """Return the list of field names in the examples of this dataset.""" |
142 raise NotImplementedError | |
143 | |
144 def rename(self,*new_field_specifications): | |
145 """ | |
146 Return a new dataset that maps old fields (of self) to new fields (of the returned | |
147 dataset). The minimal syntax that should be supported is the following: | |
148 new_field_specifications = [new_field_spec1, new_field_spec2, ...] | |
149 new_field_spec = ([old_field1, old_field2, ...], new_field) | |
150 In general both old_field and new_field should be strings, but some datasets may also | |
151 support additional indexing schemes within each field (e.g. column slice | |
152 of a matrix-like field). | |
153 """ | |
43 raise NotImplementedError | 154 raise NotImplementedError |
44 | 155 |
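
A sketch of the intended usage of this interface, assuming a concrete subclass
(such as the ArrayDataSet defined below) with hypothetical fields 'input' and
'target'; do_something and do_something_batched are placeholders:

    for example in dataset:                        # all fields, one example at a time
        do_something(example['input'], example['target'])
    for x, y in dataset.zip('input', 'target'):    # only the desired fields
        do_something(x, y)
    for xs, ys in dataset.minibatches(20, 'input', 'target'):
        do_something_batched(xs, ys)               # xs, ys hold 20 values each
    renamed = dataset.rename((['input'], 'x'), (['target'], 'y'))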
45 class FiniteDataSet(DataSet): | 156 class FiniteDataSet(DataSet): |
46 """ | 157 """ |
47 Virtual interface, a subclass of DataSet for datasets which have a finite, known length. | 158 Virtual interface, a subclass of DataSet for datasets which have a finite, known length. |
48 Examples are indexed by an integer between 0 and self.length()-1, | 159 Examples are indexed by an integer between 0 and self.length()-1, |
49 and a subdataset can be obtained by slicing. | 160 and a subdataset can be obtained by slicing. This is not appropriate in |
50 """ | 161 general; it suits only datasets whose rows AND fields can be accessed in an |
51 | 162 efficient random-access way. Users are encouraged to expect only the generic |
52 def __init__(self,minibatch_size): | 163 DataSet interface. A FiniteDataSet is mainly useful when one has to obtain |
53 DataSet.__init__(self,minibatch_size) | 164 a subset of examples (e.g. for splitting a dataset into training and test sets). |
165 """ | |
166 | |
167 def __init__(self): | |
168 pass | |
54 | 169 |
55 def __iter__(self): | 170 def __iter__(self): |
56 return FiniteDataSetIterator(self) | 171 return FiniteDataSetIterator(self) |
57 | 172 |
173 def zip(self,*fieldnames): | |
174 return FiniteDataSetIterator(self,1,fieldnames) | |
175 | |
176 def minibatches(self,minibatch_size,*fieldnames): | |
177 return FiniteDataSetIterator(self,minibatch_size,fieldnames) | |
178 | |
179 def __getattr__(self,fieldname): | |
180 """Return an that can iterate over the values of the field in this dataset.""" | |
181 return self(fieldname) | |
182 | |
183 def __call__(self,*fieldnames): | |
184 """Return a sub-dataset containing only the given fieldnames as fields. | |
185 | |
186 The return value's default iterator will iterate only over the given | |
187 fields. | |
188 """ | |
189 raise NotImplementedError | |
190 | |
58 def __len__(self): | 191 def __len__(self): |
59 """len(dataset) returns the number of examples in the dataset.""" | 192 """len(dataset) returns the number of examples in the dataset.""" |
60 raise NotImplementedError | 193 raise NotImplementedError |
61 | 194 |
62 def __getitem__(self,i): | 195 def __getitem__(self,i): |
66 def __getslice__(self,*slice_args): | 199 def __getslice__(self,*slice_args): |
67 """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.""" | 200 """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.""" |
68 raise NotImplementedError | 201 raise NotImplementedError |
69 | 202 |
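
A minimal sketch of a concrete FiniteDataSet (hypothetical, not part of this
file): it stores examples as rows of a list and supplies the integer indexing
and slicing that the FiniteDataSetIterator defined next relies on:

    class ListDataSet(FiniteDataSet):
        """Toy FiniteDataSet backed by a list of example rows."""
        def __init__(self, rows):
            FiniteDataSet.__init__(self)
            self.rows = rows
        def __len__(self):
            return len(self.rows)
        def __getitem__(self, i):
            return self.rows[i]
        def __getslice__(self, i, j):
            return ListDataSet(self.rows[i:j])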
70 class FiniteDataSetIterator(object): | 203 class FiniteDataSetIterator(object): |
71 def __init__(self,dataset): | 204 """ |
205 If the fieldnames list is empty, it means that we want to see ALL the fields. | |
206 """ | |
207 def __init__(self,dataset,minibatch_size=1,fieldnames=[]): | |
72 self.dataset=dataset | 208 self.dataset=dataset |
73 self.current = -self.dataset.minibatch_size | 209 self.minibatch_size=minibatch_size |
74 | 210 assert minibatch_size>=1 and minibatch_size<=len(dataset) |
211 self.current = -self.minibatch_size | |
212 self.fieldnames = fieldnames | |
213 | |
214 def __iter__(self): | |
215 return self | |
216 | |
75 def next(self): | 217 def next(self): |
76 """ | 218 self.current+=self.minibatch_size |
77 Return the next example(s) in the dataset. If self.dataset.minibatch_size>1 return that | |
78 many examples. If the dataset has fields, the example or the minibatch of examples | |
79 is just a minibatch_size-rows ArrayDataSet (so that the fields can be accessed), | |
80 but that resulting mini-dataset has a minibatch_size of 1, so that one can iterate | |
81 example-wise on it. On the other hand, if the dataset has no fields (e.g. because | |
82 it is already the field of a bigger dataset), then the returned example or minibatch | |
83 may be any indexable object, such as a numpy array. Following the array semantics of indexing | |
84 and slicing, if the minibatch_size is 1 (and there are no fields), then the result is an array | |
85 with one less dimension (e.g., a vector, if the dataset is a matrix), corresponding | |
86 to a row. Again, if the minibatch_size is >1, one can iterate on the result to | |
87 obtain individual examples (as rows). | |
88 """ | |
89 self.current+=self.dataset.minibatch_size | |
90 if self.current>=len(self.dataset): | 219 if self.current>=len(self.dataset): |
91 self.current=-self.dataset.minibatch_size | 220 self.current=-self.minibatch_size |
92 raise StopIteration | 221 raise StopIteration |
93 if self.dataset.minibatch_size==1: | 222 if self.minibatch_size==1: |
94 return self.dataset[self.current] | 223 complete_example=self.dataset[self.current] |
95 else: | 224 else: |
96 return self.dataset[self.current:self.current+self.dataset.minibatch_size] | 225 complete_example=self.dataset[self.current:self.current+self.minibatch_size] |
226 if self.fieldnames: | |
227 return Example(self.fieldnames,list(complete_example)) | |
228 else: | |
229 return complete_example | |
97 | 230 |
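
Continuing the hypothetical ListDataSet sketch above, the iterator can be
exercised like this (note that in this revision the fieldnames only label the
values positionally; they do not select columns):

    d = ListDataSet([(0, 1), (2, 3), (4, 5), (6, 7)])
    rows = [e for e in d]          # plain iteration yields the raw rows
    assert rows[0] == (0, 1)
    it = d.zip('x', 'y')           # wraps each row in an Example
    e = it.next()
    assert e.x == 0 and e.y == 1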
98 | 231 |
99 # we may want ArrayDataSet defined in another python file | 232 # we may want ArrayDataSet defined in another python file |
100 | 233 |
101 import numpy | 234 import numpy |
102 | 235 |
103 class ArrayDataSet(FiniteDataSet): | 236 class ArrayDataSet(FiniteDataSet): |
104 """ | 237 """ |
105 An ArrayDataSet behaves like a numpy array but adds the notion of fields | 238 An ArrayDataSet behaves like a numpy array but adds the notion of named fields |
106 and minibatch_size from DataSet. It is a fixed-length and fixed-width dataset | 239 from DataSet (and the ability to view multiple field values as an 'Example'). |
240 It is a fixed-length and fixed-width dataset | |
107 in which each element is a numpy array or a number, hence the whole | 241 in which each element is a numpy array or a number, hence the whole |
108 dataset corresponds to a numpy array. Fields | 242 dataset corresponds to a numpy array. Fields |
109 must correspond to a slice of array columns. If the dataset has fields, | 243 must correspond to a slice of array columns. If the dataset has fields, |
110 each 'example' is just a one-row ArrayDataSet, otherwise it is a numpy array. | 244 each 'example' is just a one-row ArrayDataSet, otherwise it is a numpy array. |
111 Any dataset can also be converted to a numpy array (losing the notion of fields | 245 Any dataset can also be converted to a numpy array (losing the notion of |
112 and of minibatch_size) by the numpy.array(dataset) call. | 246 fields) by the numpy.array(dataset) call. |
113 """ | 247 """ |
114 | 248 |
115 def __init__(self,dataset=None,data=None,fields={},minibatch_size=1): | 249 def __init__(self,dataset=None,data=None,fields={}): |
116 """ | 250 """ |
117 There are two ways to construct an ArrayDataSet: (1) from an | 251 There are two ways to construct an ArrayDataSet: (1) from an |
118 existing dataset (which may result in a copy of the data in a numpy array), | 252 existing dataset (which may result in a copy of the data in a numpy array), |
119 or (2) from a numpy.array (the data argument), along with an optional description | 253 or (2) from a numpy.array (the data argument), along with an optional description |
120 of the fields (dictionary of column slices indexed by field names). | 254 of the fields (dictionary of column slices indexed by field names). |
121 """ | 255 """ |
122 FiniteDataSet.__init__(self,minibatch_size) | |
123 if dataset!=None: | 256 if dataset!=None: |
124 assert data==None and fields=={} | 257 assert data==None and fields=={} |
125 # convert dataset to an ArrayDataSet | 258 # Make ONE big minibatch with all the examples, to separate the fields. |
259 n_examples=len(dataset) | |
260 batch = dataset.minibatches(n_examples).next() | |
261 # Each field of the underlying dataset must be convertible to a numpy array of the same type | |
262 # currently just double, but should use the smallest compatible dtype | |
263 n_fields = len(batch) | |
264 fieldnames = batch.fields.keys() | |
265 total_width = 0 | |
266 type = None | |
267 for i in xrange(n_fields): | |
268 field = numpy.array(batch[i]) | |
269 assert field.shape[0]==n_examples | |
270 width = field.shape[1] | |
271 start=total_width | |
272 total_width += width | |
273 fields[fieldnames[i]]=slice(start,total_width,1) | |
274 # many complicated things remain to be done: | |
275 # - find common dtype | |
276 # - decide what to do with extra dimensions if not the same in all fields | |
277 # - try to see if we can avoid the copy? | |
126 raise NotImplementedError | 278 raise NotImplementedError |
127 if data!=None: | 279 if data!=None: |
128 assert dataset==None | 280 assert dataset==None |
129 self.data=data | 281 self.data=data |
130 self.fields=fields | 282 self.fields=fields |
140 step=1 | 292 step=1 |
141 if not fieldslice.start or not fieldslice.step: | 293 if not fieldslice.start or not fieldslice.step: |
142 fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step) | 294 fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step) |
143 # and coherent with the data array | 295 # and coherent with the data array |
144 assert fieldslice.start>=0 and fieldslice.stop<=self.width | 296 assert fieldslice.start>=0 and fieldslice.stop<=self.width |
145 assert minibatch_size<=len(self.data) | |
146 | 297 |
147 def __getattr__(self,fieldname): | 298 def __getattr__(self,fieldname): |
148 """ | 299 """ |
149 Return a numpy array with the content associated with the given field name. | 300 Return a numpy array with the content associated with the given field name. |
150 If this is a one-example dataset, then a row, i.e., numpy array (of one less dimension | 301 If this is a one-example dataset, then a row, i.e., numpy array (of one less dimension |
151 than the dataset.data) is returned. | 302 than the dataset itself) is returned. |
152 """ | 303 """ |
153 if len(self.data)==1: | 304 if len(self.data)==1: |
154 return self.data[0,self.fields[fieldname]] | 305 return self.data[0,self.fields[fieldname]] |
155 return self.data[:,self.fields[fieldname]] | 306 return self.data[:,self.fields[fieldname]] |
156 | 307 |
162 min_col=min(min_col,field_slice.start) | 313 min_col=min(min_col,field_slice.start) |
163 max_col=max(max_col,field_slice.stop) | 314 max_col=max(max_col,field_slice.stop) |
164 new_fields={} | 315 new_fields={} |
165 for field in self.fields.items(): | 316 for field in self.fields.items(): |
166 new_fields[field[0]]=slice(field[1].start-min_col,field[1].stop-min_col,field[1].step) | 317 new_fields[field[0]]=slice(field[1].start-min_col,field[1].stop-min_col,field[1].step) |
167 return ArrayDataSet(data=self.data[:,min_col:max_col],fields=new_fields,minibatch_size=self.minibatch_size) | 318 return ArrayDataSet(data=self.data[:,min_col:max_col],fields=new_fields) |
168 | 319 |
169 def fieldNames(self): | 320 def fieldNames(self): |
170 """Return the list of field names that are supported by getattr and getFields.""" | 321 """Return the list of field names that are supported by getattr and getFields.""" |
171 return self.fields.keys() | 322 return self.fields.keys() |
172 | 323 |
174 """len(dataset) returns the number of examples in the dataset.""" | 325 """len(dataset) returns the number of examples in the dataset.""" |
175 return len(self.data) | 326 return len(self.data) |
176 | 327 |
177 def __getitem__(self,i): | 328 def __getitem__(self,i): |
178 """ | 329 """ |
179 dataset[i] returns the (i+1)-th example of the dataset. If the dataset has fields | 330 dataset[i] returns the (i+1)-th Example of the dataset. If there are no fields |
180 then a one-example dataset is returned (to be able to handle example.field accesses). | 331 the result is just a numpy array (for the i-th row of the dataset data matrix). |
181 """ | 332 """ |
182 if self.fields: | 333 if self.fields: |
183 if isinstance(i,slice): | 334 fieldnames,fieldslices=zip(*self.fields.items()) |
184 return ArrayDataSet(data=data[slice],fields=self.fields) | 335 return Example(fieldnames,[self.data[i,fieldslice] for fieldslice in fieldslices]) |
185 return ArrayDataSet(data=self.data[i:i+1],fields=self.fields) | |
186 else: | 336 else: |
187 return self.data[i] | 337 return self.data[i] |
188 | 338 |
189 def __getslice__(self,*slice_args): | 339 def __getslice__(self,*slice_args): |
190 """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.""" | 340 """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.""" |
239 slice_width=(field_slice.stop-field_slice.start)/field_slice.step | 389 slice_width=(field_slice.stop-field_slice.start)/field_slice.step |
240 # copy the field here | 390 # copy the field here |
241 result[:,c:c+slice_width]=self.data[:,field_slice] | 391 result[:,c:c+slice_width]=self.data[:,field_slice] |
242 c+=slice_width | 392 c+=slice_width |
243 return result | 393 return result |
394 | |
395 |
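
For instance, constructing an ArrayDataSet directly from a 2-D numpy array
with two named column-slice fields might look like the following sketch (it
avoids the unimplemented dataset-conversion path above; the exact slice
normalization performed by the constructor is elided from this diff):

    import numpy
    data = numpy.array([[0., 1., 2.],
                        [3., 4., 5.],
                        [6., 7., 8.]])
    d = ArrayDataSet(data=data,
                     fields={'input': slice(0, 2), 'target': slice(2, 3)})
    assert len(d) == 3
    assert (d.input == data[:, 0:2]).all()   # a field is a block of columns
    e = d[1]                                 # an Example for the second row
    assert (e['input'] == numpy.array([3., 4.])).all()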