comparison dataset.py @ 12:ff4e551490f1

Added the LookupList type in lookup_list.py and used it to keep the order of field names in Example and in ArrayDataSet. Example is now simply an alias for LookupList (Example = LookupList).
author bengioy@esprit.iro.umontreal.ca
date Wed, 26 Mar 2008 18:21:57 -0400
parents be128b9127c8
children 759d17112b23
comparison
equal deleted inserted replaced
11:be128b9127c8 12:ff4e551490f1
1 1
2 class Example(object): 2 from lookup_list import LookupList
3 """ 3 Example = LookupList
4 An example is something that is like a tuple but whose elements can be named, so that 4
5 following syntactic constructions work as one would expect:
6 example.x = [1, 2, 3] # set a field
7 x, y, z = example
8 x = example[0]
9 x = example["x"]
10 """
11 def __init__(self,names,values):
12 assert len(values)==len(names)
13 self.__dict__['values']=values
14 self.__dict__['fields']={}
15 for i in xrange(len(values)):
16 self.fields[names[i]]=i
17
18 def __getitem__(self,i):
19 if isinstance(i,int):
20 return self.values[i]
21 else:
22 return self.values[self.fields[i]]
23
24 def __setitem__(self,i,value):
25 if isinstance(i,int):
26 self.values[i]=value
27 else:
28 self.values[self.fields[i]]=value
29
30 def __getattr__(self,name):
31 return self.values[self.fields[name]]
32
33 def __setattr__(self,name,value):
34 self.values[self.fields[name]]=value
35
36 def __len__(self):
37 return len(self.values)
38
39
40 class DataSet(object): 5 class DataSet(object):
41 """ 6 """
42 This is a virtual base class or interface for datasets. 7 This is a virtual base class or interface for datasets.
43 A dataset is basically an iterator over Examples (or anything that 8 A dataset is basically an iterator over Examples (or anything that
44 behaves like an Example). It does not necessarily 9 behaves like an Example). It does not necessarily
190 each 'example' is just a one-row ArrayDataSet, otherwise it is a numpy array. 155 each 'example' is just a one-row ArrayDataSet, otherwise it is a numpy array.
191 Any dataset can also be converted to a numpy array (losing the notion of fields) 156 Any dataset can also be converted to a numpy array (losing the notion of fields)
192 by the numpy.array(dataset) call. 157 by the numpy.array(dataset) call.
193 """ 158 """
194 159
195 def __init__(self,dataset=None,data=None,fields={}): 160 def __init__(self,dataset=None,data=None,fields=None):
196 """ 161 """
197 There are two ways to construct an ArrayDataSet: (1) from an 162 There are two ways to construct an ArrayDataSet: (1) from an
198 existing dataset (which may result in a copy of the data in a numpy array), 163 existing dataset (which may result in a copy of the data in a numpy array),
199 or (2) from a numpy.array (the data argument), along with an optional description 164 or (2) from a numpy.array (the data argument), along with an optional description
200 of the fields (dictionary of column slices indexed by field names). 165 of the fields (a LookupList of column slices indexed by field names).
201 """ 166 """
202 if dataset!=None: 167 if dataset!=None:
203 assert data==None and fields=={} 168 assert data==None and fields==None
204 # Make ONE big minibatch with all the examples, to separate the fields. 169 # Make ONE big minibatch with all the examples, to separate the fields.
205 n_examples=len(dataset) 170 n_examples=len(dataset)
206 batch = dataset.minibatches(n_examples).next() 171 batch = dataset.minibatches(n_examples).next()
207 # Each field of the underlying dataset must be convertible to a numpy array of the same type 172 # Each field of the underlying dataset must be convertible to a numpy array of the same type
208 # currently just double, but should use the smallest compatible dtype 173 # currently just double, but should use the smallest compatible dtype
209 n_fields = len(batch) 174 n_fields = len(batch)
210 fieldnames = batch.fields.keys() 175 fieldnames = batch.fields.keys()
211 total_width = 0 176 total_width = 0
212 type = None 177 type = None
178 fields = LookupList()
213 for i in xrange(n_fields): 179 for i in xrange(n_fields):
214 field = array(batch[i]) 180 field = array(batch[i])
215 assert field.shape[0]==n_examples 181 assert field.shape[0]==n_examples
216 width = field.shape[1] 182 width = field.shape[1]
217 start=total_width 183 start=total_width
225 if data!=None: 191 if data!=None:
226 assert dataset==None 192 assert dataset==None
227 self.data=data 193 self.data=data
228 self.fields=fields 194 self.fields=fields
229 self.width = data.shape[1] 195 self.width = data.shape[1]
230 for fieldname in fields: 196 if fields:
231 fieldslice=fields[fieldname] 197 for fieldname,fieldslice in fields.items():
232 # make sure fieldslice.start and fieldslice.step are defined 198 # make sure fieldslice.start and fieldslice.step are defined
233 start=fieldslice.start 199 start=fieldslice.start
234 step=fieldslice.step 200 step=fieldslice.step
235 if not start: 201 if not start:
236 start=0 202 start=0
237 if not step: 203 if not step:
238 step=1 204 step=1
239 if not fieldslice.start or not fieldslice.step: 205 if not fieldslice.start or not fieldslice.step:
240 fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step) 206 fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step)
241 # and coherent with the data array 207 # and coherent with the data array
242 assert fieldslice.start>=0 and fieldslice.stop<=self.width 208 assert fieldslice.start>=0 and fieldslice.stop<=self.width
243 209
244 def __getattr__(self,fieldname): 210 def __getattr__(self,fieldname):
245 """ 211 """
246 Return a numpy array with the content associated with the given field name. 212 Return a numpy array with the content associated with the given field name.
247 If this is a one-example dataset, then a row, i.e., numpy array (of one less dimension 213 If this is a one-example dataset, then a row, i.e., numpy array (of one less dimension
256 min_col=self.data.shape[1] 222 min_col=self.data.shape[1]
257 max_col=0 223 max_col=0
258 for field_slice in self.fields.values(): 224 for field_slice in self.fields.values():
259 min_col=min(min_col,field_slice.start) 225 min_col=min(min_col,field_slice.start)
260 max_col=max(max_col,field_slice.stop) 226 max_col=max(max_col,field_slice.stop)
261 new_fields={} 227 new_fields=LookupList()
262 for field in self.fields: 228 for fieldname,fieldslice in self.fields.items():
263 new_fields[field[0]]=slice(field[1].start-min_col,field[1].stop-min_col,field[1].step) 229 new_fields[fieldname]=slice(fieldslice.start-min_col,fieldslice.stop-min_col,fieldslice.step)
264 return ArrayDataSet(data=self.data[:,min_col:max_col],fields=new_fields) 230 return ArrayDataSet(data=self.data[:,min_col:max_col],fields=new_fields)
265 231
266 def fieldNames(self): 232 def fieldNames(self):
267 """Return the list of field names that are supported by getattr and getFields.""" 233 """Return the list of field names that are supported by getattr and getFields."""
268 return self.fields.keys() 234 return self.fields.keys()
276 dataset[i] returns the (i+1)-th Example of the dataset. If there are no fields 242 dataset[i] returns the (i+1)-th Example of the dataset. If there are no fields
277 the result is just a numpy array (for the i-th row of the dataset data matrix). 243 the result is just a numpy array (for the i-th row of the dataset data matrix).
278 """ 244 """
279 if self.fields: 245 if self.fields:
280 fieldnames,fieldslices=zip(*self.fields.items()) 246 fieldnames,fieldslices=zip(*self.fields.items())
281 return Example(fieldnames,[self.data[i,fieldslice] for fieldslice in fieldslices]) 247 return Example(self.fields.keys(),[self.data[i,fieldslice] for fieldslice in self.fields.values()])
282 else: 248 else:
283 return self.data[i] 249 return self.data[i]
284 250
285 def __getslice__(self,*slice_args): 251 def __getslice__(self,*slice_args):
286 """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.""" 252 """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1."""