comparison dataset.py @ 28:541a273bc89f

Removed the __array__ method from dataset; its semantics did not have a clear use (because of the possibility of overlapping fields).
author bengioy@grenat.iro.umontreal.ca
date Fri, 11 Apr 2008 13:08:51 -0400
parents 672fe4b23032
children 46c5c90019c2
comparing 27:e6c550cb2896 with 28:541a273bc89f
@@ -35,12 +35,12 @@
 values can be computed on-demand, when particular field names are used in one of the
 iterators).
 
 Datasets of finite length should be sub-classes of FiniteLengthDataSet.
 
-Datasets whose elements can be indexed and sub-datasets of consecutive
-examples (i.e. slices) can be extracted from should be sub-classes of
-SliceableDataSet.
+Datasets whose elements can be indexed and whose sub-datasets (with a subset
+of examples) can be extracted should be sub-classes of
+SliceableDataSet.
 
 Datasets with a finite number of fields should be sub-classes of
 FiniteWidthDataSet.
 """
@@ -228,12 +228,14 @@
 class SliceableDataSet(DataSet):
     """
     Virtual interface, a subclass of DataSet for datasets which are sliceable
     and whose individual elements can be accessed, generally respecting the
     python semantics for [spec], where spec is either a non-negative integer
-    (for selecting one example), or a python slice (for selecting a sub-dataset
-    comprising the specified examples). This is useful for obtaining
+    (for selecting one example), a python slice(start,stop,step) for selecting a regular
+    sub-dataset comprising examples start,start+step,start+2*step,... (stopping before stop),
+    or a sequence (e.g. a list) of integers [i1,i2,...,in] for selecting
+    an arbitrary subset of examples. This is useful for obtaining
     sub-datasets, e.g. for splitting a dataset into training and test sets.
     """
     def __init__(self):
         DataSet.__init__(self)
 
@@ -248,15 +250,23 @@
         # substitute the defaults:
         if n_batches is None: n_batches = len(self) / minibatch_size
         return DataSet.Iterator(self, fieldnames, minibatch_size, n_batches)
 
     def __getitem__(self,i):
-        """dataset[i] returns the (i+1)-th example of the dataset."""
+        """
+        dataset[i] returns the (i+1)-th example of the dataset.
+        dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.
+        dataset[i:j:s] returns the subdataset with examples i,i+s,i+2*s,... (stopping before j).
+        dataset[[i1,i2,...,in]] returns the subdataset with examples i1,i2,...,in.
+        """
         raise AbstractFunction()
 
     def __getslice__(self,*slice_args):
-        """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1."""
+        """
+        dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.
+        dataset[i:j:s] returns the subdataset with examples i,i+s,i+2*s,... (stopping before j).
+        """
         raise AbstractFunction()
 
 
 class FiniteWidthDataSet(DataSet):
     """
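
To make the [spec] semantics above concrete, here is a minimal usage sketch (not part of the changeset), written against the ArrayDataSet subclass defined later in this file; the data values are made up:

import numpy
ds = ArrayDataSet(numpy.arange(18).reshape(6,3))  # 6 examples, 3 columns, no fields
example     = ds[1]        # the (1+1)-th example; with no fields, a 1-D numpy row
consecutive = ds[0:4]      # sub-dataset with examples 0,1,2,3
regular     = ds[0:6:2]    # sub-dataset with examples 0,2,4
arbitrary   = ds[[0,3,5]]  # sub-dataset with examples 0,3,5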
@@ -346,11 +356,12 @@
     An ArrayDataSet behaves like a numpy array but adds the notion of named fields
     from DataSet (and the ability to view the values of multiple fields as an 'Example').
     It is a fixed-length and fixed-width dataset
     in which each element is a fixed dimension numpy array or a number, hence the whole
     dataset corresponds to a numpy array. Fields
-    must correspond to a slice of array columns. If the dataset has fields,
+    must correspond to a slice of array columns or to a list of column numbers.
+    If the dataset has fields,
     each 'example' is just a one-row ArrayDataSet, otherwise it is a numpy array.
     Any dataset can also be converted to a numpy array (losing the notion of fields)
     by the numpy.array(dataset) call.
     """
 
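As a construction sketch (not part of the changeset), assuming the LookupList(names, values) constructor used elsewhere in this file; the field layout is made up:

import numpy
data = numpy.random.rand(100,5)
fields = LookupList(['input','target'],
                    [slice(0,4),   # a field given as a slice of columns
                     [4]])         # a field given as a list of column numbers
ds = ArrayDataSet(data, fields=fields)
print ds[0]       # an Example with the 'input' and 'target' values of row 0
print ds.input    # all rows of the 'input' field, via the getattr-based field access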
@@ -394,11 +405,11 @@
             #check for end-of-loop
             self.next_count += 1
             if self.next_count == self.next_max:
                 raise StopIteration
 
-            #determine the first and last elements of the slice we'll return
+            #determine the first and last elements of the minibatch slice we'll return
             n_rows = self.dataset.data.shape[0]
             self.current = self.next_index()
             upper = self.current + self.minibatch_size
 
             data = self.dataset.data
@@ -421,29 +432,34 @@
     def __init__(self, data, fields=None):
         """
         There are two ways to construct an ArrayDataSet: (1) from an
         existing dataset (which may result in a copy of the data in a numpy array),
         or (2) from a numpy.array (the data argument), along with an optional description
-        of the fields (a LookupList of column slices indexed by field names).
+        of the fields (a LookupList of column slices or column lists, indexed by field names).
         """
         self.data=data
         self.fields=fields
         rows, cols = data.shape
 
         if fields:
             for fieldname,fieldslice in fields.items():
-                # make sure fieldslice.start and fieldslice.step are defined
-                start=fieldslice.start
-                step=fieldslice.step
-                if not start:
-                    start=0
-                if not step:
-                    step=1
-                if not fieldslice.start or not fieldslice.step:
-                    fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step)
-                # and coherent with the data array
-                assert fieldslice.start >= 0 and fieldslice.stop <= cols
+                assert type(fieldslice) is int or isinstance(fieldslice,slice) or hasattr(fieldslice,"__iter__")
+                if hasattr(fieldslice,"__iter__"): # is a sequence
+                    for i in fieldslice:
+                        assert type(i) is int
+                elif isinstance(fieldslice,slice):
+                    # make sure fieldslice.start and fieldslice.step are defined
+                    start=fieldslice.start
+                    step=fieldslice.step
+                    if not start:
+                        start=0
+                    if not step:
+                        step=1
+                    if not fieldslice.start or not fieldslice.step:
+                        fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step)
+                    # and coherent with the data array
+                    assert fieldslice.start >= 0 and fieldslice.stop <= cols
 
     def minibatches(self,
                     fieldnames = DataSet.minibatches_fieldnames,
                     minibatch_size = DataSet.minibatches_minibatch_size,
                     n_batches = DataSet.minibatches_n_batches):
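
A small sketch (not part of the changeset) of the normalization done in __init__ above: under-specified slices get explicit start and step, while column lists pass through unchanged:

import numpy
d = ArrayDataSet(numpy.zeros((10,6)),
                 fields=LookupList(['x','y'],[slice(3),[3,4,5]]))
print d.fields['x']   # slice(0, 3, 1): missing start and step were filled in
print d.fields['y']   # [3, 4, 5]: kept as given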
@@ -467,15 +483,7 @@
             return self.data[0,self.fields[fieldname]]
         return self.data[:,self.fields[fieldname]]
 
     def __call__(self,*fieldnames):
         """Return a sub-dataset containing only the given fieldnames as fields."""
-        min_col=self.data.shape[1]
-        max_col=0
-        for field_slice in self.fields.values():
-            min_col=min(min_col,field_slice.start)
-            max_col=max(max_col,field_slice.stop)
-        new_fields=LookupList()
-        for fieldname,fieldslice in self.fields.items():
-            new_fields[fieldname]=slice(fieldslice.start-min_col,fieldslice.stop-min_col,fieldslice.step)
-        return ArrayDataSet(self.data[:,min_col:max_col],fields=new_fields)
+        return ArrayDataSet(self.data,fields=LookupList(fieldnames,[self.fields[fieldname] for fieldname in fieldnames]))
 
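The rewritten __call__ above no longer narrows the data array and re-offsets the field slices; it shares self.data and simply restricts the field map. A usage sketch (not part of the changeset; field names made up):

sub = ds('input')        # sub-dataset exposing only the 'input' field
print sub.fieldNames()   # expected: ['input']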
@@ -482,23 +490,29 @@
     def fieldNames(self):
         """Return the list of field names that are supported by getattr and hasField."""
         return self.fields.keys()
 
     def __len__(self):
         """len(dataset) returns the number of examples in the dataset."""
         return len(self.data)
 
     def __getitem__(self,i):
         """
-        dataset[i] returns the (i+1)-th Example of the dataset. If there are no fields
-        the result is just a numpy array (for the i-th row of the dataset data matrix).
+        dataset[i] returns the (i+1)-th Example of the dataset.
+        If there are no fields the result is just a numpy array (the i-th row of the dataset's data matrix).
+        dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.
+        dataset[i:j:s] returns the subdataset with examples i,i+s,i+2*s,... (stopping before j).
+        dataset[[i1,i2,...,in]] returns the subdataset with examples i1,i2,...,in.
         """
         if self.fields:
             fieldnames,fieldslices=zip(*self.fields.items())
             return Example(self.fields.keys(),[self.data[i,fieldslice] for fieldslice in self.fields.values()])
         else:
             return self.data[i]
 
     def __getslice__(self,*args):
-        """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1."""
+        """
+        dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.
+        dataset[i:j:s] returns the subdataset with examples i,i+s,i+2*s,... (stopping before j).
+        """
         return ArrayDataSet(self.data.__getslice__(*args), fields=self.fields)
 
@@ -505,32 +519,28 @@
-    def __array__(self):
-        """Return a view of this dataset which is an numpy.ndarray (i.e. losing
-        the identity and name of fields within the dataset).
-
-        Numpy uses this special function name to retrieve an ndarray view for
-        function such as numpy.sum, numpy.dot, numpy.asarray, etc.
-
-        If this dataset has no fields, then we simply return self.data,
-        otherwise things are complicated.
-        - why do we want this behaviour when there are fields? (JB)
-        - for convenience and completeness (but maybe it would make
-          more sense to implement this through a 'field-merging'
-          dataset). (YB)
-        """
-        if not self.fields:
-            return self.data
-        # else, select subsets of columns mapped by the fields
-        columns_used = numpy.zeros((self.data.shape[1]),dtype=bool)
-        overlapping_fields = False
-        n_columns = 0
-        for field_slice in self.fields.values():
-            for c in xrange(field_slice.start,field_slice.stop,field_slice.step):
-                n_columns += 1
-                if columns_used[c]: overlapping_fields=True
-                columns_used[c]=True
-        # try to figure out if we can map all the slices into one slice:
-        mappable_to_one_slice = not overlapping_fields
-        if not overlapping_fields:
-            start=0
-            while start<len(columns_used) and not columns_used[start]:
-                start+=1
-            stop=len(columns_used)
+    def indices_of_unique_columns_used(self):
+        """
+        Return the unique indices of the columns actually used by the fields, and a boolean
+        that signals (if True) that the used columns overlap. If they do, the
+        indices are not repeated in the result.
+        """
+        columns_used = numpy.zeros((self.data.shape[1]),dtype=bool)
+        overlapping_columns = False
+        for field_slice in self.fields.values():
+            if sum(columns_used[field_slice])>0: overlapping_columns=True
+            columns_used[field_slice]=True
+        return [i for i,used in enumerate(columns_used) if used],overlapping_columns
+
+    def slice_of_unique_columns_used(self):
+        """
+        Return None if the indices_of_unique_columns_used do not form a slice. If they do,
+        return that slice. This means that the columns used can be extracted
+        from the data array without making a copy. If the fields overlap
+        but their unique columns used form a slice, still return that slice.
+        """
+        indices,overlapping_columns = self.indices_of_unique_columns_used()
+        columns_used = numpy.zeros((self.data.shape[1]),dtype=bool)
+        columns_used[indices] = True
+        mappable_to_one_slice = True
+        start=0
+        while start<len(columns_used) and not columns_used[start]:
+            start+=1
+        stop=len(columns_used)
@@ -547,17 +556,9 @@
-                    mappable_to_one_slice = False
-                    break
-                else:
-                    step = j-i
-                i=j
-        if mappable_to_one_slice:
-            return self.data[:,slice(start,stop,step)]
-        # else make contiguous copy (copying the overlapping columns)
-        result = numpy.zeros((len(self.data),n_columns)+self.data.shape[2:],self.data.dtype)
-        c=0
-        for field_slice in self.fields.values():
-            slice_width=(field_slice.stop-field_slice.start)/field_slice.step
-            # copy the field here
-            result[:,slice(c,c+slice_width)]=self.data[:,field_slice]
-            c+=slice_width
-        return result
+                mappable_to_one_slice = False
+                break
+            else:
+                step = j-i
+            i=j
+        if mappable_to_one_slice:
+            return slice(start,stop,step)
+        return None
 
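A hedged sketch (not part of the changeset) of the two new helpers; the field layout is made up, and the exact step detection depends on the portion of slice_of_unique_columns_used elided above:

import numpy
d = ArrayDataSet(numpy.zeros((4,6)),
                 fields=LookupList(['a','b'],[slice(0,2),slice(2,6)]))
print d.indices_of_unique_columns_used()   # expected: ([0, 1, 2, 3, 4, 5], False)
print d.slice_of_unique_columns_used()     # expected: slice(0, 6, 1), so no copy is needed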
@@ -564,5 +563,5 @@
 class ApplyFunctionDataSet(DataSet):
     """
     A dataset that contains as fields the results of applying
     a given function (example-wise) to specified input_fields of a source
     dataset. The function should return a sequence whose elements will be stored in