pylearn: dataset.py comparison

comparison dataset.py @ 321:f03ae06fadc8

NArraysDataSet improved: uses arrays instead of matrices, and adds a dictionary of field indexes

author	Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
date	Thu, 12 Jun 2008 12:35:47 -0400
parents	b1da46b9b901
children	ad8be93b3c55

comparison

equal deleted inserted replaced

-:b1da46b9b901
+:f03ae06fadc8
 self.description =  default_desc() if description is None \
 else description
 self._attribute_names = ["description"]
+# create dictionnary of fieldnames index
+self.map_field_idx = dict()
+for k in len(range(self.fieldNames())):
+map_field_idx[ self.fieldNames[k] ] = k
 attributeNames = property(lambda self: copy.copy(self._attribute_names))
 def __contains__(self, fieldname):
 return (fieldname in self.fieldNames()) \
 or (fieldname in self.attributeNames())
 Construct an NArraysDataSet from a list of numpy tensor (data_arrays) and a list
 of fieldnames. The number of arrays must be the same as the number of
 fieldnames. Each set of numpy tensor must have the same first dimension (first
 axis) corresponding to the number of examples.
-Every tensor is treated as a numpy matrix (using numpy.asmatrix)
+Every tensor is treated as a numpy array (using numpy.asarray)
 """
 ArrayFieldsDataSet.__init__(self,**kwargs)
 assert len(data_arrays) == len(fieldnames)
 assert len(fieldnames) > 0
-all_matrix_sizes = map(lambda x : numpy.asmatrix(x).shape[0] , data_arrays)
+ndarrays = [numpy.ndarray(a) for a in data_arrays]
-num_examples = max(all_matrix_sizes)
+lens = [a.shape[0] for a in ndarrays]
-if num_examples == 1 :
+num_examples = lens[0] #they must all be equal anyway
-# problem, do we transpose all arrays? is there only one example?
-raise Exception("wrong initialization, unknow behaviour with 1-d arrays")
 self._fieldnames = fieldnames
 self._datas = []
-for k in range(len(data_arrays)) :
+for k in self.ndarrays :
-self._datas.append( numpy.asmatrix(data_arrays[k]) )
+assert k.shape[0] == num_examples
-if self._datas[-1].shape[0] == 1 and self._datas[-1].shape[1] == num_examples :
+self._datas = ndarrays
-self._datas[-1] = self._datas[-1].transpose()
+# create dict
-for k in range(len(self._datas)) :
+self.map_field_idx = dict()
-assert self._datas[k].shape[0] == num_examples
+for k in range(len(fieldnames)):
+self.map_field_idx[fieldnames[k]] = k
 def __len__(self) :
 """
 Length of the dataset is based on the first array = data_arrays[0], using its shape
 def fieldNames(self) :
 """
 Returns the fieldnames as set in self.__init__
 """
 return self._fieldnames
+def field_pos(self,fieldname) :
+"""
+Returns the index of a given fieldname. Fieldname must exists! see fieldNames().
+"""
+return self.map_field_idx[fieldname]
 def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
 cursor = Example(fieldnames,[0]*len(fieldnames))
 fieldnames = self.fieldNames() if fieldnames is None else fieldnames
 for n in xrange(n_batches):
 if offset == len(self):
 break
 for f in range(len(cursor._names)) :
-idx = self._fieldnames.index(cursor._names[f])
+idx = self.field_pos(cursor._names[f])
-assert idx >= 0
+sub_data = self._datas[idx][offset : offset+minibatch_size]
-sub_data = self._datas[f][offset : offset+minibatch_size]
 cursor._values[f] = sub_data
 offset += len(sub_data) #can be less than minibatch_size at end
 yield cursor
 #return ArrayDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset)

Mercurial > pylearn

comparison dataset.py @ 321:f03ae06fadc8