comparison dataset.py @ 321:f03ae06fadc8

NArraysDataSet improved, use arrays instead of matrix, also a dictionary of field indexes
author Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
date Thu, 12 Jun 2008 12:35:47 -0400
parents b1da46b9b901
children ad8be93b3c55
comparison
equal deleted inserted replaced
320:b1da46b9b901 321:f03ae06fadc8
190 190
191 self.description = default_desc() if description is None \ 191 self.description = default_desc() if description is None \
192 else description 192 else description
193 self._attribute_names = ["description"] 193 self._attribute_names = ["description"]
194 194
195 # create dictionary of fieldnames index
196 self.map_field_idx = dict()
197 for k in len(range(self.fieldNames())):
198 map_field_idx[ self.fieldNames[k] ] = k
199
200
195 attributeNames = property(lambda self: copy.copy(self._attribute_names)) 201 attributeNames = property(lambda self: copy.copy(self._attribute_names))
196 202
197 def __contains__(self, fieldname): 203 def __contains__(self, fieldname):
198 return (fieldname in self.fieldNames()) \ 204 return (fieldname in self.fieldNames()) \
199 or (fieldname in self.attributeNames()) 205 or (fieldname in self.attributeNames())
1052 Construct an NArraysDataSet from a list of numpy tensor (data_arrays) and a list 1058 Construct an NArraysDataSet from a list of numpy tensor (data_arrays) and a list
1053 of fieldnames. The number of arrays must be the same as the number of 1059 of fieldnames. The number of arrays must be the same as the number of
1054 fieldnames. Each set of numpy tensor must have the same first dimension (first 1060 fieldnames. Each set of numpy tensor must have the same first dimension (first
1055 axis) corresponding to the number of examples. 1061 axis) corresponding to the number of examples.
1056 1062
1057 Every tensor is treated as a numpy matrix (using numpy.asmatrix) 1063 Every tensor is treated as a numpy array (using numpy.asarray)
1058 """ 1064 """
1059 ArrayFieldsDataSet.__init__(self,**kwargs) 1065 ArrayFieldsDataSet.__init__(self,**kwargs)
1060 assert len(data_arrays) == len(fieldnames) 1066 assert len(data_arrays) == len(fieldnames)
1061 assert len(fieldnames) > 0 1067 assert len(fieldnames) > 0
1062 all_matrix_sizes = map(lambda x : numpy.asmatrix(x).shape[0] , data_arrays) 1068 ndarrays = [numpy.ndarray(a) for a in data_arrays]
1063 num_examples = max(all_matrix_sizes) 1069 lens = [a.shape[0] for a in ndarrays]
1064 if num_examples == 1 : 1070 num_examples = lens[0] #they must all be equal anyway
1065 # problem, do we transpose all arrays? is there only one example?
1066 raise Exception("wrong initialization, unknow behaviour with 1-d arrays")
1067 self._fieldnames = fieldnames 1071 self._fieldnames = fieldnames
1068 self._datas = [] 1072 self._datas = []
1069 for k in range(len(data_arrays)) : 1073 for k in self.ndarrays :
1070 self._datas.append( numpy.asmatrix(data_arrays[k]) ) 1074 assert k.shape[0] == num_examples
1071 if self._datas[-1].shape[0] == 1 and self._datas[-1].shape[1] == num_examples : 1075 self._datas = ndarrays
1072 self._datas[-1] = self._datas[-1].transpose() 1076 # create dict
1073 for k in range(len(self._datas)) : 1077 self.map_field_idx = dict()
1074 assert self._datas[k].shape[0] == num_examples 1078 for k in range(len(fieldnames)):
1079 self.map_field_idx[fieldnames[k]] = k
1075 1080
1076 1081
1077 def __len__(self) : 1082 def __len__(self) :
1078 """ 1083 """
1079 Length of the dataset is based on the first array = data_arrays[0], using its shape 1084 Length of the dataset is based on the first array = data_arrays[0], using its shape
1083 def fieldNames(self) : 1088 def fieldNames(self) :
1084 """ 1089 """
1085 Returns the fieldnames as set in self.__init__ 1090 Returns the fieldnames as set in self.__init__
1086 """ 1091 """
1087 return self._fieldnames 1092 return self._fieldnames
1093
1094 def field_pos(self,fieldname) :
1095 """
1096 Returns the index of a given fieldname. Fieldname must exist! See fieldNames().
1097 """
1098 return self.map_field_idx[fieldname]
1088 1099
1089 def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): 1100 def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
1090 cursor = Example(fieldnames,[0]*len(fieldnames)) 1101 cursor = Example(fieldnames,[0]*len(fieldnames))
1091 fieldnames = self.fieldNames() if fieldnames is None else fieldnames 1102 fieldnames = self.fieldNames() if fieldnames is None else fieldnames
1092 for n in xrange(n_batches): 1103 for n in xrange(n_batches):
1093 if offset == len(self): 1104 if offset == len(self):
1094 break 1105 break
1095 for f in range(len(cursor._names)) : 1106 for f in range(len(cursor._names)) :
1096 idx = self._fieldnames.index(cursor._names[f]) 1107 idx = self.field_pos(cursor._names[f])
1097 assert idx >= 0 1108 sub_data = self._datas[idx][offset : offset+minibatch_size]
1098 sub_data = self._datas[f][offset : offset+minibatch_size]
1099 cursor._values[f] = sub_data 1109 cursor._values[f] = sub_data
1100 offset += len(sub_data) #can be less than minibatch_size at end 1110 offset += len(sub_data) #can be less than minibatch_size at end
1101 yield cursor 1111 yield cursor
1102 1112
1103 #return ArrayDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset) 1113 #return ArrayDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset)