Mercurial > pylearn
comparison dataset.py @ 321:f03ae06fadc8
NArraysDataSet improved: use arrays instead of matrices, and add a dictionary of field indexes
author | Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca> |
---|---|
date | Thu, 12 Jun 2008 12:35:47 -0400 |
parents | b1da46b9b901 |
children | ad8be93b3c55 |
comparison
equal
deleted
inserted
replaced
320:b1da46b9b901 | 321:f03ae06fadc8 |
---|---|
190 | 190 |
191 self.description = default_desc() if description is None \ | 191 self.description = default_desc() if description is None \ |
192 else description | 192 else description |
193 self._attribute_names = ["description"] | 193 self._attribute_names = ["description"] |
194 | 194 |
195 # create dictionnary of fieldnames index | |
196 self.map_field_idx = dict() | |
197 for k in len(range(self.fieldNames())): | |
198 map_field_idx[ self.fieldNames[k] ] = k | |
199 | |
200 | |
195 attributeNames = property(lambda self: copy.copy(self._attribute_names)) | 201 attributeNames = property(lambda self: copy.copy(self._attribute_names)) |
196 | 202 |
197 def __contains__(self, fieldname): | 203 def __contains__(self, fieldname): |
198 return (fieldname in self.fieldNames()) \ | 204 return (fieldname in self.fieldNames()) \ |
199 or (fieldname in self.attributeNames()) | 205 or (fieldname in self.attributeNames()) |
1052 Construct an NArraysDataSet from a list of numpy tensor (data_arrays) and a list | 1058 Construct an NArraysDataSet from a list of numpy tensor (data_arrays) and a list |
1053 of fieldnames. The number of arrays must be the same as the number of | 1059 of fieldnames. The number of arrays must be the same as the number of |
1054 fieldnames. Each set of numpy tensor must have the same first dimension (first | 1060 fieldnames. Each set of numpy tensor must have the same first dimension (first |
1055 axis) corresponding to the number of examples. | 1061 axis) corresponding to the number of examples. |
1056 | 1062 |
1057 Every tensor is treated as a numpy matrix (using numpy.asmatrix) | 1063 Every tensor is treated as a numpy array (using numpy.asarray) |
1058 """ | 1064 """ |
1059 ArrayFieldsDataSet.__init__(self,**kwargs) | 1065 ArrayFieldsDataSet.__init__(self,**kwargs) |
1060 assert len(data_arrays) == len(fieldnames) | 1066 assert len(data_arrays) == len(fieldnames) |
1061 assert len(fieldnames) > 0 | 1067 assert len(fieldnames) > 0 |
1062 all_matrix_sizes = map(lambda x : numpy.asmatrix(x).shape[0] , data_arrays) | 1068 ndarrays = [numpy.ndarray(a) for a in data_arrays] |
1063 num_examples = max(all_matrix_sizes) | 1069 lens = [a.shape[0] for a in ndarrays] |
1064 if num_examples == 1 : | 1070 num_examples = lens[0] #they must all be equal anyway |
1065 # problem, do we transpose all arrays? is there only one example? | |
1066 raise Exception("wrong initialization, unknow behaviour with 1-d arrays") | |
1067 self._fieldnames = fieldnames | 1071 self._fieldnames = fieldnames |
1068 self._datas = [] | 1072 self._datas = [] |
1069 for k in range(len(data_arrays)) : | 1073 for k in self.ndarrays : |
1070 self._datas.append( numpy.asmatrix(data_arrays[k]) ) | 1074 assert k.shape[0] == num_examples |
1071 if self._datas[-1].shape[0] == 1 and self._datas[-1].shape[1] == num_examples : | 1075 self._datas = ndarrays |
1072 self._datas[-1] = self._datas[-1].transpose() | 1076 # create dict |
1073 for k in range(len(self._datas)) : | 1077 self.map_field_idx = dict() |
1074 assert self._datas[k].shape[0] == num_examples | 1078 for k in range(len(fieldnames)): |
1079 self.map_field_idx[fieldnames[k]] = k | |
1075 | 1080 |
1076 | 1081 |
1077 def __len__(self) : | 1082 def __len__(self) : |
1078 """ | 1083 """ |
1079 Length of the dataset is based on the first array = data_arrays[0], using its shape | 1084 Length of the dataset is based on the first array = data_arrays[0], using its shape |
1083 def fieldNames(self) : | 1088 def fieldNames(self) : |
1084 """ | 1089 """ |
1085 Returns the fieldnames as set in self.__init__ | 1090 Returns the fieldnames as set in self.__init__ |
1086 """ | 1091 """ |
1087 return self._fieldnames | 1092 return self._fieldnames |
1093 | |
1094 def field_pos(self,fieldname) : | |
1095 """ | |
1096 Returns the index of a given fieldname. Fieldname must exists! see fieldNames(). | |
1097 """ | |
1098 return self.map_field_idx[fieldname] | |
1088 | 1099 |
1089 def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): | 1100 def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset): |
1090 cursor = Example(fieldnames,[0]*len(fieldnames)) | 1101 cursor = Example(fieldnames,[0]*len(fieldnames)) |
1091 fieldnames = self.fieldNames() if fieldnames is None else fieldnames | 1102 fieldnames = self.fieldNames() if fieldnames is None else fieldnames |
1092 for n in xrange(n_batches): | 1103 for n in xrange(n_batches): |
1093 if offset == len(self): | 1104 if offset == len(self): |
1094 break | 1105 break |
1095 for f in range(len(cursor._names)) : | 1106 for f in range(len(cursor._names)) : |
1096 idx = self._fieldnames.index(cursor._names[f]) | 1107 idx = self.field_pos(cursor._names[f]) |
1097 assert idx >= 0 | 1108 sub_data = self._datas[idx][offset : offset+minibatch_size] |
1098 sub_data = self._datas[f][offset : offset+minibatch_size] | |
1099 cursor._values[f] = sub_data | 1109 cursor._values[f] = sub_data |
1100 offset += len(sub_data) #can be less than minibatch_size at end | 1110 offset += len(sub_data) #can be less than minibatch_size at end |
1101 yield cursor | 1111 yield cursor |
1102 | 1112 |
1103 #return ArrayDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset) | 1113 #return ArrayDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset) |