Mercurial > pylearn
comparison dataset.py @ 316:5fe6d0c93109
getitem in ArrayDataSet is set up again; it is supposed to be faster than the default one and has been tested against the default behaviour. In particular, it now always returns a LookupList.
author | Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca> |
---|---|
date | Wed, 11 Jun 2008 16:28:09 -0400 |
parents | 105b54ac8260 |
children | 14081904d8f3 |
comparison
equal
deleted
inserted
replaced
315:b48cf8dce2bf | 316:5fe6d0c93109 |
---|---|
1026 def untested_valuesVStack(self,fieldname,values): | 1026 def untested_valuesVStack(self,fieldname,values): |
1027 """Concatenate field values vertically, e.g. two vectors | 1027 """Concatenate field values vertically, e.g. two vectors |
1028 become a two-row matrix, two matrices become a longer matrix, etc.""" | 1028 become a two-row matrix, two matrices become a longer matrix, etc.""" |
1029 return numpy.vstack(values) | 1029 return numpy.vstack(values) |
1030 | 1030 |
1031 | |
1032 | |
1033 class NArraysDataSet(ArrayFieldsDataSet) : | |
1034 """ | |
1035 An NArraysDataSet stores fields that are numpy tensor, whose first axis | |
1036 iterates over examples. It's a generalization of ArrayDataSet. | |
1037 """ | |
1038 #@TODO not completely implemented yet | |
1039 def __init__(self, data_arrays, fieldnames, **kwargs) : | |
1040 """ | |
1041 Construct an NArraysDataSet from a list of numpy tensor (data_arrays) and a list | |
1042 of fieldnames. The number of arrays must be the same as the number of | |
1043 fieldnames. Each set of numpy tensor must have the same first dimension (first | |
1044 axis) corresponding to the number of examples. | |
1045 | |
1046 Every tensor is treated as a numpy matrix (using numpy.asmatrix) | |
1047 """ | |
1048 ArrayFieldsDataSet.__init(self,**kwargs) | |
1049 assert len(data_arrays) == len(fieldnames) | |
1050 assert len(fieldnames) > 0 | |
1051 num_examples = numpy.asmatrix(data_arrays[0]).shape[0] | |
1052 for k in range(data_arrays) : | |
1053 assert numpy.asmatrix(data_arrays[k]).shape[0] == num_examples | |
1054 self._fieldnames = fieldnames | |
1055 self._datas = [] | |
1056 for k in range(data_arrays) : | |
1057 self._datas.append( numpy.asmatrix(data_arrays[k]) ) | |
1058 raise NotImplemented | |
1059 | |
1060 | |
1061 def __len__(self) : | |
1062 """ | |
1063 Length of the dataset is based on the first array = data_arrays[0], using its shape | |
1064 """ | |
1065 return self.datas[0].shape[0] | |
1066 | |
1067 def fieldNames(self) : | |
1068 """ | |
1069 Returns the fieldnames as set in self.__init__ | |
1070 """ | |
1071 return self._fieldnames | |
1072 | |
1073 | |
1074 | |
1031 class ArrayDataSet(ArrayFieldsDataSet): | 1075 class ArrayDataSet(ArrayFieldsDataSet): |
1032 """ | 1076 """ |
1033 An ArrayDataSet stores the fields as groups of columns in a numpy tensor, | 1077 An ArrayDataSet stores the fields as groups of columns in a numpy tensor, |
1034 whose first axis iterates over examples, second axis determines fields. | 1078 whose first axis iterates over examples, second axis determines fields. |
1035 If the underlying array is N-dimensional (has N axes), then the field | 1079 If the underlying array is N-dimensional (has N axes), then the field |
1075 return self.fields_columns.keys() | 1119 return self.fields_columns.keys() |
1076 | 1120 |
1077 def __len__(self): | 1121 def __len__(self): |
1078 return len(self.data) | 1122 return len(self.data) |
1079 | 1123 |
1080 def dontuse__getitem__(self,key): | 1124 def __getitem__(self,key): |
1081 """More efficient implementation than the default __getitem__""" | 1125 """More efficient implementation than the default __getitem__""" |
1082 fieldnames=self.fields_columns.keys() | 1126 fieldnames=self.fields_columns.keys() |
1083 values=self.fields_columns.values() | 1127 values=self.fields_columns.values() |
1084 if type(key) is int: | 1128 if type(key) is int: |
1085 return Example(fieldnames, | 1129 return Example(fieldnames, |
1086 [self.data[key,col] for col in values]) | 1130 [self.data[key,col] for col in values]) |
1087 if type(key) is slice: | 1131 if type(key) is slice: |
1088 return MinibatchDataSet(Example(fieldnames, | 1132 return Example(fieldnames,[self.data[key,col] for col in values]) |
1089 [self.data[key,col] for col in values])) | |
1090 if type(key) is list: | 1133 if type(key) is list: |
1091 for i in range(len(key)): | 1134 for i in range(len(key)): |
1092 if self.hasFields(key[i]): | 1135 if self.hasFields(key[i]): |
1093 key[i]=self.fields_columns[key[i]] | 1136 key[i]=self.fields_columns[key[i]] |
1094 return MinibatchDataSet(Example(fieldnames, | 1137 return Example(fieldnames, |
1095 #we must separate differently for list as numpy | 1138 #we must separate differently for list as numpy |
1096 # doesn't support self.data[[i1,...],[i2,...]] | 1139 # doesn't support self.data[[i1,...],[i2,...]] |
1097 # when their is more then two i1 and i2 | 1140 # when their is more then two i1 and i2 |
1098 [self.data[key,:][:,col] | 1141 [self.data[key,:][:,col] |
1099 if isinstance(col,list) else | 1142 if isinstance(col,list) else |
1100 self.data[key,col] for col in values]), | 1143 self.data[key,col] for col in values]) |
1101 | |
1102 | |
1103 self.valuesVStack,self.valuesHStack) | |
1104 | 1144 |
1105 # else check for a fieldname | 1145 # else check for a fieldname |
1106 if self.hasFields(key): | 1146 if self.hasFields(key): |
1107 return self.data[:,self.fields_columns[key]] | 1147 return self.data[:,self.fields_columns[key]] |
1108 # else we are trying to access a property of the dataset | 1148 # else we are trying to access a property of the dataset |