comparison dataset.py @ 316:5fe6d0c93109

__getitem__ in ArrayDataSet is enabled again; it is supposed to be faster than the default one and has been tested against the default behaviour. In particular, it now always returns a LookupList.
author Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca>
date Wed, 11 Jun 2008 16:28:09 -0400
parents 105b54ac8260
children 14081904d8f3
comparison
equal deleted inserted replaced
315:b48cf8dce2bf 316:5fe6d0c93109
1026 def untested_valuesVStack(self,fieldname,values): 1026 def untested_valuesVStack(self,fieldname,values):
1027 """Concatenate field values vertically, e.g. two vectors 1027 """Concatenate field values vertically, e.g. two vectors
1028 become a two-row matrix, two matrices become a longer matrix, etc.""" 1028 become a two-row matrix, two matrices become a longer matrix, etc."""
1029 return numpy.vstack(values) 1029 return numpy.vstack(values)
1030 1030
1031
1032
class NArraysDataSet(ArrayFieldsDataSet):
    """
    An NArraysDataSet stores fields that are numpy tensors, whose first axis
    iterates over examples. It's a generalization of ArrayDataSet.

    NOTE: this class is not completely implemented yet; the constructor
    deliberately ends by raising NotImplementedError.
    """

    def __init__(self, data_arrays, fieldnames, **kwargs):
        """
        Construct an NArraysDataSet from a list of numpy tensors (data_arrays)
        and a list of fieldnames. The number of arrays must be the same as the
        number of fieldnames. Every tensor must have the same first dimension
        (first axis), corresponding to the number of examples.

        Every tensor is treated as a numpy matrix (using numpy.asmatrix).

        Extra keyword arguments are forwarded to ArrayFieldsDataSet.__init__.

        Raises AssertionError if the arguments are inconsistent, and
        NotImplementedError once the (incomplete) setup is done.
        """
        # Spell out '__init__' in full: a bare '__init' inside a class body is
        # name-mangled to '_NArraysDataSet__init' and does not exist.
        ArrayFieldsDataSet.__init__(self, **kwargs)
        assert len(data_arrays) == len(fieldnames)
        assert len(fieldnames) > 0

        # Convert each tensor exactly once, then validate the shared length.
        matrices = [numpy.asmatrix(array) for array in data_arrays]
        num_examples = matrices[0].shape[0]
        for matrix in matrices:
            assert matrix.shape[0] == num_examples

        self._fieldnames = fieldnames
        self._datas = matrices

        # The class is still a work in progress, so refuse to hand out
        # instances. NotImplementedError is the exception type;
        # NotImplemented is a constant and cannot be raised.
        raise NotImplementedError(
            "NArraysDataSet is not completely implemented yet")

    def __len__(self):
        """
        Length of the dataset is based on the first array (data_arrays[0]),
        using the first dimension of its shape.
        """
        return self._datas[0].shape[0]

    def fieldNames(self):
        """
        Returns the fieldnames as set in self.__init__
        """
        return self._fieldnames
1072
1073
1074
1031 class ArrayDataSet(ArrayFieldsDataSet): 1075 class ArrayDataSet(ArrayFieldsDataSet):
1032 """ 1076 """
1033 An ArrayDataSet stores the fields as groups of columns in a numpy tensor, 1077 An ArrayDataSet stores the fields as groups of columns in a numpy tensor,
1034 whose first axis iterates over examples, second axis determines fields. 1078 whose first axis iterates over examples, second axis determines fields.
1035 If the underlying array is N-dimensional (has N axes), then the field 1079 If the underlying array is N-dimensional (has N axes), then the field
1075 return self.fields_columns.keys() 1119 return self.fields_columns.keys()
1076 1120
1077 def __len__(self): 1121 def __len__(self):
1078 return len(self.data) 1122 return len(self.data)
1079 1123
1080 def dontuse__getitem__(self,key): 1124 def __getitem__(self,key):
1081 """More efficient implementation than the default __getitem__""" 1125 """More efficient implementation than the default __getitem__"""
1082 fieldnames=self.fields_columns.keys() 1126 fieldnames=self.fields_columns.keys()
1083 values=self.fields_columns.values() 1127 values=self.fields_columns.values()
1084 if type(key) is int: 1128 if type(key) is int:
1085 return Example(fieldnames, 1129 return Example(fieldnames,
1086 [self.data[key,col] for col in values]) 1130 [self.data[key,col] for col in values])
1087 if type(key) is slice: 1131 if type(key) is slice:
1088 return MinibatchDataSet(Example(fieldnames, 1132 return Example(fieldnames,[self.data[key,col] for col in values])
1089 [self.data[key,col] for col in values]))
1090 if type(key) is list: 1133 if type(key) is list:
1091 for i in range(len(key)): 1134 for i in range(len(key)):
1092 if self.hasFields(key[i]): 1135 if self.hasFields(key[i]):
1093 key[i]=self.fields_columns[key[i]] 1136 key[i]=self.fields_columns[key[i]]
1094 return MinibatchDataSet(Example(fieldnames, 1137 return Example(fieldnames,
1095 #we must separate differently for list as numpy 1138 #we must separate differently for list as numpy
1096 # doesn't support self.data[[i1,...],[i2,...]] 1139 # doesn't support self.data[[i1,...],[i2,...]]
1097 # when their is more then two i1 and i2 1140 # when their is more then two i1 and i2
1098 [self.data[key,:][:,col] 1141 [self.data[key,:][:,col]
1099 if isinstance(col,list) else 1142 if isinstance(col,list) else
1100 self.data[key,col] for col in values]), 1143 self.data[key,col] for col in values])
1101
1102
1103 self.valuesVStack,self.valuesHStack)
1104 1144
1105 # else check for a fieldname 1145 # else check for a fieldname
1106 if self.hasFields(key): 1146 if self.hasFields(key):
1107 return self.data[:,self.fields_columns[key]] 1147 return self.data[:,self.fields_columns[key]]
1108 # else we are trying to access a property of the dataset 1148 # else we are trying to access a property of the dataset