comparison dataset.py @ 7:6f8f338686db

Moved the iteration counter into a FiniteDataSetIterator to allow nested iterations and multiple threads iterating at the same time over a dataset.
author bengioy@bengiomac.local
date Mon, 24 Mar 2008 13:20:15 -0400
parents d5738b79089a
children d1c394486037
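
Why the change matters: in the parent revision a DataSet was its own iterator (__iter__ returned self) and kept a single shared position counter, so a nested loop or a second thread iterating over the same dataset would clobber the first one's position. After this changeset, FiniteDataSet.__iter__ hands out a fresh FiniteDataSetIterator whose 'current' counter is private to that loop. A minimal sketch of the resulting behaviour (TrivialDataSet is a hypothetical subclass written only for this illustration, not part of the changeset):

    import numpy

    # Hypothetical FiniteDataSet subclass, defined only for this sketch.
    class TrivialDataSet(FiniteDataSet):
        def __init__(self,data,minibatch_size=1):
            FiniteDataSet.__init__(self,minibatch_size)
            self.data=data
        def __len__(self):
            return len(self.data)
        def __getitem__(self,i):
            return self.data[i]

    d = TrivialDataSet(numpy.arange(3))
    # Each 'for' obtains its own FiniteDataSetIterator, so the inner
    # loop no longer disturbs the outer loop's position.
    for x in d:
        for y in d:
            print x, y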
--- a/dataset.py	6:d5738b79089a
+++ b/dataset.py	7:6f8f338686db
@@ -18,18 +18,15 @@
     def __init__(self,minibatch_size=1):
         assert minibatch_size>0
         self.minibatch_size=minibatch_size
 
     def __iter__(self):
-        return self
-
-    def next(self):
         """
-        Return the next example or the next minibatch in the dataset.
-        A minibatch (of length > 1) should be something one can iterate on again in order
-        to obtain the individual examples. If the dataset has fields,
-        then the example or the minibatch must have the same fields
+        Return an iterator, whose next() method returns the next example or the next
+        minibatch in the dataset. A minibatch (of length > 1) should be something one
+        can iterate on again in order to obtain the individual examples. If the dataset
+        has fields, then the example or the minibatch must have the same fields
         (typically this is implemented by returning another (small) dataset, when
         there are fields).
         """
         raise NotImplementedError
 
@@ -53,21 +50,53 @@
         """
 
     def __init__(self,minibatch_size):
         DataSet.__init__(self,minibatch_size)
 
+    def __iter__(self):
+        return FiniteDataSetIterator(self)
+
     def __len__(self):
         """len(dataset) returns the number of examples in the dataset."""
         raise NotImplementedError
 
     def __getitem__(self,i):
         """dataset[i] returns the (i+1)-th example of the dataset."""
         raise NotImplementedError
 
     def __getslice__(self,*slice_args):
         """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1."""
         raise NotImplementedError
+
+class FiniteDataSetIterator(object):
+    def __init__(self,dataset):
+        self.dataset=dataset
+        self.current = -self.dataset.minibatch_size
+
+    def next(self):
+        """
+        Return the next example(s) in the dataset. If self.dataset.minibatch_size>1 return that
+        many examples. If the dataset has fields, the example or the minibatch of examples
+        is just a minibatch_size-rows ArrayDataSet (so that the fields can be accessed),
+        but that resulting mini-dataset has a minibatch_size of 1, so that one can iterate
+        example-wise on it. On the other hand, if the dataset has no fields (e.g. because
+        it is already the field of a bigger dataset), then the returned example or minibatch
+        may be any indexable object, such as a numpy array. Following the array semantics of indexing
+        and slicing, if the minibatch_size is 1 (and there are no fields), then the result is an array
+        with one less dimension (e.g., a vector, if the dataset is a matrix), corresponding
+        to a row. Again, if the minibatch_size is >1, one can iterate on the result to
+        obtain individual examples (as rows).
+        """
+        self.current+=self.dataset.minibatch_size
+        if self.current>=len(self.dataset):
+            self.current=-self.dataset.minibatch_size
+            raise StopIteration
+        if self.dataset.minibatch_size==1:
+            return self.dataset[self.current]
+        else:
+            return self.dataset[self.current:self.current+self.dataset.minibatch_size]
+
 
 # we may want ArrayDataSet defined in another python file
 
 import numpy
 
@@ -86,11 +115,10 @@
         Construct an ArrayDataSet, either from a DataSet, or from
         a numpy array plus an optional specification of fields (by
         a dictionary of column slices indexed by field names).
         """
         FiniteDataSet.__init__(self,minibatch_size)
-        self.current_row=-1 # used for view of this dataset as an iterator
         if dataset!=None:
             assert data==None and fields=={}
             # convert dataset to an ArrayDataSet
             raise NotImplementedError
         if data!=None:
@@ -106,47 +134,24 @@
                 if not start:
                     start=0
                 if not step:
                     step=1
                 if not fieldslice.start or not fieldslice.step:
-                    fieldslice = slice(start,fieldslice.stop,step)
+                    fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step)
                 # and coherent with the data array
                 assert fieldslice.start>=0 and fieldslice.stop<=self.width
         assert minibatch_size<=len(self.data)
 
-    def next(self):
-        """
-        Return the next example(s) in the dataset. If self.minibatch_size>1 return that
-        many examples. If the dataset has fields, the example or the minibatch of examples
-        is just a minibatch_size-rows ArrayDataSet (so that the fields can be accessed),
-        but that resulting mini-dataset has a minibatch_size of 1, so that one can iterate
-        example-wise on it. On the other hand, if the dataset has no fields (e.g. because
-        it is already the field of a bigger dataset), then the returned example or minibatch
-        is a numpy array. Following the array semantics of indexing and slicing,
-        if the minibatch_size is 1 (and there are no fields), then the result is an array
-        with one less dimension (e.g., a vector, if the dataset is a matrix), corresponding
-        to a row. Again, if the minibatch_size is >1, one can iterate on the result to
-        obtain individual examples (as rows).
-        """
-        if self.fields:
-            self.current_row+=self.minibatch_size
-            if self.current_row>=len(self.data):
-                self.current_row=-self.minibatch_size
-                raise StopIteration
-            if self.minibatch_size==1:
-                return self[self.current_row]
-            else:
-                return self[self.current_row:self.current_row+self.minibatch_size]
-        else:
-            if self.minibatch_size==1:
-                return self.data[self.current_row]
-            else:
-                return self.data[self.current_row:self.current_row+self.minibatch_size]
-
     def __getattr__(self,fieldname):
-        """Return a numpy array with the content associated with the given field name."""
-        return self.data[self.fields[fieldname]]
+        """
+        Return a numpy array with the content associated with the given field name.
+        If this is a one-example dataset, then a row, i.e., numpy array (of one less dimension
+        than the dataset.data) is returned.
+        """
+        if len(self.data)==1:
+            return self.data[0,self.fields[fieldname]]
+        return self.data[:,self.fields[fieldname]]
 
     def __call__(self,*fieldnames):
         """Return a sub-dataset containing only the given fieldnames as fields."""
         min_col=self.data.shape[1]
         max_col=0
@@ -174,51 +179,53 @@
         if self.fields:
             if isinstance(i,slice):
                 return ArrayDataSet(data=data[slice],fields=self.fields)
             return ArrayDataSet(data=self.data[i:i+1],fields=self.fields)
         else:
-            return data[i]
+            return self.data[i]
 
     def __getslice__(self,*slice_args):
         """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1."""
         return ArrayDataSet(data=self.data[apply(slice,slice_args)],fields=self.fields)
 
     def asarray(self):
-        if self.fields:
-            columns_used = numpy.zeros((self.data.shape[1]),dtype=bool)
-            for field_slice in self.fields.values():
-                for c in xrange(field_slice.start,field_slice.stop,field_slice.step):
-                    columns_used[c]=True
-            # try to figure out if we can map all the slices into one slice:
-            mappable_to_one_slice = True
-            start=0
-            while start<len(columns_used) and not columns_used[start]:
-                start+=1
-            stop=len(columns_used)
-            while stop>0 and not columns_used[stop-1]:
-                stop-=1
-            step=0
-            i=start
-            while i<stop:
-                j=i+1
-                while not columns_used[j] and j<stop:
-                    j+=1
-                if step:
-                    if step!=j-i:
-                        mappable_to_one_slice = False
-                        break
-                else:
-                    step = j-i
-            if mappable_to_one_slice:
-                return data[slice(start,stop,step)]
-            # else make contiguous copy
-            n_columns = sum(columns_used)
-            result = zeros((len(self.data),n_columns)+self.data.shape[2:],self.data.dtype)
-            c=0
-            for field_slice in self.fields.values():
-                slice_width=field_slice.stop-field_slice.start/field_slice.step
-                # copy the field here
-                result[:,slice(c,slice_width)]=self.data[field_slice]
-                c+=slice_width
-            return result
-        return self.data
-
+        if not self.fields:
+            return self.data
+        # else, select subsets of columns mapped by the fields
+        columns_used = numpy.zeros((self.data.shape[1]),dtype=bool)
+        for field_slice in self.fields.values():
+            for c in xrange(field_slice.start,field_slice.stop,field_slice.step):
+                columns_used[c]=True
+        # try to figure out if we can map all the slices into one slice:
+        mappable_to_one_slice = True
+        start=0
+        while start<len(columns_used) and not columns_used[start]:
+            start+=1
+        stop=len(columns_used)
+        while stop>0 and not columns_used[stop-1]:
+            stop-=1
+        step=0
+        i=start
+        while i<stop:
+            j=i+1
+            while j<stop and not columns_used[j]:
+                j+=1
+            if step:
+                if step!=j-i:
+                    mappable_to_one_slice = False
+                    break
+            else:
+                step = j-i
+            i=j
+        if mappable_to_one_slice:
+            return self.data[:,slice(start,stop,step)]
+        # else make contiguous copy
+        n_columns = sum(columns_used)
+        result = zeros((len(self.data),n_columns)+self.data.shape[2:],self.data.dtype)
+        print result.shape
+        c=0
+        for field_slice in self.fields.values():
+            slice_width=field_slice.stop-field_slice.start/field_slice.step
+            # copy the field here
+            result[:,slice(c,slice_width)]=self.data[:,field_slice]
+            c+=slice_width
+        return result
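
A note on the rewritten asarray above: it marks every column used by some field slice, then tries to describe all used columns with a single slice(start,stop,step) so it can return a numpy view (self.data[:,slice(start,stop,step)]) instead of copying; only when the used columns are irregularly spaced does it fall back to building a contiguous copy field by field. A rough standalone reformulation of that slice-merging test, written only for illustration (merge_columns_to_slice is not part of the changeset):

    import numpy

    def merge_columns_to_slice(columns_used):
        # Return a slice covering exactly the True columns, or None when
        # the used columns do not form a single arithmetic progression.
        idx = [i for i in xrange(len(columns_used)) if columns_used[i]]
        if not idx:
            return None
        step = idx[1]-idx[0] if len(idx) > 1 else 1
        for a, b in zip(idx[:-1], idx[1:]):
            if b-a != step:
                return None
        return slice(idx[0], idx[-1]+1, step)

    mask = numpy.zeros(8, dtype=bool)
    mask[1] = mask[3] = mask[5] = True
    print merge_columns_to_slice(mask)   # -> slice(1, 6, 2)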