comparison dataset.py @ 7:6f8f338686db
Moved iterating counter into a FiniteDataSetIterator to allow embedded iterations and multiple threads iterating at the same time on a dataset.
author | bengioy@bengiomac.local |
---|---|
date | Mon, 24 Mar 2008 13:20:15 -0400 |
parents | d5738b79089a |
children | d1c394486037 |
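The change is easiest to see as a pattern: before, the dataset itself carried the iteration cursor, so two simultaneous loops over the same dataset clobbered each other; after, `__iter__` hands out a fresh iterator object holding its own cursor. A minimal, self-contained sketch of that pattern (`ToyDataSet` and `ToyIterator` are hypothetical stand-ins, not pylearn classes, and use Python 3 spelling where the diff below uses Python 2's `next`):

```python
# Sketch: per-iterator state instead of per-dataset state, so nested loops
# (or multiple threads, each holding its own iterator) do not share a cursor.
class ToyDataSet(object):
    def __init__(self, rows):
        self.rows = rows
    def __len__(self):
        return len(self.rows)
    def __getitem__(self, i):
        return self.rows[i]
    def __iter__(self):
        return ToyIterator(self)   # fresh cursor on every call

class ToyIterator(object):
    def __init__(self, dataset):
        self.dataset = dataset
        self.current = -1
    def __iter__(self):
        return self
    def __next__(self):            # the 2008 code spells this 'next'
        self.current += 1
        if self.current >= len(self.dataset):
            raise StopIteration
        return self.dataset[self.current]

d = ToyDataSet([1, 2, 3])
# The nested loop now works: each 'for' gets its own ToyIterator.
print([(a, b) for a in d for b in d])
```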
6:d5738b79089a | 7:6f8f338686db |
---|---|
18 def __init__(self,minibatch_size=1): | 18 def __init__(self,minibatch_size=1): |
19 assert minibatch_size>0 | 19 assert minibatch_size>0 |
20 self.minibatch_size=minibatch_size | 20 self.minibatch_size=minibatch_size |
21 | 21 |
22 def __iter__(self): | 22 def __iter__(self): |
23 return self | 23 """ |
24 | 24 Return an iterator whose next() method returns the next example or the next |
25 def next(self): | 25 minibatch in the dataset. A minibatch (of length > 1) should be something one |
26 """ | 26 can iterate on again in order to obtain the individual examples. If the dataset |
27 Return the next example or the next minibatch in the dataset. | 27 has fields, then the example or the minibatch must have the same fields |
28 A minibatch (of length > 1) should be something one can iterate on again in order | |
29 to obtain the individual examples. If the dataset has fields, | |
30 then the example or the minibatch must have the same fields | |
31 (typically this is implemented by returning another (small) dataset, when | 28 (typically this is implemented by returning another (small) dataset, when |
32 there are fields). | 29 there are fields). |
33 """ | 30 """ |
34 raise NotImplementedError | 31 raise NotImplementedError |
35 | 32 |
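The contract in the docstring above is worth a concrete example: with `minibatch_size > 1`, each item the iterator yields must itself be iterable, so callers recover individual examples with a nested loop. A runnable sketch using a plain numpy array in place of a DataSet (the slicing stands in for what `ArrayDataSet`, further down, does through `__getslice__`):

```python
import numpy

data = numpy.arange(12).reshape(6, 2)   # six examples, two columns
minibatch_size = 3
for start in range(0, len(data), minibatch_size):
    minibatch = data[start:start + minibatch_size]
    for example in minibatch:           # a minibatch can be iterated on again
        print(example)                  # one row per example
```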
53 """ | 50 """ |
54 | 51 |
55 def __init__(self,minibatch_size): | 52 def __init__(self,minibatch_size): |
56 DataSet.__init__(self,minibatch_size) | 53 DataSet.__init__(self,minibatch_size) |
57 | 54 |
| 55 def __iter__(self): |
| 56 return FiniteDataSetIterator(self) |
| 57 |
58 def __len__(self): | 58 def __len__(self): |
59 """len(dataset) returns the number of examples in the dataset.""" | 59 """len(dataset) returns the number of examples in the dataset.""" |
60 raise NotImplementedError | 60 raise NotImplementedError |
61 | 61 |
62 def __getitem__(self,i): | 62 def __getitem__(self,i): |
64 raise NotImplementedError | 64 raise NotImplementedError |
65 | 65 |
66 def __getslice__(self,*slice_args): | 66 def __getslice__(self,*slice_args): |
67 """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.""" | 67 """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.""" |
68 raise NotImplementedError | 68 raise NotImplementedError |
| 69 |
| 70 class FiniteDataSetIterator(object): |
| 71 def __init__(self,dataset): |
| 72 self.dataset=dataset |
| 73 self.current = -self.dataset.minibatch_size |
| 74 |
| 75 def next(self): |
| 76 """ |
| 77 Return the next example(s) in the dataset. If self.dataset.minibatch_size>1, return that |
| 78 many examples. If the dataset has fields, the example or the minibatch of examples |
| 79 is just a minibatch_size-row ArrayDataSet (so that the fields can be accessed), |
| 80 but that resulting mini-dataset has a minibatch_size of 1, so that one can iterate |
| 81 example-wise on it. On the other hand, if the dataset has no fields (e.g. because |
| 82 it is already the field of a bigger dataset), then the returned example or minibatch |
| 83 may be any indexable object, such as a numpy array. Following the array semantics of indexing |
| 84 and slicing, if the minibatch_size is 1 (and there are no fields), then the result is an array |
| 85 with one less dimension (e.g., a vector, if the dataset is a matrix), corresponding |
| 86 to a row. Again, if the minibatch_size is >1, one can iterate on the result to |
| 87 obtain individual examples (as rows). |
| 88 """ |
| 89 self.current+=self.dataset.minibatch_size |
| 90 if self.current>=len(self.dataset): |
| 91 self.current=-self.dataset.minibatch_size |
| 92 raise StopIteration |
| 93 if self.dataset.minibatch_size==1: |
| 94 return self.dataset[self.current] |
| 95 else: |
| 96 return self.dataset[self.current:self.current+self.dataset.minibatch_size] |
| 97 |
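Note that the new `FiniteDataSetIterator` defines `next()` but not `__iter__`: a `for` loop over the dataset still works, because the loop calls `next()` on whatever `FiniteDataSet.__iter__` returns, but the iterator object itself cannot be handed to a second `for` loop. A modernized equivalent that closes that gap (a sketch in Python 3 spelling, not the committed code):

```python
class FiniteDataSetIterator(object):
    def __init__(self, dataset):
        self.dataset = dataset
        self.current = -dataset.minibatch_size
    def __iter__(self):
        return self                      # makes the iterator itself iterable
    def __next__(self):                  # 'next' in the Python 2 original
        self.current += self.dataset.minibatch_size
        if self.current >= len(self.dataset):
            raise StopIteration
        if self.dataset.minibatch_size == 1:
            return self.dataset[self.current]
        return self.dataset[self.current:self.current + self.dataset.minibatch_size]
```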
69 | 98 |
70 # we may want ArrayDataSet defined in another python file | 99 # we may want ArrayDataSet defined in another python file |
71 | 100 |
72 import numpy | 101 import numpy |
73 | 102 |
86 Construct an ArrayDataSet, either from a DataSet, or from | 115 Construct an ArrayDataSet, either from a DataSet, or from |
87 a numpy array plus an optional specification of fields (by | 116 a numpy array plus an optional specification of fields (by |
88 a dictionary of column slices indexed by field names). | 117 a dictionary of column slices indexed by field names). |
89 """ | 118 """ |
90 FiniteDataSet.__init__(self,minibatch_size) | 119 FiniteDataSet.__init__(self,minibatch_size) |
91 self.current_row=-1 # used for view of this dataset as an iterator | |
92 if dataset is not None: | 120 if dataset is not None: |
93 assert data is None and fields=={} | 121 assert data is None and fields=={} |
94 # convert dataset to an ArrayDataSet | 122 # convert dataset to an ArrayDataSet |
95 raise NotImplementedError | 123 raise NotImplementedError |
96 if data is not None: | 124 if data is not None: |
106 if not start: | 134 if not start: |
107 start=0 | 135 start=0 |
108 if not step: | 136 if not step: |
109 step=1 | 137 step=1 |
110 if not fieldslice.start or not fieldslice.step: | 138 if not fieldslice.start or not fieldslice.step: |
111 fieldslice = slice(start,fieldslice.stop,step) | 139 fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step) |
112 # and coherent with the data array | 140 # and coherent with the data array |
113 assert fieldslice.start>=0 and fieldslice.stop<=self.width | 141 assert fieldslice.start>=0 and fieldslice.stop<=self.width |
114 assert minibatch_size<=len(self.data) | 142 assert minibatch_size<=len(self.data) |
115 | 143 |
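The constructor above treats `fields` as a dictionary mapping field names to column slices over a 2-D array and normalizes each slice to an explicit `start` and `step`; the right-hand side also fixes a subtle bug by writing the normalized slice back into the dictionary. A runnable sketch of that normalization outside the class (the field names are made up for illustration):

```python
import numpy

data = numpy.random.rand(5, 4)
fields = {'input': slice(0, 3), 'target': slice(3, 4)}   # start/step implicit
for fieldname, fieldslice in fields.items():
    start = fieldslice.start or 0
    step = fieldslice.step or 1
    # store the normalized slice back, as the new version of the code does
    fields[fieldname] = slice(start, fieldslice.stop, step)
    assert fields[fieldname].start >= 0 and fields[fieldname].stop <= data.shape[1]
```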
116 def next(self): | |
117 """ | |
118 Return the next example(s) in the dataset. If self.minibatch_size>1 return that | |
119 many examples. If the dataset has fields, the example or the minibatch of examples | |
120 is just a minibatch_size-rows ArrayDataSet (so that the fields can be accessed), | |
121 but that resulting mini-dataset has a minibatch_size of 1, so that one can iterate | |
122 example-wise on it. On the other hand, if the dataset has no fields (e.g. because | |
123 it is already the field of a bigger dataset), then the returned example or minibatch | |
124 is a numpy array. Following the array semantics of indexing and slicing, | |
125 if the minibatch_size is 1 (and there are no fields), then the result is an array | |
126 with one less dimension (e.g., a vector, if the dataset is a matrix), corresponding | |
127 to a row. Again, if the minibatch_size is >1, one can iterate on the result to | |
128 obtain individual examples (as rows). | |
129 """ | |
130 if self.fields: | |
131 self.current_row+=self.minibatch_size | |
132 if self.current_row>=len(self.data): | |
133 self.current_row=-self.minibatch_size | |
134 raise StopIteration | |
135 if self.minibatch_size==1: | |
136 return self[self.current_row] | |
137 else: | |
138 return self[self.current_row:self.current_row+self.minibatch_size] | |
139 else: | |
140 if self.minibatch_size==1: | |
141 return self.data[self.current_row] | |
142 else: | |
143 return self.data[self.current_row:self.current_row+self.minibatch_size] | |
144 | |
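The deleted docstring leans on numpy's indexing semantics: integer indexing drops a dimension, slicing keeps it. That is what makes `minibatch_size == 1` return a row vector while larger minibatches stay 2-D. In brief:

```python
import numpy

a = numpy.arange(6).reshape(3, 2)
print(a[0].shape)      # (2,)   integer index: one example, one less dimension
print(a[0:1].shape)    # (1, 2) slice of one: still a 2-D minibatch
print(a[0:2].shape)    # (2, 2) minibatch of two; iterating it yields rows
```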
145 def __getattr__(self,fieldname): | 144 def __getattr__(self,fieldname): |
146 """Return a numpy array with the content associated with the given field name.""" | 145 """ |
147 return self.data[self.fields[fieldname]] | 146 Return a numpy array with the content associated with the given field name. |
| 147 If this is a one-example dataset, then a row, i.e., a numpy array (of one less dimension |
| 148 than the dataset.data) is returned. |
| 149 """ |
| 150 if len(self.data)==1: |
| 151 return self.data[0,self.fields[fieldname]] |
| 152 return self.data[:,self.fields[fieldname]] |
148 | 153 |
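Written out directly, the attribute access above is just a column slice of the underlying array, with the new one-example special case returning the row itself (one less dimension):

```python
import numpy

data = numpy.arange(8).reshape(2, 4)
fields = {'input': slice(0, 3, 1)}       # illustrative field layout
print(data[:, fields['input']].shape)    # (2, 3): all examples of the field
print(data[0, fields['input']].shape)    # (3,):  the one-example case
```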
149 def __call__(self,*fieldnames): | 154 def __call__(self,*fieldnames): |
150 """Return a sub-dataset containing only the given fieldnames as fields.""" | 155 """Return a sub-dataset containing only the given fieldnames as fields.""" |
151 min_col=self.data.shape[1] | 156 min_col=self.data.shape[1] |
152 max_col=0 | 157 max_col=0 |
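The excerpt cuts off the body of `__call__` (old lines 153-173 are not shown), but the `min_col`/`max_col` bookkeeping suggests it narrows the data to the column range spanned by the requested fields. A sketch of that bookkeeping on its own (assumed behavior, since the full body is not visible here):

```python
fields = {'input': slice(0, 3, 1), 'target': slice(3, 4, 1)}
chosen = ('input',)                      # fieldnames passed to __call__
min_col = min(fields[f].start for f in chosen)
max_col = max(fields[f].stop for f in chosen)
print(min_col, max_col)                  # the sub-dataset keeps columns [0, 3)
```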
174 if self.fields: | 179 if self.fields: |
175 if isinstance(i,slice): | 180 if isinstance(i,slice): |
176 return ArrayDataSet(data=self.data[i],fields=self.fields) | 181 return ArrayDataSet(data=self.data[i],fields=self.fields) |
177 return ArrayDataSet(data=self.data[i:i+1],fields=self.fields) | 182 return ArrayDataSet(data=self.data[i:i+1],fields=self.fields) |
178 else: | 183 else: |
179 return data[i] | 184 return self.data[i] |
180 | 185 |
181 def __getslice__(self,*slice_args): | 186 def __getslice__(self,*slice_args): |
182 """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.""" | 187 """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.""" |
183 return ArrayDataSet(data=self.data[apply(slice,slice_args)],fields=self.fields) | 188 return ArrayDataSet(data=self.data[apply(slice,slice_args)],fields=self.fields) |
184 | 189 |
185 def asarray(self): | 190 def asarray(self): |
186 if self.fields: | 191 if not self.fields: |
187 columns_used = numpy.zeros((self.data.shape[1]),dtype=bool) | 192 return self.data |
188 for field_slice in self.fields.values(): | 193 # else, select subsets of columns mapped by the fields |
189 for c in xrange(field_slice.start,field_slice.stop,field_slice.step): | 194 columns_used = numpy.zeros((self.data.shape[1]),dtype=bool) |
190 columns_used[c]=True | 195 for field_slice in self.fields.values(): |
191 # try to figure out if we can map all the slices into one slice: | 196 for c in xrange(field_slice.start,field_slice.stop,field_slice.step): |
192 mappable_to_one_slice = True | 197 columns_used[c]=True |
193 start=0 | 198 # try to figure out if we can map all the slices into one slice: |
194 while start<len(columns_used) and not columns_used[start]: | 199 mappable_to_one_slice = True |
195 start+=1 | 200 start=0 |
196 stop=len(columns_used) | 201 while start<len(columns_used) and not columns_used[start]: |
197 while stop>0 and not columns_used[stop-1]: | 202 start+=1 |
198 stop-=1 | 203 stop=len(columns_used) |
199 step=0 | 204 while stop>0 and not columns_used[stop-1]: |
200 i=start | 205 stop-=1 |
201 while i<stop: | 206 step=0 |
202 j=i+1 | 207 i=start |
203 while not columns_used[j] and j<stop: | 208 while i<stop: |
204 j+=1 | 209 j=i+1 |
205 if step: | 210 while j<stop and not columns_used[j]: |
206 if step!=j-i: | 211 j+=1 |
207 mappable_to_one_slice = False | 212 if step: |
208 break | 213 if step!=j-i: |
209 else: | 214 mappable_to_one_slice = False |
210 step = j-i | 215 break |
211 if mappable_to_one_slice: | 216 else: |
212 return data[slice(start,stop,step)] | 217 step = j-i |
213 # else make contiguous copy | 218 i=j |
214 n_columns = sum(columns_used) | 219 if mappable_to_one_slice: |
215 result = numpy.zeros((len(self.data),n_columns)+self.data.shape[2:],self.data.dtype) | 220 return self.data[:,slice(start,stop,step)] |
216 c=0 | 221 # else make contiguous copy |
217 for field_slice in self.fields.values(): | 222 n_columns = sum(columns_used) |
218 slice_width=(field_slice.stop-field_slice.start)/field_slice.step | 223 result = numpy.zeros((len(self.data),n_columns)+self.data.shape[2:],self.data.dtype) |
219 # copy the field here | 224 print result.shape |
220 result[:,slice(c,c+slice_width)]=self.data[field_slice] | 225 c=0 |
221 c+=slice_width | 226 for field_slice in self.fields.values(): |
222 return result | 227 slice_width=(field_slice.stop-field_slice.start)/field_slice.step |
223 return self.data | 228 # copy the field here |
224 | 229 result[:,slice(c,c+slice_width)]=self.data[:,field_slice] |
| 230 c+=slice_width |
| 231 return result |
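The rewritten `asarray` first marks every column any field touches, then checks whether the marked columns form a single arithmetic progression; if they do, one slice (a cheap view) suffices, otherwise the columns are copied into a contiguous array. The detection step in miniature (a standalone sketch, not the committed helper):

```python
def columns_as_one_slice(columns_used):
    """Return a slice covering exactly the True positions, or None."""
    cols = [c for c, used in enumerate(columns_used) if used]
    if not cols:
        return None
    step = cols[1] - cols[0] if len(cols) > 1 else 1
    if all(b - a == step for a, b in zip(cols, cols[1:])):
        return slice(cols[0], cols[-1] + 1, step)
    return None   # gaps are irregular: fall back to a contiguous copy

print(columns_as_one_slice([True, False, True, False, True]))  # slice(0, 5, 2)
print(columns_as_one_slice([True, True, False, True, False]))  # None
```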