Mercurial > ift6266
comparison datasets/ftfile.py @ 257:966272e7f14b
Make the datasets lazy-loading and add a maxsize parameter.
field | value |
---|---|
author | Arnaud Bergeron <abergeron@gmail.com> |
date | Tue, 16 Mar 2010 18:51:27 -0400 |
parents | 1faae5079522 |
children | a92ec9939e4f |
comparison legend:
- equal
- deleted
- inserted
- replaced
248:7e6fecabb656 | 257:966272e7f14b |
---|---|
87 if self.scale != 1: | 87 if self.scale != 1: |
88 res /= self.scale | 88 res /= self.scale |
89 return res | 89 return res |
90 | 90 |
91 class FTSource(object): | 91 class FTSource(object): |
92 def __init__(self, file, skip=0, size=None, dtype=None, scale=1): | 92 def __init__(self, file, skip=0, size=None, maxsize=None, |
93 dtype=None, scale=1): | |
93 r""" | 94 r""" |
94 Create a data source from a possible subset of a .ft file. | 95 Create a data source from a possible subset of a .ft file. |
95 | 96 |
96 Parameters: | 97 Parameters: |
97 `file` (string) -- the filename | 98 `file` -- (string) the filename |
98 `skip` (int, optional) -- amount of examples to skip from | 99 `skip` -- (int, optional) amount of examples to skip from |
99 the start of the file. If | 100 the start of the file. If negative, skips |
100 negative, skips filesize - skip. | 101 filesize - skip. |
101 `size` (int, optional) -- truncates number of examples | 102 `size` -- (int, optional) truncates number of examples |
102 read (after skipping). If | 103 read (after skipping). If negative truncates to |
103 negative truncates to | 104 filesize - size (also after skipping). |
104 filesize - size | 105 `maxsize` -- (int, optional) the maximum size of the file |
105 (also after skipping). | 106 `dtype` -- (dtype, optional) convert the data to this |
106 `dtype` (dtype, optional) -- convert the data to this | 107 dtype after reading. |
107 dtype after reading. | 108 `scale` -- (number, optional) scale (that is divide) the |
108 `scale` (number, optional) -- scale (that is divide) the | 109 data by this number (after dtype conversion, if |
109 data by this number (after | 110 any). |
110 dtype conversion, if any). | 111 |
111 | 112 Tests: |
112 Tests: | 113 >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft') |
113 >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft') | 114 >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', size=1000) |
114 >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', size=1000) | 115 >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', skip=10) |
115 >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', skip=10) | 116 >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', skip=100, size=120) |
116 >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', skip=100, size=120) | |
117 """ | 117 """ |
118 self.file = file | 118 self.file = file |
119 self.skip = skip | 119 self.skip = skip |
120 self.size = size | 120 self.size = size |
121 self.dtype = dtype | 121 self.dtype = dtype |
122 self.scale = scale | 122 self.scale = scale |
123 self.maxsize = maxsize | |
123 | 124 |
124 def open(self): | 125 def open(self): |
125 r""" | 126 r""" |
126 Returns an FTFile that corresponds to this dataset. | 127 Returns an FTFile that corresponds to this dataset. |
127 | 128 |
128 Tests: | 129 Tests: |
129 >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft') | 130 >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft') |
130 >>> f = s.open() | 131 >>> f = s.open() |
131 >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', size=1) | 132 >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', size=1) |
132 >>> len(s.open().read(2)) | 133 >>> len(s.open().read(2)) |
133 1 | 134 1 |
134 >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', skip=57646) | 135 >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', skip=57646) |
135 >>> s.open().size | 136 >>> s.open().size |
136 1000 | 137 1000 |
137 >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', skip=57646, size=1) | 138 >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', skip=57646, size=1) |
138 >>> s.open().size | 139 >>> s.open().size |
139 1 | 140 1 |
140 >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', size=-10) | 141 >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', size=-10) |
141 >>> s.open().size | 142 >>> s.open().size |
142 58636 | 143 58636 |
143 """ | 144 """ |
144 f = FTFile(self.file, scale=self.scale, dtype=self.dtype) | 145 f = FTFile(self.file, scale=self.scale, dtype=self.dtype) |
145 if self.skip != 0: | 146 if self.skip != 0: |
146 f.skip(self.skip) | 147 f.skip(self.skip) |
147 if self.size is not None and self.size < f.size: | 148 if self.size is not None and self.size < f.size: |
148 if self.size < 0: | 149 if self.size < 0: |
149 f.size += self.size | 150 f.size += self.size |
151 if f.size < 0: | |
152 f.size = 0 | |
150 else: | 153 else: |
151 f.size = self.size | 154 f.size = self.size |
155 if self.maxsize is not None and f.size > self.maxsize: | |
156 f.size = self.maxsize | |
152 return f | 157 return f |
153 | 158 |
154 class FTData(object): | 159 class FTData(object): |
155 r""" | 160 r""" |
156 This is a list of FTSources. | 161 This is a list of FTSources. |
157 """ | 162 """ |
158 def __init__(self, datafiles, labelfiles, skip=0, size=None, | 163 def __init__(self, datafiles, labelfiles, skip=0, size=None, maxsize=None, |
159 inscale=1, indtype=None, outscale=1, outdtype=None): | 164 inscale=1, indtype=None, outscale=1, outdtype=None): |
160 self.inputs = [FTSource(f, skip, size, scale=inscale, dtype=indtype) | 165 if maxsize is not None: |
166 maxsize /= len(datafiles) | |
167 self.inputs = [FTSource(f, skip, size, maxsize, scale=inscale, dtype=indtype) | |
161 for f in datafiles] | 168 for f in datafiles] |
162 self.outputs = [FTSource(f, skip, size, scale=outscale, dtype=outdtype) | 169 self.outputs = [FTSource(f, skip, size, maxsize, scale=outscale, dtype=outdtype) |
163 for f in labelfiles] | 170 for f in labelfiles] |
164 | 171 |
165 def open_inputs(self): | 172 def open_inputs(self): |
166 return [f.open() for f in self.inputs] | 173 return [f.open() for f in self.inputs] |
167 | 174 |
168 def open_outputs(self): | 175 def open_outputs(self): |
169 return [f.open() for f in self.outputs] | 176 return [f.open() for f in self.outputs] |
170 | 177 |
171 | 178 |
172 class FTDataSet(DataSet): | 179 class FTDataSet(DataSet): |
173 def __init__(self, train_data, train_lbl, test_data, test_lbl, valid_data=None, valid_lbl=None, indtype=None, outdtype=None, inscale=1, outscale=1): | 180 def __init__(self, train_data, train_lbl, test_data, test_lbl, |
181 valid_data=None, valid_lbl=None, indtype=None, outdtype=None, | |
182 inscale=1, outscale=1, maxsize=None): | |
174 r""" | 183 r""" |
175 Defines a DataSet from a bunch of files. | 184 Defines a DataSet from a bunch of files. |
176 | 185 |
177 Parameters: | 186 Parameters: |
178 `train_data` -- list of train data files | 187 `train_data` -- list of train data files |
182 can differ from train. | 191 can differ from train. |
183 `valid_data`, `valid_labels` -- same thing again for validation. | 192 `valid_data`, `valid_labels` -- same thing again for validation. |
184 (optional) | 193 (optional) |
185 `indtype`, `outdtype`, -- see FTSource.__init__() | 194 `indtype`, `outdtype`, -- see FTSource.__init__() |
186 `inscale`, `outscale` (optional) | 195 `inscale`, `outscale` (optional) |
196 `maxsize` -- maximum size of the set returned | |
187 | 197 |
188 | 198 |
189 If `valid_data` and `valid_labels` are not supplied then a sample | 199 If `valid_data` and `valid_labels` are not supplied then a sample |
190 approximately equal in size to the test set is taken from the train | 200 approximately equal in size to the test set is taken from the train |
191 set. | 201 set. |
192 """ | 202 """ |
193 if valid_data is None: | 203 if valid_data is None: |
194 total_valid_size = sum(FTFile(td).size for td in test_data) | 204 total_valid_size = min(sum(FTFile(td).size for td in test_data), maxsize) |
195 valid_size = total_valid_size/len(train_data) | 205 valid_size = total_valid_size/len(train_data) |
196 self._train = FTData(train_data, train_lbl, size=-valid_size, | 206 self._train = FTData(train_data, train_lbl, size=-valid_size, |
197 inscale=inscale, outscale=outscale, indtype=indtype, | 207 inscale=inscale, outscale=outscale, |
198 outdtype=outdtype) | 208 indtype=indtype, outdtype=outdtype, |
209 maxsize=maxsize) | |
199 self._valid = FTData(train_data, train_lbl, skip=-valid_size, | 210 self._valid = FTData(train_data, train_lbl, skip=-valid_size, |
200 inscale=inscale, outscale=outscale, indtype=indtype, | 211 inscale=inscale, outscale=outscale, |
201 outdtype=outdtype) | 212 indtype=indtype, outdtype=outdtype, |
213 maxsize=maxsize) | |
202 else: | 214 else: |
203 self._train = FTData(train_data, train_lbl,inscale=inscale, | 215 self._train = FTData(train_data, train_lbl, maxsize=maxsize, |
204 outscale=outscale, indtype=indtype, outdtype=outdtype) | 216 inscale=inscale, outscale=outscale, |
205 self._valid = FTData(valid_data, valid_lbl,inscale=inscale, | 217 indtype=indtype, outdtype=outdtype) |
206 outscale=outscale, indtype=indtype, outdtype=outdtype) | 218 self._valid = FTData(valid_data, valid_lbl, maxsize=maxsize, |
207 self._test = FTData(test_data, test_lbl,inscale=inscale, | 219 inscale=inscale, outscale=outscale, |
208 outscale=outscale, indtype=indtype, outdtype=outdtype) | 220 indtype=indtype, outdtype=outdtype) |
221 self._test = FTData(test_data, test_lbl, maxsize=maxsize, | |
222 inscale=inscale, outscale=outscale, | |
223 indtype=indtype, outdtype=outdtype) | |
209 | 224 |
210 def _return_it(self, batchsize, bufsize, ftdata): | 225 def _return_it(self, batchsize, bufsize, ftdata): |
211 return izip(DataIterator(ftdata.open_inputs(), batchsize, bufsize), | 226 return izip(DataIterator(ftdata.open_inputs(), batchsize, bufsize), |
212 DataIterator(ftdata.open_outputs(), batchsize, bufsize)) | 227 DataIterator(ftdata.open_outputs(), batchsize, bufsize)) |