Mercurial > ift6266
comparison utils/seriestables/series.py @ 217:de3aef84714a
merge, second try
author | Dumitru Erhan <dumitru.erhan@gmail.com> |
---|---|
date | Wed, 10 Mar 2010 17:08:50 -0500 |
parents | a96fa4de06d2 |
children | 4c137f16b013 |
comparison
equal
deleted
inserted
replaced
216:c89004f9cab2 | 217:de3aef84714a |
---|---|
1 from tables import * | |
2 import numpy | |
3 | |
4 ''' | |
5 The way these "IsDescription constructors" work is simple: write the | |
6 code as if it were in a file, then exec()ute it, leaving us with | |
7 a local-scoped LocalDescription which may be used to call createTable. | |
8 | |
9 It's a small hack, but it's necessary as the names of the columns | |
10 are retrieved based on the variable name, which we can't programmatically set | |
11 otherwise. | |
12 ''' | |
13 | |
def get_beginning_description_n_ints(int_names, int_width=64):
    """
    Begin the source text of an IsDescription subclass with int columns.

    The returned text declares ``class LocalDescription(IsDescription)``
    followed by one integer column line per entry of int_names. Columns
    are given consecutive positions 0, 1, 2, ... in the order of
    int_names. Callers append further column lines and then exec() the
    complete text.

    See Series().__init__ to see how those are used.

    Parameters
    ----------
    int_names : sequence of str
        Names of the int (e.g. index) columns, in column order.
    int_width : {64, 32}
        Width of the ints. 32 selects Int32Col; any other value
        selects Int64Col.

    Returns
    -------
    str
        Python source text, one tab-indented line per column.
    """
    int_constructor = "Int32Col" if int_width == 32 else "Int64Col"

    lines = ["class LocalDescription(IsDescription):\n"]

    # Each column needs its own position; enumerate() provides it.
    for pos, name in enumerate(int_names):
        lines.append("\t%s = %s(pos=%d)\n" % (name, int_constructor, pos))

    return "".join(lines)
33 | |
def get_description_with_n_ints_n_floats(int_names, float_names, int_width=64, float_width=32):
    """
    Constructs a class to be used when constructing a table with PyTables.

    This is useful to construct a series with an index with multiple levels.
    E.g. if you want to index your "validation error" with "epoch" first, then
    "minibatch_index" second, you'd use two "int_names".

    Parameters
    ----------
    int_names : tuple of str
        Names of the int (e.g. index) columns
    float_names : tuple of str
        Names of the float (e.g. error) columns
    int_width : {64, 32}
        Width of the ints, forwarded to
        get_beginning_description_n_ints().
    float_width : {32, 64}
        Width of the floats. 64 selects Float64Col; any other value
        selects Float32Col.

    Returns
    -------
    A class object, to pass to createTable()
    """
    toexec = get_beginning_description_n_ints(int_names, int_width=int_width)

    float_constructor = "Float64Col" if float_width == 64 else "Float32Col"

    # Float columns are placed right after the int columns, each at its
    # own position.
    first_float_pos = len(int_names)
    for offset, name in enumerate(float_names):
        toexec += "\t%s = %s(pos=%d)\n" % (name, float_constructor,
                                           first_float_pos + offset)

    # Execute into an explicit namespace rather than relying on exec
    # injecting a name into this function's locals. The module globals
    # supply IsDescription and the *Col constructors.
    # NOTE(review): column names are spliced into executed source; they
    # must be trusted, valid Python identifiers.
    namespace = {}
    exec(toexec, globals(), namespace)

    return namespace["LocalDescription"]
72 | |
class Series():
    """Common base of the series classes: records the basic table settings."""

    def __init__(self, table_name, hdf5_file, index_names=('epoch',), title="", hdf5_group='/'):
        """Basic arguments each Series must get.

        Parameters
        ----------
        table_name : str
            Name of the table to create under group "hdf5_group" (other
            parameter). No spaces, i.e. follow variable naming restrictions.
        hdf5_file : open HDF5 file
            File opened with openFile() in PyTables (i.e. the return value
            of openFile).
        index_names : tuple of str
            Columns to use as index for elements in the series. Another
            example would be ('epoch', 'minibatch'), which would then allow
            you to call append(index, element) with an index made of two
            ints: one for the epoch index, one for the minibatch index in
            the epoch.
        title : str
            Title to attach to this table as metadata. Can contain spaces
            and be longer than the table_name.
        hdf5_group : str
            Path of the group (kind of a file) in the HDF5 file under which
            to create the table. This base constructor does not store it.
        """
        self.title = title
        self.index_names = index_names
        self.hdf5_file = hdf5_file
        self.table_name = table_name

    def append(self, index, element):
        """Not implemented in the base class; always raises NotImplementedError."""
        raise NotImplementedError()
97 | |
98 # To put in a series dictionary instead of a real series, to do nothing | |
99 # when we don't want a given series to be saved. | |
class DummySeries():
    """Stand-in series whose append() does nothing."""

    def append(self, index, element):
        """Ignore both arguments; nothing is stored."""
        return None
103 | |
class ErrorSeries(Series):
    """
    Series storing a single float column per row, next to the index columns.

    The table is created in the constructor, with one int column per entry
    of index_names and one float column named error_name.
    """

    def __init__(self, error_name, table_name, hdf5_file, index_names=('epoch',), title="", hdf5_group='/'):
        """
        Parameters
        ----------
        error_name : str
            Name of the float column holding the error values.
        table_name : str
            Name of the table to create under hdf5_group.
        hdf5_file : open HDF5 file
            Object on which createTable() and flush() are called.
        index_names : tuple of str
            Names of the int index columns.
        title : str
            Title passed to createTable().
        hdf5_group : str
            Path of the group under which the table is created.
        """
        Series.__init__(self, table_name, hdf5_file, index_names, title)

        self.error_name = error_name

        table_description = self._get_table_description()

        self._table = hdf5_file.createTable(hdf5_group, self.table_name, table_description, title=title)

    def _get_table_description(self):
        """Build the description class: index int columns plus one float column."""
        return get_description_with_n_ints_n_floats(self.index_names, (self.error_name,))

    def append(self, index, error):
        """
        Append one row to the table and flush the HDF5 file.

        Parameters
        ----------
        index : tuple of int
            Following index_names passed to __init__, e.g. (12, 15) if index_names were ('epoch', 'minibatch_size')
        error : float
            Next error in the series.

        Raises
        ------
        ValueError
            If index does not have as many entries as index_names.
        """
        if len(index) != len(self.index_names):
            raise ValueError(
                "index provided does not have the right length (expected "
                + str(len(self.index_names)) + " got " + str(len(index)) + ")")

        newrow = self._table.row

        # Columns for index in table are based on index_names
        for col_name, value in zip(self.index_names, index):
            newrow[col_name] = value
        newrow[self.error_name] = error

        newrow.append()

        # Flushed on every append, as in the original implementation.
        self.hdf5_file.flush()
140 | |
141 # Does not inherit from Series because it does not itself need to | |
142 # access the hdf5_file and does not need a series_name (provided | |
143 # by the base_series.) | |
class AccumulatorSeriesWrapper():
    """
    Buffers appended elements and forwards one reduced value per batch.

    Every reduce_every calls to append(), the buffered elements are passed
    to reduce_function and the result is appended to base_series.
    """

    def __init__(self, base_series, reduce_every, reduce_function=numpy.mean):
        """
        Parameters
        ----------
        base_series : Series
            Object with an append(index, value) function; it receives the
            reduced values.
        reduce_every : int
            Number of elements to collect before reducing. E.g. if this is
            100, then every 100 numbers passed to append(), the reduction
            (by default the mean) is computed and appended to base_series.
        reduce_function : function
            Takes the list of buffered "elements", as passed to (this
            accumulator's) append(), and returns the value to forward.
            Basic case would be to take floats and reduce them to one float.
        """
        self._buffer = []

        self.reduce_every = reduce_every
        self.reduce_function = reduce_function
        self.base_series = base_series

    def append(self, index, element):
        """
        Parameters
        ----------
        index : tuple of int
            Only the index given with the element that completes a batch
            is forwarded to base_series.append(); indices of the other
            elements are ignored.
        element : float
            Element that will be accumulated.
        """
        pending = self._buffer
        pending.append(element)

        if len(pending) == self.reduce_every:
            # Batch complete: reduce, forward, then start a fresh buffer.
            self.base_series.append(index, self.reduce_function(pending))
            self._buffer = []

        # This should never happen, except if lists
        # were appended, which should be a red flag.
        assert len(self._buffer) < self.reduce_every
187 | |
188 # Outside of class to fix an issue with exec in Python 2.6. | |
189 # My sorries to the God of pretty code. | |
def _BasicStatisticsSeries_construct_table_toexec(index_names):
    """
    Build the table description class used by BasicStatisticsSeries.

    The class has one int column per entry of index_names (as generated
    by get_beginning_description_n_ints), followed by four Float32Col
    columns named mean, min, max and std, in that order.

    Parameters
    ----------
    index_names : tuple of str
        Names of the int index columns.

    Returns
    -------
    A class object, to pass to createTable()
    """
    toexec = get_beginning_description_n_ints(index_names)

    # The statistics columns come right after the index columns.
    first_stat_pos = len(index_names)
    for offset, stat_name in enumerate(("mean", "min", "max", "std")):
        toexec += "\t%s = Float32Col(pos=%d)\n" % (stat_name,
                                                   first_stat_pos + offset)

    # Execute into an explicit namespace rather than relying on exec
    # injecting "LocalDescription" into this function's locals. The
    # module globals supply IsDescription and the *Col constructors.
    namespace = {}
    exec(toexec, globals(), namespace)

    return namespace["LocalDescription"]
203 | |
# Default reducers for BasicStatisticsSeries: one function per statistics
# column, each taking what is passed to append() and returning one number.
basic_stats_functions = {'mean': numpy.mean,
                         'min': numpy.min,
                         'max': numpy.max,
                         'std': numpy.std}
208 | |
class BasicStatisticsSeries(Series):
    """
    Series storing the mean, min, max and std of whatever is appended.

    Parameters
    ----------
    table_name : str
        Name of the table to create under hdf5_group.
    hdf5_file : open HDF5 file
        Object on which createTable() and flush() are called.
    stats_functions : dict, optional
        Dictionary with a function for each key "mean", "min", "max", "std". The function must take whatever is passed to append(...) and return a single number (float).
    index_names : tuple of str
        Names of the int index columns.
    title : str
        Title attached to the table when it is created.
    hdf5_group : str
        Path of the group under which the table is created.
    """
    def __init__(self, table_name, hdf5_file, stats_functions=basic_stats_functions, index_names=('epoch',), title="", hdf5_group='/'):
        Series.__init__(self, table_name, hdf5_file, index_names, title)

        self.hdf5_group = hdf5_group

        # The default dict is shared module-level state; it is only read here.
        self.stats_functions = stats_functions

        self._construct_table()

    def _construct_table(self):
        """Create the table with the index columns plus mean/min/max/std."""
        table_description = _BasicStatisticsSeries_construct_table_toexec(self.index_names)

        # Pass the title along, as ErrorSeries does for its own table.
        self._table = self.hdf5_file.createTable(self.hdf5_group, self.table_name, table_description, title=self.title)

    def append(self, index, array):
        """
        Append one row of statistics to the table and flush the HDF5 file.

        Parameters
        ----------
        index : tuple of int
            Following index_names passed to __init__, e.g. (12, 15) if index_names were ('epoch', 'minibatch_size')
        array
            Is of whatever type the stats_functions passed to __init__ can take. Default is anything numpy.mean(), min(), max(), std() can take.

        Raises
        ------
        ValueError
            If index does not have as many entries as index_names.
        """
        if len(index) != len(self.index_names):
            raise ValueError(
                "index provided does not have the right length (expected "
                + str(len(self.index_names)) + " got " + str(len(index)) + ")")

        newrow = self._table.row

        for col_name, value in zip(self.index_names, index):
            newrow[col_name] = value

        # One column per statistic, computed by the function of the same key.
        for stat_name in ("mean", "min", "max", "std"):
            newrow[stat_name] = self.stats_functions[stat_name](array)

        newrow.append()

        # Flushed on every append, as in the original implementation.
        self.hdf5_file.flush()
258 | |
class SeriesArrayWrapper():
    """
    Simply redistributes any number of elements to sub-series to respective append()s.

    To use if you have many elements to append in similar series, e.g. if you have an array containing [train_error, valid_error, test_error], and 3 corresponding series, this allows you to simply pass this array of 3 values to append() instead of passing each element to each individual series in turn.
    """

    def __init__(self, base_series_list):
        """
        Parameters
        ----------
        base_series_list : list
            Objects with an append(index, element) function, in the same
            order as the elements later passed to append().
        """
        self.base_series_list = base_series_list

    def append(self, index, elements):
        """
        Pass each element, with the same index, to the matching sub-series.

        Parameters
        ----------
        index
            Forwarded unchanged to every sub-series.
        elements : sequence
            One element per sub-series, in the order of base_series_list.

        Raises
        ------
        ValueError
            If elements does not have as many entries as base_series_list.
            Nothing is appended in that case.
        """
        if len(elements) != len(self.base_series_list):
            raise ValueError(
                "not enough or too much elements provided (expected "
                + str(len(self.base_series_list)) + " got " + str(len(elements)) + ")")

        for series, el in zip(self.base_series_list, elements):
            series.append(index, el)
276 | |
class SharedParamsStatisticsWrapper(SeriesArrayWrapper):
    '''Save mean, min/max, std of shared parameters placed in an array.

    This is specifically for cases where we have _shared_ parameters,
    as we take the .value of each array'''

    def __init__(self, arrays_names, new_group_name, hdf5_file, base_group='/', index_names=('epoch',), title=""):
        """
        Parameters
        ----------
        arrays_names : array of str
            Name of each array, in order of the array passed to append(). E.g. ('layer1_b', 'layer1_W', 'layer2_b', 'layer2_W')
        new_group_name : str
            Name of a new HDF5 group which will be created under base_group to store the new series.
        hdf5_file : open HDF5 file
            Object on which createGroup() is called; also handed to each
            BasicStatisticsSeries.
        base_group : str
            Path of the group under which to create the new group which will store the series.
        index_names : tuple of str
            Names of the int index columns, passed to every sub-series.
        title : str
            Here the title is attached to the new group, not a table.
        """
        new_group = hdf5_file.createGroup(base_group, new_group_name, title=title)

        # Each statistic is computed on the .value attribute of the
        # appended object, not on the object itself.
        stats_functions = {'mean': lambda x: numpy.mean(x.value),
                           'min': lambda x: numpy.min(x.value),
                           'max': lambda x: numpy.max(x.value),
                           'std': lambda x: numpy.std(x.value)}

        # One statistics table per array name, all under the new group.
        base_series_list = [
            BasicStatisticsSeries(
                table_name=name,
                hdf5_file=hdf5_file,
                index_names=index_names,
                stats_functions=stats_functions,
                hdf5_group=new_group._v_pathname)
            for name in arrays_names]

        SeriesArrayWrapper.__init__(self, base_series_list)
315 | |
316 |