ift6266: comparison of utils/scalar_series/series.py @ 186:d364a130b221
Added the base code for scalar_series. Changes to stacked_dae: fixed a problem with the input_divider (it was preventing an optimization) and added use of the series. Also, unless I had already committed that part, I removed the pretraining-reuse mechanism: it was complicated (error prone) and it produced jobs that were far too long.
author: fsavard
date: Mon, 01 Mar 2010 11:45:25 -0500
parents:
children:
comparing 185:b9ea8e2d071a with 186:d364a130b221
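The commit message says stacked_dae now records values through these series. As a rough sketch of how the classes in the listing below might be wired into a training script, assuming the file is importable as series; the experiment directory "exp1", the series names and the toy loop are hypothetical, not taken from the changeset:

    import os
    import numpy
    from series import AccumulatorSeries, SeriesMultiplexer, ParamsArrayStats

    if not os.path.isdir("exp1"):
        os.mkdir("exp1")

    mux = SeriesMultiplexer()
    # mean training error: one stored item per 100 appended minibatch errors
    mux.add_series(AccumulatorSeries(reduce_every=100, mean=True,
                                     name="train_error", directory="exp1"))
    # mean/min/max/std of each parameter array, one BasicStatsSeries per array
    params_stats = ParamsArrayStats(2, name="params", directory="exp1")

    for minibatch_error in [0.9, 0.85, 0.8]:   # stand-in for the real loop
        mux.append("train_error", minibatch_error)

    # stand-ins for a weight matrix and a bias vector
    params_stats.append([numpy.random.rand(10, 5), numpy.random.rand(5)])

Appending through the multiplexer keeps the training code oblivious to which series are actually being recorded; an unknown name only triggers a one-time warning.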
#!/usr/bin/python
# coding: utf-8

from __future__ import with_statement

import sys
import os
import os.path
import array

# for BasicStatsSeries
import numpy

# To access .value if necessary
import theano.tensor.sharedvar

'''
* TODO: add xy series
* TODO: add graph() for base and accumulator
* TODO: flush_every for BaseStatsSeries
* TODO: warn when Mux append() is called with a nonexisting name
* SeriesContainers are also series, albeit with more complex elements appended
* Each series has a "name" which corresponds in some way to the directory or file in which it's saved
'''

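# Layout illustration (hypothetical names, not part of the original file):
# a BaseSeries("train_error", "exp1") writes its raw array to the flat file
# exp1/train_error, while a BasicStatsSeries named "layer0_W" under the same
# parent creates the directory exp1/layer0_W/ containing one file per
# sub-series: mean, min, max and std.
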
# Simple class to append numbers and flush them to a file once in a while
class BaseSeries():
    # for types, see http://docs.python.org/library/array.html
    def __init__(self, name, directory, type='f', flush_every=1):
        self.type = type
        self.flush_every = flush_every

        if not name or not directory:
            raise Exception("name and directory must be provided (strings)")

        self.directory = directory
        self.name = name
        self.filepath = os.path.join(directory, name)

        self._array = array.array(type)
        # number of items appended since the last flush
        self._buffered = 0

    def append(self, newitem):
        self._array.append(newitem)

        self._buffered += 1
        if self._buffered >= self.flush_every:
            self.flush()

    def append_list(self, items):
        self._array.fromlist(items)
        self._buffered += len(items)
        if self._buffered >= self.flush_every:
            self.flush()

    def flush(self):
        if self._buffered == 0:
            return
        # rewrite the whole series; the file always holds every item appended so far
        with open(self.filepath, "wb") as f:
            f.write(self._array.tostring())
        self._buffered = 0

    def tolist(self):
        return self._array.tolist()

    def load_from_file(self):
        if not self.filepath:
            raise Exception("No name/directory provided")

        self._array = array.array(self.type)
        self._buffered = 0

        statinfo = os.stat(self.filepath)
        size = statinfo.st_size
        num_items = size // self._array.itemsize

        with open(self.filepath, "rb") as f:
            self._array.fromfile(f, num_items)

class AccumulatorSeries(BaseSeries):
    '''
    reduce_every: group (sum or mean) the last "reduce_every" appended items whenever we have enough,
        and append the result as a new item of the real, saved array
        (if fewer than "reduce_every" elements remain at the end, they'll be discarded on program close)
    flush_every: counted in items of the real, saved array, not in number of calls to "append"
    '''
    def __init__(self, reduce_every,
                 name, directory, flush_every=1,
                 mean=False):
        BaseSeries.__init__(self, name=name, directory=directory, type='f', flush_every=flush_every)
        self.reduce_every = reduce_every
        self._accumulator = 0.0
        self._num_accumulated = 0
        self.use_mean = mean

    @classmethod
    def series_constructor(cls, reduce_every, mean=False):
        def cstr(name, directory, flush_every=1):
            return cls(reduce_every=reduce_every, mean=mean, name=name, directory=directory, flush_every=flush_every)
        return cstr

    def append(self, item):
        self._accumulator += item
        self._num_accumulated += 1
        if self._num_accumulated >= self.reduce_every:
            n = self._accumulator
            if self.use_mean:
                n = n / self.reduce_every
            BaseSeries.append(self, n)

            self._num_accumulated = 0
            self._accumulator = 0.0

    def append_list(self, items):
        for i in items:
            self.append(i)

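# For illustration (hypothetical values, not part of the original file): with
# reduce_every=3 and mean=True, appending 1.0, 2.0, 3.0, 4.0 stores a single
# item, 2.0 (the mean of the first three); the trailing 4.0 waits in the
# accumulator and is dropped if the program ends before two more items arrive.
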
class SeriesContainer():
    def __init__(self, parent_directory, name,
                 series_constructor=BaseSeries):
        self.parent_directory = parent_directory
        self.name = name

        if not parent_directory or not name:
            raise Exception("parent_directory and name must be provided (strings)")

        self.directory_path = os.path.join(parent_directory, name)

        self.series_constructor = series_constructor

        # attempt to create directory for series
        if not os.path.isdir(self.directory_path):
            os.mkdir(self.directory_path)

    def graph(self):
        pass

class BasicStatsSeries(SeriesContainer):
    def __init__(self, parent_directory, name, series_constructor=BaseSeries,
                 mean=True, minmax=True, std=True):
        SeriesContainer.__init__(self, parent_directory=parent_directory, name=name, series_constructor=series_constructor)

        self.save_mean = mean
        self.save_minmax = minmax
        self.save_std = std

        self.create_series()

    @classmethod
    def series_constructor(cls, mean=True, minmax=True, std=True):
        def cstr(name, directory, flush_every=1):
            return cls(name=name, parent_directory=directory,
                       mean=mean, minmax=minmax, std=std)
        return cstr


    def create_series(self):
        if self.save_mean:
            self.means = self.series_constructor(name="mean", directory=self.directory_path)

        if self.save_minmax:
            self.mins = self.series_constructor(name="min", directory=self.directory_path)
            self.maxes = self.series_constructor(name="max", directory=self.directory_path)

        if self.save_std:
            self.stds = self.series_constructor(name="std", directory=self.directory_path)

    def append(self, array):
        # TODO: shouldn't this be the job of the caller? (at least ParamsArrayStats)
        if isinstance(array, theano.tensor.sharedvar.TensorSharedVariable):
            array = array.value

        if self.save_mean:
            n = numpy.mean(array)
            self.means.append(n)
        if self.save_minmax:
            n = numpy.min(array)
            self.mins.append(n)
            n = numpy.max(array)
            self.maxes.append(n)
        if self.save_std:
            n = numpy.std(array)
            self.stds.append(n)

    def load_from_file(self):
        self.load_from_directory()

    def load_from_directory(self):
        if self.save_mean:
            self.means.load_from_file()

        if self.save_minmax:
            self.mins.load_from_file()
            self.maxes.load_from_file()

        if self.save_std:
            self.stds.load_from_file()

    def graph(self, xes=None):
        import pylab

        if self.save_minmax:
            mn = numpy.array(self.mins.tolist())
            mx = numpy.array(self.maxes.tolist())
            if self.save_mean:
                y = numpy.array(self.means.tolist())
            else:
                y = (mn+mx) / 2

            above_y = mx - y
            below_y = y - mn

            if xes is None:
                xes = numpy.arange(len(y))

            pylab.errorbar(x=xes, y=y, yerr=[below_y, above_y])

        elif self.save_mean:
            y = numpy.array(self.means.tolist())
            if xes is None:
                xes = numpy.arange(len(y))

            pylab.plot(xes, y)


class SeriesMultiplexer():
    def __init__(self):
        self._series_dict = {}
        self._warned_for = {}

    def append(self, series_name, item):
        # if we don't have the series, just don't do anything
        if series_name in self._series_dict:
            s = self._series_dict[series_name]
            s.append(item)
        elif series_name not in self._warned_for:
            print "WARNING: SeriesMultiplexer called with unknown name ", series_name
            self._warned_for[series_name] = 1

    def append_list(self, series_name, items):
        if series_name in self._series_dict:
            s = self._series_dict[series_name]
            s.append_list(items)
        elif series_name not in self._warned_for:
            print "WARNING: SeriesMultiplexer called with unknown name ", series_name
            self._warned_for[series_name] = 1

    def add_series(self, series):
        if series.name in self._series_dict:
            raise Exception("A series with such a name already exists")
        self._series_dict[series.name] = series

class SeriesList():
    def __init__(self, num_elements, name, directory, series_constructor=BaseSeries):
        self._subseries = [None] * num_elements
        self.name = name

        for i in range(num_elements):
            newname = name + "." + str(i)
            self._subseries[i] = series_constructor(name=newname, directory=directory)

    def load_from_files(self):
        self.load_from_file()

    def load_from_file(self):
        for s in self._subseries:
            s.load_from_file()

    # no "append_list", this would get confusing
    def append(self, list_of_items):
        if len(list_of_items) != len(self._subseries):
            raise Exception("bad number of items, expected " + str(len(self._subseries)) + ", got " + str(len(list_of_items)))
        for i in range(len(list_of_items)):
            self._subseries[i].append(list_of_items[i])


# Just a shortcut
class ParamsArrayStats(SeriesList):
    def __init__(self, num_params_arrays, name, directory):
        cstr = BasicStatsSeries.series_constructor()

        SeriesList.__init__(self, num_elements=num_params_arrays,
                            name=name, directory=directory,
                            series_constructor=cstr)

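# For illustration (hypothetical variables, not part of the original file):
# to track two parameter arrays one could write
#     stats = ParamsArrayStats(2, name="params", directory="exp1")
#     stats.append([W.value, b.value])
# which keeps mean/min/max/std sub-series under exp1/params.0 and exp1/params.1.
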
# ------------------------
# Utilities to work with the series files from the command line

# "dumpf"
def dump_floats_file(filepath):
    print "Floats dump of ", filepath
    with open(filepath, "rb") as f:
        s = os.stat(filepath)
        size = s.st_size
        num = size / 4
        a = array.array('f')
        a.fromfile(f, num)
        print a.tolist()

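# Typical shell invocation (the path is just a hypothetical example):
#     python series.py dumpf exp1/params.0/mean
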
if __name__ == '__main__':
    args = sys.argv[1:]

    if len(args) == 2 and args[0] == "dumpf":
        filepath = args[1]
        dump_floats_file(filepath)
    else:
        print "Bad arguments"

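To inspect a finished run, the same classes can reload the flat files and plot them. A minimal sketch, assuming the hypothetical exp1/params.0 directory from the earlier examples exists and pylab (matplotlib) is installed:

    import pylab
    from series import BasicStatsSeries

    stats = BasicStatsSeries(parent_directory="exp1", name="params.0")
    stats.load_from_directory()   # reads the mean/min/max/std files back
    stats.graph()                 # mean curve with min/max drawn as error bars
    pylab.show()

The graph() call only draws onto the current pylab figure; showing or saving it is left to the caller.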