comparison utils/scalar_series/series.py @ 186:d364a130b221

Added the base code for scalar_series. Changes to stacked_dae: fixed a problem with the input_divider (it was preventing an optimization), and added use of the series. Also, in case I hadn't already committed it: removed the pretraining-reuse mechanism; it was complicated (error prone) and created jobs that were far too long.
author fsavard
date Mon, 01 Mar 2010 11:45:25 -0500
parents 185:b9ea8e2d071a
#!/usr/bin/python
# coding: utf-8

from __future__ import with_statement

import sys
import os
import os.path
import array

# for BasicStatsSeries
import numpy

# To access .value if necessary
import theano.tensor.sharedvar
'''
* TODO: add xy series
* TODO: add graph() for base and accumulator
* TODO: flush_every for BasicStatsSeries
* TODO: warn when Mux append() is called with a nonexisting name
* SeriesContainers are also series, albeit with more complex elements appended
* Each series has a "name" which corresponds in some way to the directory or file in which it's saved
'''

# Simple class to append numbers and flush them to a file once in a while
class BaseSeries():
    # for types, see http://docs.python.org/library/array.html
    def __init__(self, name, directory, type='f', flush_every=1):
        self.type = type
        self.flush_every = flush_every

        if not name or not directory:
            raise Exception("name and directory must be provided (strings)")

        self.directory = directory
        self.name = name

        if name and directory:
            self.filepath = os.path.join(directory, name)

        self._array = array.array(type)
        # stores the length not stored in file, waiting to be flushed
        self._buffered = 0

    def append(self, newitem):
        self._array.append(newitem)

        self._buffered += 1
        if self._buffered >= self.flush_every:
            self.flush()

    def append_list(self, items):
        self._array.fromlist(items)
        self._buffered += len(items)
        if self._buffered >= self.flush_every:
            self.flush()

    def flush(self):
        if self._buffered == 0:
            return
        # append only the not-yet-written tail of the array, then reset the count
        # (open in "ab" rather than "wb" so earlier flushes are not overwritten)
        with open(self.filepath, "ab") as f:
            s = self._array[-self._buffered:].tostring()
            f.write(s)
        self._buffered = 0

    def tolist(self):
        return self._array.tolist()

    def load_from_file(self):
        if not self.filepath:
            raise Exception("No name/directory provided")

        self._array = array.array(self.type)
        self._buffered = 0

        statinfo = os.stat(self.filepath)
        size = statinfo.st_size
        num_items = size / self._array.itemsize

        with open(self.filepath, "rb") as f:
            self._array.fromfile(f, num_items)

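# Example (editor's sketch, not part of the original changeset): intended use of
# BaseSeries is to append scalars during a loop and reload them later for analysis.
# The directory "/tmp/series_demo" is only illustrative and must already exist.
#
#   s = BaseSeries(name="train_error", directory="/tmp/series_demo", type='f', flush_every=10)
#   for cost in [0.9, 0.7, 0.5]:
#       s.append(cost)
#   s.flush()
#
#   s2 = BaseSeries(name="train_error", directory="/tmp/series_demo", type='f')
#   s2.load_from_file()
#   print s2.tolist()
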
class AccumulatorSeries(BaseSeries):
    '''
    reduce_every: group (sum or mean) the last "reduce_every" items whenever we have enough,
        and append the result as a new item of the real, saved array
        (if fewer than "reduce_every" elements remain at the end, they're discarded on program close)
    flush_every: counted in items of the real, saved array, not in number of calls to "append"
    '''
    def __init__(self, reduce_every,
                 name, directory, flush_every=1,
                 mean=False):
        BaseSeries.__init__(self, name=name, directory=directory, type='f', flush_every=flush_every)
        self.reduce_every = reduce_every
        self._accumulator = 0.0
        self._num_accumulated = 0
        self.use_mean = mean

    @classmethod
    def series_constructor(cls, reduce_every, mean=False):
        def cstr(name, directory, flush_every=1):
            return cls(reduce_every=reduce_every, mean=mean, name=name, directory=directory, flush_every=flush_every)
        return cstr

    def append(self, item):
        self._accumulator += item
        self._num_accumulated += 1
        if self._num_accumulated >= self.reduce_every:
            n = self._accumulator
            if self.use_mean:
                n = n / self.reduce_every
            BaseSeries.append(self, n)

            self._num_accumulated = 0
            self._accumulator = 0.0

    def append_list(self, items):
        for i in items:
            self.append(i)

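# Example (editor's sketch): with reduce_every=2 and mean=True, every two calls to
# append() produce a single averaged item in the underlying BaseSeries, so the two
# appends below store one value, 0.5. The directory name is illustrative only.
#
#   acc = AccumulatorSeries(reduce_every=2, mean=True,
#                           name="minibatch_cost", directory="/tmp/series_demo")
#   acc.append(0.4)
#   acc.append(0.6)   # now one item (0.5) is in the saved array
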
class SeriesContainer():
    def __init__(self, parent_directory, name,
                 series_constructor=BaseSeries):
        self.parent_directory = parent_directory
        self.name = name

        if not parent_directory or not name:
            raise Exception("parent_directory and name must be provided (strings)")

        self.directory_path = os.path.join(parent_directory, name)

        self.series_constructor = series_constructor

        # attempt to create directory for series
        if not os.path.isdir(self.directory_path):
            os.mkdir(self.directory_path)

    def graph(self):
        pass

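# Example (editor's sketch): subclasses create their inner series through
# self.series_constructor(name=..., directory=...), so the series_constructor
# classmethods in this file can be used to parametrize what gets stored, e.g.:
#
#   cstr = AccumulatorSeries.series_constructor(reduce_every=10, mean=True)
#   stats = BasicStatsSeries(parent_directory="/tmp/series_demo", name="h1_W",
#                            series_constructor=cstr)
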
class BasicStatsSeries(SeriesContainer):
    def __init__(self, parent_directory, name, series_constructor=BaseSeries,
                 mean=True, minmax=True, std=True):
        SeriesContainer.__init__(self, parent_directory=parent_directory, name=name, series_constructor=series_constructor)

        self.save_mean = mean
        self.save_minmax = minmax
        self.save_std = std

        self.create_series()

    @classmethod
    def series_constructor(cls, mean=True, minmax=True, std=True):
        def cstr(name, directory, flush_every=1):
            return cls(name=name, parent_directory=directory,
                       mean=mean, minmax=minmax, std=std)
        return cstr

    def create_series(self):
        if self.save_mean:
            self.means = self.series_constructor(name="mean", directory=self.directory_path)

        if self.save_minmax:
            self.mins = self.series_constructor(name="min", directory=self.directory_path)
            self.maxes = self.series_constructor(name="max", directory=self.directory_path)

        if self.save_std:
            self.stds = self.series_constructor(name="std", directory=self.directory_path)

    def append(self, array):
        # TODO: shouldn't this be the job of the caller? (at least ParamsArraySeries)
        if isinstance(array, theano.tensor.sharedvar.TensorSharedVariable):
            array = array.value

        if self.save_mean:
            n = numpy.mean(array)
            self.means.append(n)
        if self.save_minmax:
            n = numpy.min(array)
            self.mins.append(n)
            n = numpy.max(array)
            self.maxes.append(n)
        if self.save_std:
            n = numpy.std(array)
            self.stds.append(n)

    def load_from_file(self):
        self.load_from_directory()

    def load_from_directory(self):
        if self.save_mean:
            self.means.load_from_file()

        if self.save_minmax:
            self.mins.load_from_file()
            self.maxes.load_from_file()

        if self.save_std:
            self.stds.load_from_file()

    def graph(self, xes=None):
        import pylab

        if self.save_minmax:
            mn = numpy.array(self.mins.tolist())
            mx = numpy.array(self.maxes.tolist())
            if self.save_mean:
                y = numpy.array(self.means.tolist())
            else:
                y = (mn + mx) / 2

            above_y = mx - y
            below_y = y - mn

            # "xes is None" rather than "not xes", so a passed-in numpy array is accepted
            if xes is None:
                xes = numpy.arange(len(y))

            pylab.errorbar(x=xes, y=y, yerr=[below_y, above_y])

        elif self.save_mean:
            y = numpy.array(self.means.tolist())
            if xes is None:
                xes = numpy.arange(len(y))

            # pylab.plot takes x and y positionally, not as keyword arguments
            pylab.plot(xes, y)

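# Example (editor's sketch): each call to append() with an array stores one scalar
# per enabled statistic, in series named "mean", "min", "max" and "std" under
# parent_directory/name. The parent directory below is illustrative and must exist.
#
#   bss = BasicStatsSeries(parent_directory="/tmp/series_demo", name="weights")
#   bss.append(numpy.random.randn(100))
#   bss.append(numpy.random.randn(100))
#   bss.graph()   # errorbar plot of the mean with min/max as error bars
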
class SeriesMultiplexer():
    def __init__(self):
        self._series_dict = {}
        self._warned_for = {}

    def append(self, series_name, item):
        # if we don't have the series, just don't do anything
        if self._series_dict.has_key(series_name):
            s = self._series_dict[series_name]
            s.append(item)
        elif not self._warned_for.has_key(series_name):
            print "WARNING: SeriesMultiplexer called with unknown name ", series_name
            self._warned_for[series_name] = 1

    def append_list(self, series_name, items):
        if self._series_dict.has_key(series_name):
            s = self._series_dict[series_name]
            s.append_list(items)
        elif not self._warned_for.has_key(series_name):
            print "WARNING: SeriesMultiplexer called with unknown name ", series_name
            self._warned_for[series_name] = 1

    def add_series(self, series):
        if self._series_dict.has_key(series.name):
            raise Exception("A series with such a name already exists")
        self._series_dict[series.name] = series

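# Example (editor's sketch): the multiplexer lets calling code always log under a
# name, while the experiment decides which names actually get recorded; unknown
# names only trigger a one-time warning. Names and directory are illustrative.
#
#   mux = SeriesMultiplexer()
#   mux.add_series(BaseSeries(name="valid_error", directory="/tmp/series_demo"))
#   mux.append("valid_error", 0.12)   # recorded
#   mux.append("test_error", 0.15)    # only warns, nothing is stored
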
class SeriesList():
    def __init__(self, num_elements, name, directory, series_constructor=BaseSeries):
        self._subseries = [None] * num_elements
        self.name = name

        for i in range(num_elements):
            newname = name + "." + str(i)
            self._subseries[i] = series_constructor(name=newname, directory=directory)

    def load_from_files(self):
        self.load_from_file()

    def load_from_file(self):
        for s in self._subseries:
            s.load_from_file()

    # no "append_list", this would get confusing
    def append(self, list_of_items):
        if len(list_of_items) != len(self._subseries):
            raise Exception("bad number of items, expected " + str(len(self._subseries)) + ", got " + str(len(list_of_items)))
        for i in range(len(list_of_items)):
            self._subseries[i].append(list_of_items[i])

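# Example (editor's sketch): a SeriesList fans one call out to num_elements
# subseries named "<name>.0", "<name>.1", ... in the given directory.
#
#   sl = SeriesList(num_elements=3, name="layer_cost", directory="/tmp/series_demo")
#   sl.append([0.3, 0.2, 0.1])   # one scalar per subseries
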
# Just a shortcut
class ParamsArrayStats(SeriesList):
    def __init__(self, num_params_arrays, name, directory):
        cstr = BasicStatsSeries.series_constructor()

        SeriesList.__init__(self, num_elements=num_params_arrays,
                            name=name, directory=directory,
                            series_constructor=cstr)

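# Example (editor's sketch): one BasicStatsSeries per parameter array, e.g. to track
# a model's weights and biases once per epoch. The arrays below are placeholders.
#
#   params_stats = ParamsArrayStats(num_params_arrays=2, name="params",
#                                   directory="/tmp/series_demo")
#   W = numpy.random.randn(10, 5)
#   b = numpy.zeros(5)
#   params_stats.append([W, b])
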
# ------------------------
# Utilities to work with the series files from the command line

# "dumpf"
def dump_floats_file(filepath):
    print "Floats dump of ", filepath
    with open(filepath, "rb") as f:
        s = os.stat(filepath)
        size = s.st_size
        num = size / 4
        a = array.array('f')
        a.fromfile(f, num)
        print a.tolist()

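# Example (editor's sketch): a saved series of 'f' floats can be inspected from the
# shell by running this file directly; the path below is illustrative.
#
#   python series.py dumpf /tmp/series_demo/train_error
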
if __name__ == '__main__':
    args = sys.argv[1:]

    if len(args) == 2 and args[0] == "dumpf":
        file = args[1]
        dump_floats_file(file)
    else:
        print "Bad arguments"