changeset 208:acb942530923

Completely rewrote my series module; it is now based on HDF5 and PyTables (in a separate directory called 'tables_series', for backward compatibility with running code). Minor (inconsequential) changes to stacked_dae.
author fsavard
date Fri, 05 Mar 2010 18:07:20 -0500
parents 10a801240bfc
children d982dfa583df
files deep/stacked_dae/nist_sda.py deep/stacked_dae/sgd_optimization.py deep/stacked_dae/stacked_dae.py utils/tables_series/__init__.py utils/tables_series/series.py utils/tables_series/test_series.py
diffstat 6 files changed, 449 insertions(+), 1 deletions(-)
--- a/deep/stacked_dae/nist_sda.py	Thu Mar 04 08:21:43 2010 -0500
+++ b/deep/stacked_dae/nist_sda.py	Fri Mar 05 18:07:20 2010 -0500
@@ -64,7 +64,7 @@
                        'pretraining_lr':0.1,
                        'pretraining_epochs_per_layer':20,
                        'max_finetuning_epochs':2,
-                       'hidden_layers_sizes':300,
+                       'hidden_layers_sizes':800,
                        'corruption_levels':0.2,
                        'minibatch_size':20,
                        #'reduce_train_to':300,
--- a/deep/stacked_dae/sgd_optimization.py	Thu Mar 04 08:21:43 2010 -0500
+++ b/deep/stacked_dae/sgd_optimization.py	Fri Mar 05 18:07:20 2010 -0500
@@ -86,6 +86,8 @@
                           finetune_lr = self.hp.finetuning_lr,\
                           input_divider = self.input_divider )
 
+        #theano.printing.pydotprint(self.classifier.pretrain_functions[0], "function.graph")
+
         sys.stdout.flush()
 
     def train(self):
@@ -96,6 +98,9 @@
         print "STARTING PRETRAINING, time = ", datetime.datetime.now()
         sys.stdout.flush()
 
+        #time_acc_func = 0.0
+        #time_acc_total = 0.0
+
         start_time = time.clock()  
         ## Pre-train layer-wise 
         for i in xrange(self.classifier.n_layers):
@@ -103,7 +108,14 @@
             for epoch in xrange(self.hp.pretraining_epochs_per_layer):
                 # go through the training set
                 for batch_index in xrange(self.n_train_batches):
+                    #t1 = time.clock()
                     c = self.classifier.pretrain_functions[i](batch_index)
+                    #t2 = time.clock()
+
+                    #time_acc_func += t2 - t1
+
+                    #if batch_index % 500 == 0:
+                    #    print "acc / total", time_acc_func / (t2 - start_time), time_acc_func
 
                     self.series_mux.append("reconstruction_error", c)
                         
--- a/deep/stacked_dae/stacked_dae.py	Thu Mar 04 08:21:43 2010 -0500
+++ b/deep/stacked_dae/stacked_dae.py	Fri Mar 05 18:07:20 2010 -0500
@@ -140,6 +140,11 @@
     #self.L = - T.sum( self.x*T.log(self.z) + (1-self.x)*T.log(1-self.z), axis=1 ) 
     #self.L = binary_cross_entropy(target=self.x, output=self.z, sum_axis=1)
 
+    # bypassing z to avoid running into log(0): log(sigmoid(a)) = -log(1+exp(-a)), log(1-sigmoid(a)) = -log(1+exp(a))
+    #self.z_a = T.dot(self.y, self.W_prime) + self.b_prime
+    #self.L = -T.sum( self.x * (-T.log(1+T.exp(-self.z_a))) \
+    #                + (1.0-self.x) * (-T.log(1+T.exp(self.z_a))), axis=1 )
+
     # I added this epsilon to avoid getting log(0) and 1/0 in grad
     # This means conceptually that there'd be no probability of 0, but that
     # doesn't seem to me as important (maybe I'm wrong?).
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/utils/tables_series/__init__.py	Fri Mar 05 18:07:20 2010 -0500
@@ -0,0 +1,2 @@
+from series import ErrorSeries, BasicStatisticsSeries, AccumulatorSeriesWrapper, SeriesArrayWrapper, ParamsStatisticsWrapper
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/utils/tables_series/series.py	Fri Mar 05 18:07:20 2010 -0500
@@ -0,0 +1,259 @@
+from tables import *
+import numpy
+
+'''
+The way these "IsDescription constructors" work is simple: write the class
+definition as if it were in a source file, then exec()ute it, leaving us with
+a locally-scoped LocalDescription class which may be passed to createTable.
+
+It's a small hack, but it's necessary because PyTables retrieves the column
+names from the class attribute names, which we can't set programmatically
+otherwise.
+'''
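+# For illustration, a call like
+# get_description_with_n_ints_n_floats(('epoch',), ('error',))
+# builds and exec()utes the equivalent of:
+#
+#     class LocalDescription(IsDescription):
+#         epoch = Int64Col(pos=0)
+#         error = Float32Col(pos=1)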
+
+def get_beginning_description_n_ints(int_names, int_width=64):
+    int_constructor = "Int64Col"
+    if int_width == 32:
+        int_constructor = "Int32Col"
+
+    toexec = "class LocalDescription(IsDescription):\n"
+
+    pos = 0
+    for n in int_names:
+        toexec += "\t" + n + " = " + int_constructor + "(pos=" + str(pos) + ")\n"
+        pos += 1
+
+    return toexec
+
+def get_description_with_n_ints_n_floats(int_names, float_names, int_width=64, float_width=32):
+    """
+    Constructs a class to be used when constructing a table with PyTables.
+
+    This is useful to construct a series with an index with multiple levels.
+    E.g. if you want to index your "validation error" with "epoch" first, then
+    "minibatch_index" second, you'd use two "int_names".
+
+    Parameters
+    ----------
+    int_names : tuple of str
+        Names of the int (e.g. index) columns
+    float_names : tuple of str
+        Names of the float (e.g. error) columns
+    int_width : {32, 64}
+        Bit width of the int columns.
+    float_width : {32, 64}
+        Bit width of the float columns.
+
+    Returns
+    -------
+    A class object, to pass to createTable()
+    """
+
+    toexec = get_beginning_description_n_ints(int_names, int_width=int_width)
+
+    float_constructor = "Float32Col"
+    if float_width == 64:
+        float_constructor = "Float64Col"
+    
+    pos = len(int_names)
+    for n in float_names:
+        toexec += "\t" + n + " = " + float_constructor + "(pos=" + str(pos) + ")\n"
+        pos += 1
+
+    exec(toexec)
+
+    return LocalDescription
+
+class Series():
+    def __init__(self, table_name, hdf5_file, index_names=('epoch',), title=None, hdf5_group='/'):
+        """This is used as metadata in the HDF5 file to identify the series"""
+        self.table_name = table_name
+        self.hdf5_file = hdf5_file
+        self.index_names = index_names
+        self.title = title
+
+    def append(self, index, element):
+        raise NotImplementedError
+
+class ErrorSeries(Series):
+    def __init__(self, error_name, table_name, hdf5_file, index_names=('epoch',), title=None, hdf5_group='/'):
+        Series.__init__(self, table_name, hdf5_file, index_names, title)
+
+        self.error_name = error_name
+
+        table_description = self._get_table_description()
+
+        self._table = hdf5_file.createTable(hdf5_group, self.table_name, table_description, title=title)
+
+    def _get_table_description(self):
+        return get_description_with_n_ints_n_floats(self.index_names, (self.error_name,))
+
+    def append(self, index, error):
+        if len(index) != len(self.index_names):
+            raise ValueError("index provided does not have the right length (expected " \
+                            + str(len(self.index_names)) + " got " + str(len(index)))
+
+        newrow = self._table.row
+
+        for col_name, value in zip(self.index_names, index):
+            newrow[col_name] = value
+        newrow[self.error_name] = error
+
+        newrow.append()
+
+        self.hdf5_file.flush()
+
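+# Example usage (adapted from test_series.py; h5f is an open HDF5 file):
+#   validation_error = ErrorSeries(error_name="validation_error",
+#                               table_name="validation_error", hdf5_file=h5f,
+#                               index_names=('epoch','minibatch'))
+#   validation_error.append((1,1), 32.0)
+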
+# Does not inherit from Series because it does not itself need to
+# access the hdf5_file and does not need a series_name (provided
+# by the base_series.)
+class AccumulatorSeriesWrapper():
+    """
+    
+    """
+    def __init__(self, base_series, reduce_every, reduce_function=numpy.mean):
+        self.base_series = base_series
+        self.reduce_function = reduce_function
+        self.reduce_every = reduce_every
+
+        self._buffer = []
+
+    
+    def append(self, index, element):
+        """
+        Parameters
+        ----------
+        index : tuple of int
+            The index used is that of the last element reduced. E.g. if
+            you accumulate over the first 1000 minibatches, the index
+            passed to the base_series.append() function will be 1000.
+        """
+        self._buffer.append(element)
+
+        if len(self._buffer) == self.reduce_every:
+            reduced = self.reduce_function(self._buffer)
+            self.base_series.append(index, reduced)
+            self._buffer = []
+
+        # This should never fail, unless lists (rather than single
+        # elements) were appended, which would be a red flag.
+        assert len(self._buffer) < self.reduce_every
+
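+# Example usage (adapted from test_series.py): sum every 3 appended values
+# and write a single row to the underlying ErrorSeries:
+#   accumulator = AccumulatorSeriesWrapper(base_series=validation_error,
+#                                   reduce_every=3, reduce_function=numpy.sum)
+#   accumulator.append((1,1), 32.0)   # buffered
+#   accumulator.append((1,2), 30.0)   # buffered
+#   accumulator.append((2,1), 28.0)   # 90.0 appended to validation_error at (2,1)
+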
+# Outside of class to fix an issue with exec in Python 2.6.
+# My apologies to the God of pretty code.
+def BasicStatisticsSeries_construct_table_toexec(index_names):
+    toexec = get_beginning_description_n_ints(index_names)
+
+    bpos = len(index_names)
+    toexec += "\tmean = Float32Col(pos=" + str(bpos) + ")\n"
+    toexec += "\tmin = Float32Col(pos=" + str(bpos+1) + ")\n"
+    toexec += "\tmax = Float32Col(pos=" + str(bpos+2) + ")\n"
+    toexec += "\tstd = Float32Col(pos=" + str(bpos+3) + ")\n"
+    
+    # This creates "LocalDescription", which we may then use
+    exec(toexec)
+
+    return LocalDescription
+
+class BasicStatisticsSeries(Series):
+    """
+    Parameters
+    ----------
+    series_name : str
+        Not optional here. Will be prepended with "Basic statistics for "
+    """
+    def __init__(self, table_name, hdf5_file, index_names=('epoch',), title=None, hdf5_group='/'):
+        Series.__init__(self, table_name, hdf5_file, index_names, title)
+
+        self.hdf5_group = hdf5_group
+
+        self.construct_table()
+
+    def construct_table(self):
+        table_description = BasicStatisticsSeries_construct_table_toexec(self.index_names)
+
+        self._table = self.hdf5_file.createTable(self.hdf5_group, self.table_name, table_description)
+
+    def append(self, index, array):
+        if len(index) != len(self.index_names):
+            raise ValueError("index provided does not have the right length (expected " \
+                            + str(len(self.index_names)) + " got " + str(len(index)))
+
+        newrow = self._table.row
+
+        for col_name, value in zip(self.index_names, index):
+            newrow[col_name] = value
+
+        newrow["mean"] = numpy.mean(array)
+        newrow["min"] = numpy.min(array)
+        newrow["max"] = numpy.max(array)
+        newrow["std"] = numpy.std(array)
+
+        newrow.append()
+
+        self.hdf5_file.flush()
+
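+# Example usage (adapted from test_series.py):
+#   stats_series = BasicStatisticsSeries(table_name="b_vector_statistics",
+#                               hdf5_file=h5f, index_names=('epoch','minibatch'))
+#   stats_series.append((1,1), [0.15, 0.20, 0.30])
+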
+class SeriesArrayWrapper():
+    """
+    Simply redistributes a tuple of elements to the corresponding sub-series' append()s.
+    """
+
+    def __init__(self, base_series_list):
+        self.base_series_list = base_series_list
+
+    def append(self, index, elements):
+        if len(elements) != len(self.base_series_list):
+            raise ValueError("not enough or too much elements provided (expected " \
+                            + str(len(self.base_series_list)) + " got " + str(len(elements)))
+
+        for series, el in zip(self.base_series_list, elements):
+            series.append(index, el)
+
+class ParamsStatisticsWrapper(SeriesArrayWrapper):
+    def __init__(self, arrays_names, new_group_name, hdf5_file, base_group='/', index_names=('epoch',), title=""):
+        base_series_list = []
+
+        new_group = hdf5_file.createGroup(base_group, new_group_name, title=title)
+
+        for name in arrays_names:
+            base_series_list.append(
+                        BasicStatisticsSeries(
+                                table_name=name,
+                                hdf5_file=hdf5_file,
+                                index_names=index_names,
+                                hdf5_group=new_group._v_pathname))
+
+        SeriesArrayWrapper.__init__(self, base_series_list)
+
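+# Example usage (adapted from test_series.py): one BasicStatisticsSeries
+# per parameter array, grouped under /params:
+#   stats = ParamsStatisticsWrapper(new_group_name="params", base_group="/",
+#                               arrays_names=('b1','b2','b3'), hdf5_file=h5f,
+#                               index_names=('epoch','minibatch'))
+#   stats.append((1,1), [b1,b2,b3])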
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/utils/tables_series/test_series.py	Fri Mar 05 18:07:20 2010 -0500
@@ -0,0 +1,170 @@
+import tempfile
+import numpy
+import numpy.random
+from tables import *
+
+from series import *
+
+
+def compare_floats(f1, f2):
+    if abs(f1 - f2) < 1e-3:
+        return True
+    return False
+
+def compare_lists(it1, it2, floats=False):
+    if len(it1) != len(it2):
+        return False
+
+    for el1,  el2 in zip(it1, it2):
+        if floats:
+            if not compare_floats(el1,el2):
+                return False
+        elif el1 != el2:
+            return False
+
+    return True
+
+def test_ErrorSeries_common_case(h5f=None):
+    if not h5f:
+        h5f_path = tempfile.NamedTemporaryFile().name
+        h5f = openFile(h5f_path, "w")
+
+    validation_error = ErrorSeries(error_name="validation_error", table_name="validation_error",
+                                hdf5_file=h5f, index_names=('epoch','minibatch'),
+                                title="Validation error indexed by epoch and minibatch")
+
+    # (1,1), (1,2) etc. are (epoch, minibatch) index
+    validation_error.append((1,1), 32.0)
+    validation_error.append((1,2), 30.0)
+    validation_error.append((2,1), 28.0)
+    validation_error.append((2,2), 26.0)
+
+    h5f_path = h5f.filename
+    h5f.close()
+    h5f = openFile(h5f_path, "r")
+    
+    table = h5f.getNode('/', 'validation_error')
+
+    assert compare_lists(table.cols.epoch[:], [1,1,2,2])
+    assert compare_lists(table.cols.minibatch[:], [1,2,1,2])
+    assert compare_lists(table.cols.validation_error[:], [32.0, 30.0, 28.0, 26.0])
+
+def test_AccumulatorSeriesWrapper_common_case(h5f=None):
+    if not h5f:
+        h5f_path = tempfile.NamedTemporaryFile().name
+        h5f = openFile(h5f_path, "w")
+
+    validation_error = ErrorSeries(error_name="accumulated_validation_error",
+                                table_name="accumulated_validation_error",
+                                hdf5_file=h5f,
+                                index_names=('epoch','minibatch'),
+                                title="Validation error, summed every 3 minibatches, indexed by epoch and minibatch")
+
+    accumulator = AccumulatorSeriesWrapper(base_series=validation_error,
+                                    reduce_every=3, reduce_function=numpy.sum)
+
+    # (1,1), (1,2) etc. are (epoch, minibatch) index
+    accumulator.append((1,1), 32.0)
+    accumulator.append((1,2), 30.0)
+    accumulator.append((2,1), 28.0)
+    accumulator.append((2,2), 26.0)
+    accumulator.append((3,1), 24.0)
+    accumulator.append((3,2), 22.0)
+
+    h5f_path = h5f.filename
+    h5f.close()
+    h5f = openFile(h5f_path, "r")
+    
+    table = h5f.getNode('/', 'accumulated_validation_error')
+
+    assert compare_lists(table.cols.epoch[:], [2,3])
+    assert compare_lists(table.cols.minibatch[:], [1,2])
+    assert compare_lists(table.cols.accumulated_validation_error[:], [90.0,72.0], floats=True)
+
+def test_BasicStatisticsSeries_common_case(h5f=None):
+    if not h5f:
+        h5f_path = tempfile.NamedTemporaryFile().name
+        h5f = openFile(h5f_path, "w")
+
+    stats_series = BasicStatisticsSeries(table_name="b_vector_statistics",
+                                hdf5_file=h5f, index_names=('epoch','minibatch'),
+                                title="Basic statistics for b vector indexed by epoch and minibatch")
+
+    # (1,1), (1,2) etc. are (epoch, minibatch) index
+    stats_series.append((1,1), [0.15, 0.20, 0.30])
+    stats_series.append((1,2), [-0.18, 0.30, 0.58])
+    stats_series.append((2,1), [0.18, -0.38, -0.68])
+    stats_series.append((2,2), [0.15, 0.02, 1.9])
+
+    h5f_path = h5f.filename
+    h5f.close()
+    h5f = openFile(h5f_path, "r")
+    
+    table = h5f.getNode('/', 'b_vector_statistics')
+
+    assert compare_lists(table.cols.epoch[:], [1,1,2,2])
+    assert compare_lists(table.cols.minibatch[:], [1,2,1,2])
+    assert compare_lists(table.cols.mean[:], [0.21666667,  0.23333333, -0.29333332,  0.69], floats=True)
+    assert compare_lists(table.cols.min[:], [0.15000001, -0.18000001, -0.68000001,  0.02], floats=True)
+    assert compare_lists(table.cols.max[:], [0.30, 0.58, 0.18, 1.9], floats=True)
+    assert compare_lists(table.cols.std[:], [0.06236095, 0.31382939,  0.35640177, 0.85724366], floats=True)
+
+def test_ParamsStatisticsWrapper_commoncase(h5f=None):
+    import numpy.random
+
+    if not h5f:
+        h5f_path = tempfile.NamedTemporaryFile().name
+        h5f = openFile(h5f_path, "w")
+
+    stats = ParamsStatisticsWrapper(new_group_name="params", base_group="/",
+                                arrays_names=('b1','b2','b3'), hdf5_file=h5f,
+                                index_names=('epoch','minibatch'))
+
+    b1 = numpy.random.rand(5)
+    b2 = numpy.random.rand(5)
+    b3 = numpy.random.rand(5)
+    stats.append((1,1), [b1,b2,b3])
+
+    h5f_path = h5f.filename
+    h5f.close()
+    h5f = openFile(h5f_path, "r")
+
+    b1_table = h5f.getNode('/params', 'b1')
+    b3_table = h5f.getNode('/params', 'b3')
+
+    assert abs(b1_table.cols.mean[0] - numpy.mean(b1)) < 1e-3
+    assert abs(b3_table.cols.mean[0] - numpy.mean(b3)) < 1e-3
+    assert abs(b1_table.cols.min[0] - numpy.min(b1)) < 1e-3
+    assert abs(b3_table.cols.min[0] - numpy.min(b3)) < 1e-3
+
+def test_get_desc():
+    h5f_path = tempfile.NamedTemporaryFile().name
+    h5f = openFile(h5f_path, "w")
+
+    desc = get_description_with_n_ints_n_floats(("col1","col2"), ("col3","col4"))
+
+    mytable = h5f.createTable('/', 'mytable', desc)
+
+    # just make sure the columns are there... otherwise this will throw an exception
+    mytable.cols.col1
+    mytable.cols.col2
+    mytable.cols.col3
+    mytable.cols.col4
+
+    try:
+        # this should fail: LocalDescription must remain local to
+        # get_description_with_n_ints_n_floats
+        test = LocalDescription
+        assert False
+    except NameError:
+        # not defined at this scope, as expected
+        pass
+
+if __name__ == '__main__':
+    import tempfile
+    test_get_desc()
+    test_ErrorSeries_common_case()
+    test_BasicStatisticsSeries_common_case()
+    test_AccumulatorSeriesWrapper_common_case()
+    test_ParamsStatisticsWrapper_commoncase()
+