changeset 608:8f40262297cf

merged
author James Bergstra <bergstrj@iro.umontreal.ca>
date Thu, 15 Jan 2009 22:23:43 -0500
parents 23467f473eb7 (diff) 52a99d83f06d (current diff)
children e8cb4bde30a7
files pylearn/dbdict/newstuff.py
diffstat 3 files changed, 141 insertions(+), 10 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/datasets/tzanetakis.py	Thu Jan 15 22:23:43 2009 -0500
@@ -0,0 +1,102 @@
+"""
+Load Tzanetakis' genre-classification dataset.
+
+"""
+from __future__ import absolute_import
+
+import os
+import numpy
+
+from ..io.amat import AMat
+from .config import data_root
+from .dataset import dataset_factory, Dataset
+
+def head(n=10, path=None):
+    """Load the first `n` MNIST examples (`n` is forwarded to AMat as its `head` argument).
+
+    Returns two matrices: x, y.  x has N rows of 784 columns.  Each row of x represents the
+    28x28 grey-scale pixels in raster order.  y is a vector of N integers.  Each element y[i]
+    is the label of the i'th row of x.
+
+    Raises Exception when `n` is not None and the number of rows read differs from `n`.
+    """
+    if path is None:
+        path = os.path.join(data_root(), 'mnist','mnist_with_header.amat')
+    dat = AMat(path=path, head=n)
+
+    # n=None is what all() passes (presumably "no limit"), so there is no count to verify.
+    n_rows = (dat.input.shape[0], dat.target.shape[0])
+    if n is not None and n_rows != (n, n):
+        raise Exception("failed to read MNIST data", (dat, n_rows))
+
+    return dat.input, numpy.asarray(dat.target, dtype='int64').reshape(dat.target.shape[0])
+
+def all(path=None):
+    return head(path=path, n=None)  # n=None: no row count is requested from head()
+
+def train_valid_test(ntrain=50000, nvalid=10000, ntest=10000, path=None):
+    """Return a Dataset whose train/valid/test parts are consecutive slices of head()."""
+    valid_end = ntrain + nvalid
+    test_end = valid_end + ntest
+    examples, labels = head(test_end, path=path)
+
+    # the three parts are contiguous and non-overlapping, in file order
+    splits = Dataset()
+    splits.train = Dataset.Obj(x=examples[:ntrain], y=labels[:ntrain])
+    splits.valid = Dataset.Obj(x=examples[ntrain:valid_end], y=labels[ntrain:valid_end])
+    splits.test = Dataset.Obj(x=examples[valid_end:test_end], y=labels[valid_end:test_end])
+
+    splits.n_classes = 10
+    splits.img_shape = (28,28)
+    return splits
+
+
+def mfcc16(segments_per_song = 1, include_covariance = True, random_split = 0,
+        ntrain = 700, nvalid = 100, ntest = 200):
+    """Load the 'feat_mfcc16_540_1.stat.amat' features as a train/valid/test Dataset.
+
+    Only segments_per_song == 1 is supported (otherwise NotImplementedError).  With
+    include_covariance False only the first 16 of the 152 feature columns are kept.
+    random_split selects the shuffling seed (seed = random_split + 1).
+    """
+    if segments_per_song != 1:
+        raise NotImplementedError()
+
+    path = os.path.join(data_root(), 'tzanetakis','feat_mfcc16_540_1.stat.amat')
+    all_input = AMat(path=path).input
+    n_examples = 1000 * segments_per_song
+    assert all_input.shape == (n_examples, 152)
+    # labels are built as 100 consecutive rows per class: 0,...,0,1,...,1,...,9
+    all_targ = numpy.repeat(numpy.arange(10), 100 * segments_per_song)
+    if not include_covariance:
+        all_input = all_input[:,0:16]
+
+    # Shuffle rows and labels with ONE permutation, so the pairing cannot drift apart
+    # and the array owned by the AMat object is not modified in place.
+    assert all_input.shape[0] == all_targ.shape[0]
+    order = numpy.random.RandomState(random_split + 1).permutation(n_examples)
+    all_input, all_targ = all_input[order], all_targ[order]
+
+    rval = Dataset()
+    rval.train = Dataset.Obj(x=all_input[0:ntrain], y=all_targ[0:ntrain])
+    valid_end = ntrain + nvalid
+    rval.valid = Dataset.Obj(x=all_input[ntrain:valid_end], y=all_targ[ntrain:valid_end])
+    rval.test = Dataset.Obj(x=all_input[valid_end:valid_end+ntest],
+            y=all_targ[valid_end:valid_end+ntest])
+    rval.n_classes = 10
+    return rval
+
+
+
+
+def mnist_factory(variant="", ntrain=None, nvalid=None, ntest=None):
+    """Build an MNIST Dataset for a named variant: "", "1k", "10k" or "custom".
+
+    For "custom", sizes left as None fall back to train_valid_test()'s own defaults.
+    """
+    presets = {"": {}, "1k": dict(ntrain=1000, nvalid=200, ntest=200),
+            "10k": dict(ntrain=10000, nvalid=2000, ntest=2000),
+            "custom": dict(ntrain=ntrain, nvalid=nvalid, ntest=ntest)}
+    if variant not in presets:
+        raise Exception('Unknown MNIST variant', variant)
+    return train_valid_test(**dict((k, v) for k, v in presets[variant].items() if v is not None))
--- a/pylearn/dbdict/newstuff.py	Thu Jan 15 17:12:36 2009 -0500
+++ b/pylearn/dbdict/newstuff.py	Thu Jan 15 22:23:43 2009 -0500
@@ -247,6 +247,10 @@
         self.state = state
         self.feedback = None
 
+        #TODO: make this a property and disallow changing it during a with block
+        self.catch_sigterm = True
+        self.catch_sigint = True
+
     def switch(self, message = None):
         feedback = self.feedback
         self.feedback = None
@@ -266,7 +270,7 @@
         self.state.dbdict.status = self.RUNNING
 
         v = self.COMPLETE
-        with self:
+        with self: #calls __enter__ and then __exit__
             try:
                 v = self.experiment(self.state, self)
             finally:
@@ -282,10 +286,12 @@
     def __enter__(self):
         # install a SIGTERM handler that asks the experiment function to return
         # the next time it will call switch()
-        self.prev_sigterm = signal.getsignal(signal.SIGTERM)
-        self.prev_sigint = signal.getsignal(signal.SIGINT)
-        signal.signal(signal.SIGTERM, self.on_sigterm)
-        signal.signal(signal.SIGINT, self.on_sigterm)
+        if self.catch_sigterm:
+            self.prev_sigterm = signal.getsignal(signal.SIGTERM)
+            signal.signal(signal.SIGTERM, self.on_sigterm)
+        if self.catch_sigint:
+            self.prev_sigint = signal.getsignal(signal.SIGINT)
+            signal.signal(signal.SIGINT, self.on_sigterm)
         return self
 
     def __exit__(self, type, value, tb_traceback, save = True):
@@ -294,10 +300,12 @@
                 raise type, value, tb_traceback
             except:
                 traceback.print_exc()
-        signal.signal(signal.SIGTERM, self.prev_sigterm)
-        signal.signal(signal.SIGINT, self.prev_sigint)
-        self.prev_sigterm = None
-        self.prev_sigint = None
+        if self.catch_sigterm:
+            signal.signal(signal.SIGTERM, self.prev_sigterm)
+            self.prev_sigterm = None
+        if self.catch_sigint:
+            signal.signal(signal.SIGINT, self.prev_sigint)
+            self.prev_sigint = None
         if save:
             self.save()
         return True
@@ -519,6 +527,8 @@
                           help = 'the working directory in which to run the experiment')
 parser_cmdline.add_option('-n', '--dry-run', action = 'store_true', dest = 'dry_run', default = False,
                           help = 'use this option to run the whole experiment in a temporary working directory (cleaned after use)')
+parser_cmdline.add_option('-2', '--sigint', action = 'store_true', dest = 'allow_sigint', default = False,
+        help = 'allow sigint (CTRL-C) to interrupt a process')
 
 def runner_cmdline(options, experiment, *strings):
     """
@@ -551,6 +561,7 @@
                               experiment, state,
                               redirect_stdout = options.redirect or options.redirect_stdout,
                               redirect_stderr = options.redirect or options.redirect_stderr)
+    channel.catch_sigint = not options.allow_sigint
     channel.run(force = options.force)
     if options.dry_run:
         shutil.rmtree(workdir, ignore_errors=True)
--- a/pylearn/io/amat.py	Thu Jan 15 17:12:36 2009 -0500
+++ b/pylearn/io/amat.py	Thu Jan 15 22:23:43 2009 -0500
@@ -1,4 +1,22 @@
-"""load PLearn AMat files"""
+"""load PLearn AMat files
+
+
+An AMat file stores a dense matrix in a plain ASCII text format.
+
+The format is not precisely defined, so I'll describe here a single recipe for making a valid
+file.
+
+.. code-block:: text
+    
+    #size: <rows> <cols>
+    #sizes: <input cols> <target cols> <weight cols> <extra cols 0> <extra cols 1> <extra cols ...>
+    number number number ....
+    number number number ....
+
+
+Tabs and spaces are both valid delimiters.  Newlines separate consecutive rows.
+
+"""
 
 import sys, numpy, array