changeset 640:af14b1f32882

revised tzanetakis, added data centering
author James Bergstra <bergstrj@iro.umontreal.ca>
date Sat, 24 Jan 2009 17:24:55 -0500
parents 83397981a118
children ac6e7ce28f70
files pylearn/datasets/tzanetakis.py
diffstat 1 files changed, 14 insertions(+), 55 deletions(-) [+]
line wrap: on
line diff
--- a/pylearn/datasets/tzanetakis.py	Wed Jan 21 16:02:07 2009 -0500
+++ b/pylearn/datasets/tzanetakis.py	Sat Jan 24 17:24:55 2009 -0500
@@ -11,48 +11,17 @@
 from .config import data_root
 from .dataset import dataset_factory, Dataset
 
-def head(n=10, path=None):
-    """Load the first MNIST examples.
-
-    Returns two matrices: x, y.  x has N rows of 784 columns.  Each row of x represents the
-    28x28 grey-scale pixels in raster order.  y is a vector of N integers.  Each element y[i]
-    is the label of the i'th row of x.
-    
-    """
-    path = os.path.join(data_root(), 'mnist','mnist_with_header.amat') if path is None else path
-
-    dat = AMat(path=path, head=n)
-
-    try:
-        assert dat.input.shape[0] == n
-        assert dat.target.shape[0] == n
-    except Exception , e:
-        raise Exception("failed to read MNIST data", (dat, e))
-
-    return dat.input, numpy.asarray(dat.target, dtype='int64').reshape(dat.target.shape[0])
-
-def all(path=None):
-    return head(n=None, path=path)
-
-def train_valid_test(ntrain=50000, nvalid=10000, ntest=10000, path=None):
-    all_x, all_targ = head(ntrain+nvalid+ntest, path=path)
-
-    rval = Dataset()
-
-    rval.train = Dataset.Obj(x=all_x[0:ntrain],
-            y=all_targ[0:ntrain])
-    rval.valid = Dataset.Obj(x=all_x[ntrain:ntrain+nvalid],
-            y=all_targ[ntrain:ntrain+nvalid])
-    rval.test =  Dataset.Obj(x=all_x[ntrain+nvalid:ntrain+nvalid+ntest],
-            y=all_targ[ntrain+nvalid:ntrain+nvalid+ntest])
-
-    rval.n_classes = 10
-    rval.img_shape = (28,28)
+def centre_data(x, inplace=False):
+    rval = x if inplace else x.copy()
+    #zero-mean
+    rval -= numpy.mean(rval, axis=0)
+    #unit-variance
+    rval *= 1.0 / (1.0e-6 + numpy.std(rval, axis=0))
     return rval
 
-
 def mfcc16(segments_per_song = 1, include_covariance = True, random_split = 0,
-        ntrain = 700, nvalid = 100, ntest = 200):
+        ntrain = 700, nvalid = 100, ntest = 200,
+        normalize=True):
     if segments_per_song != 1:
         raise NotImplementedError()
 
@@ -75,11 +44,14 @@
     #construct a dataset to return
     rval = Dataset()
 
-    rval.train = Dataset.Obj(x=all_input[0:ntrain],
+    def prepx(x):
+        return centre_data(x, inplace=True) if normalize else x
+
+    rval.train = Dataset.Obj(x=prepx(all_input[0:ntrain]),
             y=all_targ[0:ntrain])
-    rval.valid = Dataset.Obj(x=all_input[ntrain:ntrain+nvalid],
+    rval.valid = Dataset.Obj(x=prepx(all_input[ntrain:ntrain+nvalid]),
             y=all_targ[ntrain:ntrain+nvalid])
-    rval.test =  Dataset.Obj(x=all_input[ntrain+nvalid:ntrain+nvalid+ntest],
+    rval.test =  Dataset.Obj(x=prepx(all_input[ntrain+nvalid:ntrain+nvalid+ntest]),
             y=all_targ[ntrain+nvalid:ntrain+nvalid+ntest])
 
     rval.n_classes = 10
@@ -87,16 +59,3 @@
     return rval
 
 
-
-
-def mnist_factory(variant="", ntrain=None, nvalid=None, ntest=None):
-    if variant=="":
-        return train_valid_test()
-    elif variant=="1k":
-        return train_valid_test(ntrain=1000, nvalid=200, ntest=200)
-    elif variant=="10k":
-        return train_valid_test(ntrain=10000, nvalid=2000, ntest=2000)
-    elif variant=="custom":
-        return train_valid_test(ntrain=ntrain, nvalid=nvalid, ntest=ntest)
-    else:
-        raise Exception('Unknown MNIST variant', variant)