diff pylearn/datasets/MNIST.py @ 537:b054271b2504

new file structure layout, factories, etc.
author James Bergstra <bergstrj@iro.umontreal.ca>
date Wed, 12 Nov 2008 21:57:54 -0500
parents datasets/MNIST.py@58810b63292b
children 16f91ca016b1
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/datasets/MNIST.py	Wed Nov 12 21:57:54 2008 -0500
@@ -0,0 +1,64 @@
+"""
+Various routines to load/access MNIST data.
+"""
+from __future__ import absolute_import
+
+import os
+import numpy
+
+from ..io.amat import AMat
+from .config import data_root
+from .dataset import dataset_factory, Dataset
+
+def head(n=10, path=None):
+    """Load the first `n` MNIST examples.
+
+    :param n: number of examples to read from the start of the file.  The value is handed
+        to AMat as `head=n`.  all() passes None here (NOTE(review): presumably AMat treats
+        head=None as "no limit" -- confirm against AMat).
+    :param path: location of the .amat file.  When None, it defaults to
+        <data_root()>/mnist/mnist_with_header.amat.
+
+    Returns two matrices: x, y.  x has N rows of 784 columns.  Each row of x represents the
+    28x28 grey-scale pixels in raster order.  y is a vector of N int64 labels.  Each element
+    y[i] is the label of the i'th row of x.
+
+    Raises Exception("failed to read MNIST data", (dat, cause)) when the loaded inputs or
+    targets do not have the expected number of rows.
+
+    """
+    if path is None:
+        path = os.path.join(data_root(), 'mnist', 'mnist_with_header.amat')
+
+    dat = AMat(path=path, head=n)
+
+    try:
+        n_inputs = dat.input.shape[0]
+        n_targets = dat.target.shape[0]
+        # all() calls head(n=None).  There is no requested count to compare against in that
+        # case (a comparison with None could never succeed), so only require that inputs
+        # and targets agree with each other.
+        if n is None:
+            expected = n_inputs
+        else:
+            expected = n
+        # Explicit checks rather than `assert`, which is stripped under `python -O`.
+        if n_inputs != expected or n_targets != expected:
+            raise ValueError('expected %s examples, got %s inputs and %s targets'
+                    % (expected, n_inputs, n_targets))
+    except Exception , e:
+        raise Exception("failed to read MNIST data", (dat, e))
+
+    return dat.input, numpy.asarray(dat.target, dtype='int64').reshape(n_targets)
+
+def all(path=None):
+    """Load MNIST through head() with no example count (n=None).
+
+    :param path: forwarded unchanged to head().
+
+    Returns whatever head() returns: the pair (x, y).
+
+    NOTE: inside this module the name `all` shadows the builtin of the same name.
+    """
+    x, y = head(n=None, path=path)
+    return x, y
+
+def train_valid_test(ntrain=50000, nvalid=10000, ntest=10000, path=None):
+    """Load the leading MNIST examples and split them into train/valid/test subsets.
+
+    The first ntrain+nvalid+ntest examples are read with head(); consecutive,
+    non-overlapping slices of them become the three subsets, in that order.
+
+    :param ntrain: number of examples in the training subset.
+    :param nvalid: number of examples in the validation subset.
+    :param ntest: number of examples in the test subset.
+    :param path: forwarded unchanged to head().
+
+    Returns a Dataset with attributes train, valid and test (each a Dataset.Obj with
+    fields x and y) and n_classes set to 10.
+    """
+    # Slice boundaries: [0, valid_start) is train, [valid_start, test_start) is valid,
+    # [test_start, stop) is test.
+    valid_start = ntrain
+    test_start = ntrain + nvalid
+    stop = test_start + ntest
+
+    inputs, labels = head(stop, path=path)
+
+    splits = Dataset()
+    splits.train = Dataset.Obj(x=inputs[:valid_start], y=labels[:valid_start])
+    splits.valid = Dataset.Obj(x=inputs[valid_start:test_start],
+            y=labels[valid_start:test_start])
+    splits.test = Dataset.Obj(x=inputs[test_start:stop], y=labels[test_start:stop])
+    splits.n_classes = 10
+
+    return splits
+
+
+
+@dataset_factory('MNIST')
+def mnist_factory(variant="", ntrain=None, nvalid=None, ntest=None):
+    """Build the MNIST train/valid/test Dataset for the named variant.
+
+    variant:
+        ""        train_valid_test() with its default sizes
+        "1k"      ntrain=1000, nvalid=200, ntest=200
+        "10k"     ntrain=10000, nvalid=2000, ntest=2000
+        "custom"  sizes given by ntrain, nvalid and ntest (all three are required)
+
+    ntrain, nvalid and ntest are only consulted for the "custom" variant.
+
+    Raises TypeError when variant is "custom" and any size is None, and
+    Exception('Unknown MNIST variant', variant) for an unrecognized variant.
+    """
+    if variant=="":
+        return train_valid_test()
+    elif variant=="1k":
+        return train_valid_test(ntrain=1000, nvalid=200, ntest=200)
+    elif variant=="10k":
+        return train_valid_test(ntrain=10000, nvalid=2000, ntest=2000)
+    elif variant=="custom":
+        # The size parameters default to None; without this check a missing size only
+        # surfaces as an opaque TypeError from the `ntrain+nvalid+ntest` arithmetic in
+        # train_valid_test.  Same exception type, but the message names the problem.
+        if ntrain is None or nvalid is None or ntest is None:
+            raise TypeError("MNIST variant 'custom' requires ntrain, nvalid and ntest",
+                    (ntrain, nvalid, ntest))
+        return train_valid_test(ntrain=ntrain, nvalid=nvalid, ntest=ntest)
+    else:
+        raise Exception('Unknown MNIST variant', variant)