changeset 1465:490616262500

Adding datasets used in Hugo's NADE paper. Datasets have been converted from .libsvm format and stored under $PYLEARN_DATA_ROOT/larocheh, using numpy.save in uint8 format (data is all binary-valued).
author gdesjardins
date Wed, 20 Apr 2011 16:30:48 -0400
parents c9179b0ed002
children 4d6d6d4eab9e
files pylearn/datasets/nade.py
diffstat 1 files changed, 59 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/datasets/nade.py	Wed Apr 20 16:30:48 2011 -0400
@@ -0,0 +1,59 @@
+import os
+import numpy
+
+from pylearn.io.pmat import PMat
+from pylearn.datasets.config import data_root # config
+from pylearn.datasets.dataset import Dataset
+
+def load_dataset(name):
+    """
+    Load one of the datasets which were used in the following paper.
+    The Neural Autoregressive Distribution Estimator
+    Hugo Larochelle and Iain Murray, AISTATS 2011
+
+    :param name: string specifying which dataset to load (see the assert below)
+    :return: Dataset object
+    dataset.train.x: matrix of training data of shape (num_examples, ndim)
+    dataset.train.y: vector of training labels of length num_examples. Labels are
+                     integer valued and represent the class it belongs to.
+    dataset.valid.x, dataset.valid.y: idem for validation data
+    dataset.test.x, dataset.test.y: idem for test data
+    Each y is None when the corresponding *_labels.npy file does not exist.
+    Files are read from the directory <data_root()>/larocheh/<name>/.
+
+    WARNING: class labels are integer-valued instead of 1-of-n encoding !
+    """
+    assert name in ['adult','binarized_mnist', 'mnist', 'connect4','dna',
+                    'mushrooms','nips','ocr_letters','rcv1','web']
+    rval = Dataset()
+    
+    path = os.path.join(data_root(), 'larocheh', name)
+
+    # load training set (labels are optional: y stays None without a labels file)
+    x=numpy.load(os.path.join(path,'train_data.npy'))
+    y_fname = os.path.join(path, 'train_labels.npy')
+    if os.path.exists(y_fname):
+        y = numpy.load(os.path.join(path,'train_labels.npy'))
+    else:
+        y = None
+    rval.train = Dataset.Obj(x=x, y=y)
+ 
+    # load validation set (same optional-labels handling as the training set)
+    x=numpy.load(os.path.join(path,'valid_data.npy'))
+    y_fname = os.path.join(path, 'valid_labels.npy')
+    if os.path.exists(y_fname):
+        y = numpy.load(os.path.join(path,'valid_labels.npy'))
+    else:
+        y = None
+    rval.valid = Dataset.Obj(x=x, y=y)
+                             
+    # load test set (same optional-labels handling as the training set)
+    x=numpy.load(os.path.join(path,'test_data.npy'))
+    y_fname = os.path.join(path, 'test_labels.npy')
+    if os.path.exists(y_fname):
+        y = numpy.load(os.path.join(path,'test_labels.npy'))
+    else:
+        y = None
+    rval.test = Dataset.Obj(x=x, y=y)
+
+    return rval