changeset 855:553bf0861fb5

adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and Olshausen's image patches
author desjagui@opale.iro.umontreal.ca
date Mon, 09 Nov 2009 14:41:00 -0500
parents 62df19a86359
children 0cfbaf0c598d 6298876b2b01
files pylearn/datasets/cifar10.py pylearn/datasets/faces.py pylearn/datasets/image_patches.py
diffstat 3 files changed, 148 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/datasets/cifar10.py	Mon Nov 09 14:41:00 2009 -0500
@@ -0,0 +1,61 @@
+"""
+Various routines to load/access CIFAR-10 data.
+"""
+from __future__ import absolute_import
+
+import os
+import numpy
+import cPickle
+
+from pylearn.datasets.config import data_root # config
+from pylearn.datasets.dataset import Dataset
+
+def unpickle(file):
+    """
+    Load one pickled CIFAR-10 batch file and return the unpickled object.
+
+    :param file: name of the batch file, looked up under
+        <data_root>/cifar10/cifar-10-batches-py.
+    :returns: the object stored in the pickle (callers in this module index
+        it with the 'data' and 'labels' keys).
+
+    NOTE(review): cPickle can run arbitrary code while loading; only point
+    data_root() at trusted dataset files.
+    """
+    path = os.path.join(data_root(), 'cifar10', 'cifar-10-batches-py')
+    fname = os.path.join(path, file)
+    print 'loading file %s' % fname
+    fo = open(fname, 'rb')
+    try:
+        # try/finally so the handle is released even if unpickling fails
+        batch = cPickle.load(fo)
+    finally:
+        fo.close()
+    return batch
+
+class cifar10(object):
+    """
+    CIFAR-10 loader exposing train / valid / test splits.
+
+    Attributes set by the constructor:
+      img_shape -- (3, 32, 32)
+      img_size  -- product of img_shape
+      n_classes -- 10
+      train, valid, test -- Dataset.Obj instances with `x` and `y` fields
+    """
+
+    def __init__(self, dtype='uint8', ntrain=40000, nvalid=10000, ntest=10000):
+        """
+        Read the batch files through unpickle() and build the three splits.
+
+        :param dtype: numpy dtype used for the x and y arrays of every split.
+        :param ntrain: number of training examples (taken first).
+        :param nvalid: number of validation examples (taken right after the
+            training examples).
+        :param ntest: number of examples taken from the start of 'test_batch'.
+
+        Raises AssertionError if ntrain + nvalid > 50000 or ntest > 10000.
+        """
+        assert ntrain + nvalid <= 50000
+        assert ntest <= 10000
+
+        self.img_shape = (3,32,32)
+        self.img_size = numpy.prod(self.img_shape)
+        self.n_classes = 10
+
+        # The code assumes every training batch file holds 10000 examples.
+        batch_size = 10000
+        # Number of batch files needed to cover train + valid.  Kept as an
+        # int so it can be used as an array dimension and a slice bound.
+        nbatches = int(numpy.ceil((ntrain + nvalid) / float(batch_size)))
+        lenx = nbatches * batch_size
+        x = numpy.zeros((lenx, self.img_size), dtype=dtype)
+        y = numpy.zeros(lenx, dtype=dtype)
+
+        fnames = ['data_batch_%i' % i for i in range(1, 6)]
+
+        # Load train and validation data.  Only the batches that fit in the
+        # buffer are read; ntest plays no role here because the test split
+        # comes from its own file.
+        for i, fname in enumerate(fnames[:nbatches]):
+            data = unpickle(fname)
+            start = i * batch_size
+            stop = start + batch_size
+            x[start:stop, :] = data['data']
+            y[start:stop] = data['labels']
+
+        self.train = Dataset.Obj(x=x[0:ntrain], y=y[0:ntrain])
+        self.valid = Dataset.Obj(x=x[ntrain:ntrain+nvalid],
+                                 y=y[ntrain:ntrain+nvalid])
+
+        # Load test data, converted to `dtype` arrays like the other splits.
+        data = unpickle('test_batch')
+        self.test = Dataset.Obj(
+            x=numpy.asarray(data['data'][0:ntest], dtype=dtype),
+            y=numpy.asarray(data['labels'][0:ntest], dtype=dtype))
+
+    def preprocess(self, x):
+        """Return x divided by 255 as float64."""
+        return numpy.float64( x *1.0 / 255.0)
+
+def first_1k(dtype='uint8', ntrain=1000, nvalid=200, ntest=200):
+    """
+    Build a cifar10 loader with small default split sizes
+    (1000 train / 200 valid / 200 test).
+
+    All arguments are forwarded unchanged to the cifar10 constructor.
+    """
+    small = cifar10(dtype, ntrain, nvalid, ntest)
+    return small
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/datasets/faces.py	Mon Nov 09 14:41:00 2009 -0500
@@ -0,0 +1,45 @@
+"""
+Various routines to load/access faces datasets.
+"""
+from __future__ import absolute_import
+
+import os
+import numpy
+import pylab as pl
+from .config import data_root # config
+from .dataset import Dataset
+
+def att(path=None, randomize=True, normalize=True):
+    """
+    Load the AT&T (ORL) face images into a Dataset.
+
+    Reads s<subject>/<n>.pgm for 40 subjects with 10 pictures each and stores
+    every image as one flattened row of 112 * 92 values.
+
+    :param path: directory containing the s1 ... s40 sub-directories;
+        defaults to <data_root>/faces/att/orl_faces.
+    :param randomize: if True, shuffle the examples (x and y together) with
+        a random permutation; if False, keep them in subject/picture order.
+    :param normalize: if True, multiply the pixel values by 1/255.
+    :returns: Dataset with `train` holding all 400 examples, `valid` and
+        `test` set to None, `n_classes` = 40 and `img_shape` = (112, 92).
+    """
+    if path is None:
+        path = os.path.join(data_root(), 'faces','att','orl_faces')
+
+    h, w = 112, 92
+    nsubjects = 40
+    npics = 10
+
+    x = numpy.zeros((nsubjects * npics, h * w))
+    y = numpy.zeros(nsubjects * npics)
+
+    for sid in range(nsubjects):
+        sdir = os.path.join(path, 's%i'%(sid+1))
+        for n in range(npics):
+            row = sid*npics + n
+            img = pl.imread(os.path.join(sdir,'%i.pgm'%(n+1)))
+            # Rows are reversed before flattening -- presumably to undo a
+            # vertical flip in what imread returns for these files (confirm).
+            x[row,:] = img[::-1,:].flatten()
+            # the label is the zero-based subject index
+            y[row] = sid
+
+    if normalize:
+        x *= (1.0 / 255.0)
+
+    # Shuffle only when requested, so randomize=False gives a stable order.
+    if randomize:
+        perm = numpy.random.permutation(len(x))
+        x, y = x[perm,:], y[perm]
+
+    rval = Dataset()
+    rval.n_classes = nsubjects
+    rval.img_shape = (h, w)
+    rval.train = Dataset.Obj(x=x, y=y)
+
+    # Not sure how well dataset lends itself to classification (only 400 images!)
+    # therefore not too sure it makes sense to have a train/test split
+    rval.valid = None
+    rval.test = None
+
+    return rval
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/datasets/image_patches.py	Mon Nov 09 14:41:00 2009 -0500
@@ -0,0 +1,42 @@
+"""
+Routines to load/access Olshausen's image_patches
+"""
+
+import os
+import numpy
+
+from .config import data_root
+from .dataset import Dataset
+
+# Directory holding the patch files; resolved from data_root() at import time.
+dirpath = os.path.join(
+    data_root(), 'image_patches', 'olshausen', 'smaller_patches')
+
+# variant name -> (file name inside dirpath, image shape of one patch)
+paths = {
+    '20by20_whiten_01': ('natural_images_patches_whiten.amat', (20, 20)),
+    '12by12_whiten_01': ('natural_images_patches_whiten_12_by_12_0_1.amat',
+                         (12, 12)),
+}
+
+def load_dataset(ntrain=70000, nvalid=15000, ntest=15000, variant='20by20_whiten_01'):
+    """
+    Load one variant of the image patches and split it at random into
+    disjoint train / valid / test sets.
+
+    :param ntrain: number of training examples.
+    :param nvalid: number of validation examples.
+    :param ntest: number of test examples.
+    :param variant: key of the module-level `paths` table selecting the file
+        to read and the patch shape.
+    :returns: Dataset with `train`, `valid` and `test` (Dataset.Obj with
+        `x` and `y`), plus `n_classes` and `img_shape`.
+    :raises ValueError: if `variant` is unknown or the requested split sizes
+        add up to more than the 100000 examples assumed to be in the file.
+    """
+    # NOTE(review): the example count is hard-coded; confirm that every
+    # variant file really holds this many rows.
+    ndata = 100000
+
+    if variant not in paths:
+        raise ValueError('Unknown image_patches variant: %s' % variant)
+    if ntrain + nvalid + ntest > ndata:
+        raise ValueError('ntrain + nvalid + ntest must not exceed %i' % ndata)
+
+    fname = os.path.join(dirpath, paths[variant][0])
+    data = numpy.loadtxt(fname)
+    # the last column is used as the target, everything before it as input
+    x = data[:,:-1]
+    y = data[:,-1]
+
+    # Cut one random permutation into three consecutive, non-overlapping
+    # index ranges so that no example lands in two splits.
+    perm = numpy.random.permutation(ndata)
+    valid_start = ntrain
+    test_start = ntrain + nvalid
+    train_idx = perm[:valid_start]
+    valid_idx = perm[valid_start:test_start]
+    test_idx = perm[test_start:test_start + ntest]
+
+    rval = Dataset()
+    rval.train = Dataset.Obj(x = x[train_idx,:], y = y[train_idx])
+    rval.valid = Dataset.Obj(x = x[valid_idx,:], y = y[valid_idx])
+    rval.test  = Dataset.Obj(x = x[test_idx,:], y = y[test_idx])
+
+    # NOTE(review): 10 classes looks copied from another loader -- confirm
+    # it is meaningful for this dataset.
+    rval.n_classes = 10
+    rval.img_shape = paths[variant][1]
+
+    return rval