# HG changeset patch
# User desjagui@opale.iro.umontreal.ca
# Date 1257795660 18000
# Node ID 553bf0861fb5bd8a67ff1e94c67c01d38dedcd8a
# Parent 62df19a86359c4d9ee3d2b9c582dcb9c603b33ff
adding simple cifar10 dataset (bypassing dataset_ops), AT&T face dataset and Olshausen's image patches

diff -r 62df19a86359 -r 553bf0861fb5 pylearn/datasets/cifar10.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/datasets/cifar10.py Mon Nov 09 14:41:00 2009 -0500
@@ -0,0 +1,87 @@
+"""
+Various routines to load/access CIFAR-10 data.
+"""
+from __future__ import absolute_import
+
+import os
+import numpy
+import cPickle
+
+from pylearn.datasets.config import data_root # config
+from pylearn.datasets.dataset import Dataset
+
+def unpickle(file):
+    """
+    Load one pickled CIFAR-10 batch file and return the unpickled object.
+
+    :param file: name of the batch file, looked up in
+        data_root()/cifar10/cifar-10-batches-py
+    """
+    path = os.path.join(data_root(), 'cifar10', 'cifar-10-batches-py')
+    fname = os.path.join(path, file)
+    print 'loading file %s' % fname
+    fo = open(fname, 'rb')
+    try:
+        # NOTE: unpickling runs arbitrary code; only load trusted files
+        batch = cPickle.load(fo)
+    finally:
+        # close the file even when unpickling fails
+        fo.close()
+    return batch
+
+class cifar10(object):
+    """
+    CIFAR-10 images split into train / valid / test sets.
+
+    The train and valid sets are taken, in order, from the 'data_batch_*'
+    files; the test set comes from 'test_batch'.
+    """
+
+    def __init__(self, dtype='uint8', ntrain=40000, nvalid=10000, ntest=10000):
+        """
+        :param dtype: dtype of the x and y arrays of every split
+        :param ntrain: number of training examples
+        :param nvalid: number of validation examples (ntrain + nvalid <= 50000)
+        :param ntest: number of test examples (<= 10000)
+        """
+        assert ntrain + nvalid <= 50000
+        assert ntest <= 10000
+
+        self.img_shape = (3,32,32)
+        self.img_size = numpy.prod(self.img_shape)
+        self.n_classes = 10
+
+        # Each data_batch file is taken to hold 10000 examples; only load as
+        # many files as the train and valid sets need.
+        batch_size = 10000
+        # int(): numpy.ceil returns a float, array shapes must be integers
+        nbatches = int(numpy.ceil((ntrain + nvalid) / float(batch_size)))
+        lenx = nbatches * batch_size
+        x = numpy.zeros((lenx,self.img_size), dtype=dtype)
+        y = numpy.zeros(lenx, dtype=dtype)
+
+        fnames = ['data_batch_%i'%i for i in range(1, nbatches + 1)]
+
+        # load train and validation data
+        for i, fname in enumerate(fnames):
+            data = unpickle(fname)
+            x[i*batch_size:(i+1)*batch_size, :] = data['data']
+            y[i*batch_size:(i+1)*batch_size] = data['labels']
+
+        self.train = Dataset.Obj(x=x[0:ntrain], y=y[0:ntrain])
+        self.valid = Dataset.Obj(x=x[ntrain:ntrain+nvalid],
+                                 y=y[ntrain:ntrain+nvalid])
+
+        # load test data, converted to the same dtype as train and valid
+        data = unpickle('test_batch')
+        self.test = Dataset.Obj(
+                x=numpy.asarray(data['data'][0:ntest], dtype=dtype),
+                y=numpy.asarray(data['labels'][0:ntest], dtype=dtype))
+
+    def preprocess(self, x):
+        """Divide x by 255 and return the result as float64."""
+        return numpy.float64( x *1.0 / 255.0)
+
+def first_1k(dtype='uint8', ntrain=1000, nvalid=200, ntest=200):
+    """Build a small cifar10 (1000 train / 200 valid / 200 test by default)."""
+    return cifar10(dtype, ntrain, nvalid, ntest)
diff -r 62df19a86359 -r 553bf0861fb5 pylearn/datasets/faces.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/datasets/faces.py Mon Nov 09 14:41:00 2009 -0500
@@ -0,0 +1,59 @@
+"""
+Various routines to load/access faces datasets.
+"""
+from __future__ import absolute_import
+
+import os
+import numpy
+import pylab as pl
+from .config import data_root # config
+from .dataset import Dataset
+
+def att(path=None, randomize=True, normalize=True):
+    """
+    Load the AT&T face images: 40 subjects with 10 pictures each.
+
+    :param path: directory holding the s1 ... s40 sub-directories; defaults
+        to data_root()/faces/att/orl_faces
+    :param randomize: if True, shuffle the examples with numpy.random
+    :param normalize: if True, scale the pixel values by 1/255
+    :returns: a Dataset whose `train` member holds all 400 images (one
+        flattened image per row of x) and the subject index of each (y)
+    """
+    path = os.path.join(data_root(), 'faces','att','orl_faces')\
+            if path is None else path
+
+    h, w = 112, 92
+    nsubjects = 40
+    npics = 10
+
+    x = numpy.zeros((nsubjects * npics, h * w))
+    y = numpy.zeros(nsubjects * npics)
+
+    for sid in range(nsubjects):
+        sdir = os.path.join(path, 's%i'%(sid+1))
+        for n in range(npics):
+            img = pl.imread(os.path.join(sdir,'%i.pgm'%(n+1)))
+            # reverse the row order of the image returned by imread
+            x[sid*npics + n,:] = img[::-1,:].flatten()
+            y[sid*npics + n] = sid
+
+    if normalize:
+        x *= (1.0 / 255.0)
+
+    # honor `randomize`: keep the loading order when it is False
+    if randomize:
+        perm = numpy.random.permutation(len(x))
+        x, y = x[perm,:], y[perm]
+
+    rval = Dataset()
+    rval.n_classes = nsubjects
+    rval.img_shape = (h, w)
+    rval.train = Dataset.Obj(x=x, y=y)
+
+    # Not sure how well dataset lends itself to classification (only 400 images!)
+    # therefore not too sure it makes sense to have a train/test split
+    rval.valid = None
+    rval.test = None
+
+    return rval
diff -r 62df19a86359 -r 553bf0861fb5 pylearn/datasets/image_patches.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/datasets/image_patches.py Mon Nov 09 14:41:00 2009 -0500
@@ -0,0 +1,57 @@
+"""
+Routines to load/access Olshausen's image_patches
+"""
+
+import os
+import numpy
+
+from .config import data_root
+from .dataset import Dataset
+
+dirpath = os.path.join(data_root(), 'image_patches','olshausen','smaller_patches')
+
+paths = {'20by20_whiten_01': ('natural_images_patches_whiten.amat',(20,20)),
+         '12by12_whiten_01': ('natural_images_patches_whiten_12_by_12_0_1.amat',(12,12))}
+
+def load_dataset(ntrain=70000, nvalid=15000, ntest=15000, variant='20by20_whiten_01'):
+    """
+    Load image patches and split them at random into train/valid/test sets.
+
+    :param ntrain: number of training examples
+    :param nvalid: number of validation examples
+    :param ntest: number of test examples
+    :param variant: key of `paths` selecting the file to load
+    :raises ValueError: if `variant` is unknown or if more than 100000
+        examples are requested in total
+    """
+
+    ndata = 100000
+
+    if variant not in paths:
+        raise ValueError('Unknown image_patches variant: %s' % variant)
+    if ntrain + nvalid + ntest > ndata:
+        raise ValueError('ntrain + nvalid + ntest must not exceed %i' %ndata)
+
+    fname = os.path.join(dirpath, paths[variant][0])
+    data = numpy.loadtxt(fname)
+    # every column but the last goes to x, the last column to y
+    x = data[:,:-1]
+    y = data[:,-1]
+
+    perm = numpy.random.permutation(ndata)
+    # consecutive slices of the permutation, so the three sets never overlap
+    train_idx = perm[:ntrain]
+    valid_idx = perm[ntrain:ntrain+nvalid]
+    test_idx = perm[ntrain+nvalid:ntrain+nvalid+ntest]
+
+    rval = Dataset()
+    rval.train = Dataset.Obj(x = x[train_idx,:], y = y[train_idx])
+    rval.valid = Dataset.Obj(x = x[valid_idx,:], y = y[valid_idx])
+    rval.test = Dataset.Obj(x = x[test_idx,:], y = y[test_idx])
+
+    # NOTE(review): 10 looks copied from a labelled dataset -- confirm it is
+    # meaningful for image patches
+    rval.n_classes = 10
+    rval.img_shape = paths[variant][1]
+
+    return rval