pylearn changeset 737:838646fcf679
Official version of smallNORB, with stereo images of size 96x96
Since the dataset is rather large (800+ MB), the NORB_small class dynamically
loads the train/valid or test datasets by overriding the __getattribute__
method.
Any access to "train"/"valid" members, will:
* if "train"/"valid" was not the member which was last accessed...
* load the train data file from memory
* resplit into the same train/valid splits as before
* return reference to train or validation dataset
Any access to the "test" member, will:
* if "test" was not the member which was last accessed...
* load the test data file from memory
* return reference to test dataset
Two consecutive accesses to train/valid (or to test) therefore incur no
access-time penalty. Alternating between .train and .test, however, is very
costly.
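The interception described above boils down to routing three attribute names
through a loader. A minimal sketch of the pattern (the class name LazySplits
and the load stub are hypothetical; the actual implementation is in the diff
below):

    class LazySplits(object):
        """Sketch of the attribute-interception pattern (illustrative only)."""

        def __getattribute__(self, name):
            # Route the three dataset members through the loader; all other
            # attribute lookups proceed normally.
            if name in ('train', 'valid', 'test'):
                return object.__getattribute__(self, 'load')(name)
            return object.__getattribute__(self, name)

        def load(self, which):
            # The real class loads from disk here and caches the result.
            pass

Note that object.__getattribute__ is used to fetch the load method itself,
keeping the interception confined to the three dataset names.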
Before loading a new dataset, the object deletes all of its local references
to the previous one. This is done in the hope that loading the test dataset
will "replace" the train dataset in memory (and vice versa). For this to work,
however, the user should avoid keeping duplicate references to the "train",
"valid" or "test" members (or delete them manually before switching datasets).
TODO: make dynamic loading optional
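For illustration, a typical access pattern for the NORB_small class defined in
the diff below (assuming the .mat files are installed under
data_root()/norb_small/original):

    from pylearn.datasets.norb_small import NORB_small

    norb = NORB_small(downsample_amt=2, dtype='float32')
    train = norb.train      # loads the training file from disk
    valid = norb.valid      # cheap: same file is already in memory
    test = norb.test        # costly: train/valid are dropped, test file is read
    train2 = norb.train     # costly again: reloads the training file
    # The local names above are exactly the duplicate references warned about;
    # delete them before switching splits so the old data can actually be freed.
    del train, valid, train2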
author   | desjagui@atchoum.iro.umontreal.ca
date     | Wed, 27 May 2009 21:28:25 -0400
parents  | 331f35215ea5
children | 4c536c570957
files    | pylearn/datasets/norb_small.py
diffstat | 1 files changed, 118 insertions(+), 0 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/datasets/norb_small.py	Wed May 27 21:28:25 2009 -0400
@@ -0,0 +1,118 @@
+import os
+import numpy
+from ..io.filetensor import read
+from .config import data_root
+from .dataset import Dataset
+
+def load_file(info, normalize=True, downsample_amt=1, dtype='float64'):
+    """Load the smallNORB data into numpy matrices.
+
+    normalize=True will divide the values by 255, which makes sense in
+    conjunction with dtype='float32' or dtype='float64'.
+
+    """
+    # NotImplementedError: subtensor access not written yet
+    #subt = [numpy.arange(self.dim[0]),
+    #        numpy.arange(0, self.dim[1], downsample_amt),
+    #        numpy.arange(0, self.dim[2], downsample_amt)]
+
+    dat = read(open(info['dat']))
+    if downsample_amt != 1:
+        dat = dat[:, :, ::downsample_amt, ::downsample_amt]
+    if dtype != 'int8':
+        dat = numpy.asarray(dat, dtype=dtype)
+    if normalize:
+        dat *= (1.0 / 255.0)
+
+    labels = read(open(info['cat']))
+
+    return dat, labels
+
+
+class NORB_small(object):
+
+    class Paths(object):
+        dirpath = os.path.join(data_root(), 'norb_small', 'original')
+        train = {}
+        test = {}
+        train['dat'] = os.path.join(dirpath, 'smallnorb-5x46789x9x18x6x2x96x96-training-dat.mat')
+        train['cat'] = os.path.join(dirpath, 'smallnorb-5x46789x9x18x6x2x96x96-training-cat.mat')
+        test['dat'] = os.path.join(dirpath, 'smallnorb-5x01235x9x18x6x2x96x96-testing-dat.mat')
+        test['cat'] = os.path.join(dirpath, 'smallnorb-5x01235x9x18x6x2x96x96-testing-cat.mat')
+    path = Paths()
+
+    def __init__(self, ntrain=19440, nvalid=4860, ntest=24300,
+                 downsample_amt=1, seed=1, normalize=True,
+                 mode='stereo', dtype='float64'):
+
+        self.n_classes = 5
+        self.nsamples = 24300
+        self.img_shape = (2, 96, 96) if mode == 'stereo' else (96, 96)
+
+        self.ntrain = ntrain
+        self.nvalid = nvalid
+        self.ntest = ntest
+        self.downsample_amt = downsample_amt
+        self.normalize = normalize
+        self.dtype = dtype
+
+        # fixed random split of the 24300 training examples into train/valid
+        rng = numpy.random.RandomState(seed)
+        self.indices = rng.permutation(self.nsamples)
+        self.itr = self.indices[0:ntrain]
+        self.ival = self.indices[ntrain:ntrain + nvalid]
+        self.current = None
+
+    def load(self, dataset='train'):
+
+        if dataset == 'train' or dataset == 'valid':
+            print 'accessing train or valid dataset'
+
+            if self.current != 'train':
+                # drop the test set (if loaded) before reading the train file
+                if self.current: del self.dat1
+
+                print 'need to reload from train file'
+                dat, cat = load_file(self.path.train, self.normalize,
+                                     self.downsample_amt, self.dtype)
+
+                x = dat[self.itr, ...].reshape(self.ntrain, -1)
+                y = cat[self.itr]
+                self.dat1 = Dataset.Obj(x=x, y=y)  # training
+
+                x = dat[self.ival, ...].reshape(self.nvalid, -1)
+                y = cat[self.ival]
+                self.dat2 = Dataset.Obj(x=x, y=y)  # validation
+
+                del dat, cat, x, y
+
+            rval = self.dat1 if dataset == 'train' else self.dat2
+            self.current = 'train'
+
+        elif dataset == 'test':
+            print 'retrieving test set'
+
+            if self.current != 'test':
+                # drop train/valid (if loaded) before reading the test file
+                if self.current: del self.dat1, self.dat2
+
+                print 'need to reload from test file'
+                dat, cat = load_file(self.path.test, self.normalize,
+                                     self.downsample_amt, self.dtype)
+
+                x = dat.reshape(self.nsamples, -1)
+                y = cat
+                self.dat1 = Dataset.Obj(x=x, y=y)
+
+                del dat, cat, x, y
+
+            rval = self.dat1
+            self.current = 'test'
+        else:
+            raise ValueError("Expected one of [train|valid|test]")
+
+        return rval
+
+    def __getattribute__(self, name):
+        # intercept the three dataset members; everything else is a
+        # normal attribute lookup
+        if name in ('train', 'valid', 'test'):
+            return object.__getattribute__(self, 'load')(name)
+        else:
+            return object.__getattribute__(self, name)
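For reference, the strided slicing in load_file keeps every
downsample_amt-th pixel along the two image axes. A quick shape check on
dummy data with the smallNORB layout (examples, stereo pair, rows, cols),
not the real files:

    import numpy
    dat = numpy.zeros((10, 2, 96, 96), dtype='int8')
    print dat[:, :, ::2, ::2].shape   # -> (10, 2, 48, 48)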