changeset 737:838646fcf679

Official version of smallNORB, with stereo images of size 96x96.

Since the dataset is rather large (800+ MB), the NORB_small class dynamically
loads the train/valid or test datasets by overriding the __getattribute__
function.

Any access to the "train"/"valid" members will:
* if "train"/"valid" was not the last member accessed:
  * load the train data file from disk
  * resplit it into the same train/valid splits as before
* return a reference to the train or validation dataset

Any access to the "test" member will:
* if "test" was not the last member accessed:
  * load the test data file from disk
* return a reference to the test dataset

Two consecutive accesses to train/valid (or to test) therefore incur no
access-time penalty; alternating between .train and .test is very costly,
however. Before loading the new dataset, the object deletes its internal
references to the current one, in the hope that loading the test dataset will
"replace" the train dataset in memory (and vice versa). For this to hold, the
user should avoid keeping duplicate references to the "train", "valid" or
"test" members (or should delete them manually before switching datasets).
A usage sketch is given after the diff below.

TODO: make dynamic loading optional
author desjagui@atchoum.iro.umontreal.ca
date Wed, 27 May 2009 21:28:25 -0400
parents 331f35215ea5
children 4c536c570957
files pylearn/datasets/norb_small.py
diffstat 1 files changed, 126 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/datasets/norb_small.py	Wed May 27 21:28:25 2009 -0400
@@ -0,0 +1,126 @@
+import os
+import numpy
+from ..io.filetensor import read
+from .config import data_root
+from .dataset import Dataset
+
+def load_file(info, normalize=True, downsample_amt=1, dtype='float64'):
+    """Load the smallNorb data into numpy matrices.
+
+    normalize=True divides the pixel values by 255, which makes sense in
+    conjunction with dtype='float32' or dtype='float64'.
+
+    """
+    # TODO: use subtensor access once it is implemented, e.g.
+    # subt = [numpy.arange(self.dim[0]),
+    #         numpy.arange(0, self.dim[1], downsample_amt),
+    #         numpy.arange(0, self.dim[2], downsample_amt)]
+
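+    # dat has shape (n_examples, 2, 96, 96): stereo pairs of 96x96 images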
+    dat = read(open(info['dat']))
+    if downsample_amt != 1:
+        dat = dat[:, :, ::downsample_amt, ::downsample_amt]
+    if dtype != 'int8':
+        dat = numpy.asarray(dat, dtype=dtype)
+    if normalize:
+        dat *= (1.0 / 255.0)
+
+    labels = read(open(info['cat']))
+
+    return dat, labels
+
+
+class NORB_small(object):
+
+    class Paths():
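+        """Filesystem locations of the original smallNORB data files."""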
+        dirpath = os.path.join(data_root(), 'norb_small', 'original')
+        train = {}
+        test = {}
+        train['dat'] = os.path.join(dirpath, 'smallnorb-5x46789x9x18x6x2x96x96-training-dat.mat')
+        train['cat'] = os.path.join(dirpath, 'smallnorb-5x46789x9x18x6x2x96x96-training-cat.mat')
+        test['dat']  = os.path.join(dirpath, 'smallnorb-5x01235x9x18x6x2x96x96-testing-dat.mat')
+        test['cat']  = os.path.join(dirpath, 'smallnorb-5x01235x9x18x6x2x96x96-testing-cat.mat')
+    path = Paths()
+
+    def __init__(self, ntrain=19440, nvalid=4860, ntest=24300,
+                 downsample_amt=1, seed=1, normalize=True,
+                 mode='stereo', dtype='float64'):
+
+        self.n_classes = 5
+        self.nsamples = 24300
+        self.img_shape = (2,96,96) if mode=='stereo' else (96,96)
+
+        self.ntrain = ntrain
+        self.nvalid = nvalid
+        self.ntest = ntest
+        self.downsample_amt = downsample_amt
+        self.normalize = normalize
+        self.dtype = dtype
+
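+        # a fixed seed yields the same permutation, hence the same
+        # train/valid split, every time the train file is reloaded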
+        rng = numpy.random.RandomState(seed)
+        self.indices = rng.permutation(self.nsamples)
+        self.itr  = self.indices[0:ntrain]
+        self.ival = self.indices[ntrain:ntrain+nvalid]
+        self.current = None
+ 
+    def load(self, dataset='train'):
+
+        if dataset == 'train' or dataset=='valid':
+            print 'accessing train or valid dataset'
+
+            if self.current != 'train':
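+                # drop the previously loaded test set reference so it can be
+                # garbage collected before the large train file is read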
+                if self.current: del self.dat1
+
+                print 'need to reload from train file'
+                dat, cat  = load_file(self.path.train, self.normalize,
+                                      self.downsample_amt, self.dtype)
+                
+                x = dat[self.itr,...].reshape(self.ntrain,-1)
+                y = cat[self.itr]
+                self.dat1 = Dataset.Obj(x=x, y=y) # training
+
+                x = dat[self.ival,...].reshape(self.nvalid,-1)
+                y = cat[self.ival]
+                self.dat2 = Dataset.Obj(x=x, y=y) # validation
+
+                del dat, cat, x, y
+
+            rval = self.dat1 if dataset=='train' else self.dat2 
+            self.current = 'train'
+
+        elif dataset=='test':
+
+            print 'retrieving test set'
+            if self.current!='test':
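+                # drop the train/valid references before loading the test file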
+                if self.current: del self.dat1, self.dat2
+
+                print 'need to reload from test file'
+                dat, cat = load_file(self.path.test, self.normalize,
+                                     self.downsample_amt, self.dtype)
+
+                x = dat.reshape(self.nsamples,-1)
+                y = cat
+                self.dat1 = Dataset.Obj(x=x, y=y)
+                
+                del dat, cat, x, y
+
+            rval = self.dat1
+            self.current = 'test'
+        else:
+            raise ValueError("Expected one of [train|valid|test]")
+
+        return rval
+
+    def __getattribute__(self, name):
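+        # route .train/.valid/.test through load(), which caches the most
+        # recently used data file; all other attributes are looked up normally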
+        if name in ('train','valid','test'):
+            return object.__getattribute__(self, 'load')(name)
+        else:
+            return object.__getattribute__(self, name)
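
A minimal usage sketch of the lazy loading described in the commit message,
assuming the four smallNORB .mat files are installed under
data_root()/norb_small/original (variable names are illustrative only):

    from pylearn.datasets.norb_small import NORB_small

    dataset = NORB_small(downsample_amt=2, dtype='float32')

    train = dataset.train   # reads the training file, splits train/valid
    valid = dataset.valid   # cached: no reload, identical split
    print train.x.shape, train.y.shape

    # local references keep the training arrays alive; drop them first so
    # that loading the test file can actually replace them in memory
    del train, valid
    test = dataset.test     # deletes the cached train/valid, reads test file
    test = dataset.test     # cached: no reload

    train = dataset.train   # costly: reloads the training file from disk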