changeset 755:6a703c5f2391

Prepare the way for the new train/valid split, based on object instances.
author Pascal Lamblin <lamblinp@iro.umontreal.ca>
date Tue, 02 Jun 2009 21:15:21 -0400
parents 84d22b7d835a
children 8447bc9bb2d4
files pylearn/datasets/norb_small.py
diffstat 1 files changed, 25 insertions(+), 7 deletions(-) [+]
line wrap: on
line diff
--- a/pylearn/datasets/norb_small.py	Tue Jun 02 13:51:22 2009 -0400
+++ b/pylearn/datasets/norb_small.py	Tue Jun 02 21:15:21 2009 -0400
@@ -63,11 +63,14 @@
         test = {}
         train['dat'] = os.path.join(dirpath, 'smallnorb-5x46789x9x18x6x2x96x96-training-dat.mat')
         train['cat'] = os.path.join(dirpath, 'smallnorb-5x46789x9x18x6x2x96x96-training-cat.mat')
+        train['info'] = os.path.join(dirpath, 'smallnorb-5x46789x9x18x6x2x96x96-training-info.mat')
         test['dat']  = os.path.join(dirpath, 'smallnorb-5x01235x9x18x6x2x96x96-testing-dat.mat')
         test['cat']  = os.path.join(dirpath, 'smallnorb-5x01235x9x18x6x2x96x96-testing-cat.mat')
+        test['info']  = os.path.join(dirpath, 'smallnorb-5x01235x9x18x6x2x96x96-testing-info.mat')
     path = Paths()
 
-    def __init__(self, ntrain=19440, nvalid=4860, ntest=24300, 
+    def __init__(self, ntrain=19440, nvalid=4860, ntest=24300,
+               valid_variant=None,
                downsample_amt=1, seed=1, normalize=False,
                mode='stereo', dtype='int8'):
 
@@ -83,11 +86,26 @@
         self.dtype = dtype
 
         rng = numpy.random.RandomState(seed)
-        self.indices = rng.permutation(self.nsamples)
-        self.itr  = self.indices[0:ntrain]
-        self.ival = self.indices[ntrain:ntrain+nvalid]
+        if valid_variant is None:
+            # The validation set is just a random subset of training
+            self.indices = rng.permutation(self.nsamples)
+            self.itr  = self.indices[0:ntrain]
+            self.ival = self.indices[ntrain:ntrain+nvalid]
+        elif valid_variant in (4,6,7,8,9):
+            # The validation set consists in an instance of each category
+            # In order to know which indices correspond to which instance,
+            # we need to load the 'info' files.
+            train_info = read(open(train['info']))
+
+            ordered_itrain = numpy.nonzero(train_info[:,0] != valid_variant)[0]
+            ordered_ivalid = numpy.nonzero(train_info[:,0] == valid_variant)[0]
+
+            # TODO: randomize
+            self.itr = ordered_itrain
+            self.ival = ordered_ivalid
+
         self.current = None
- 
+
     def load(self, dataset='train'):
 
         if dataset == 'train' or dataset=='valid':
@@ -99,7 +117,7 @@
                 print 'need to reload from train file'
                 dat, cat  = load_file(self.path.train, self.normalize,
                                       self.downsample_amt, self.dtype)
-                
+
                 x = dat[self.itr,...].reshape(self.ntrain,-1)
                 y = cat[self.itr]
                 self.dat1 = Dataset.Obj(x=x, y=y) # training
@@ -126,7 +144,7 @@
                 x = dat.reshape(self.nsamples,-1)
                 y = cat
                 self.dat1 = Dataset.Obj(x=x, y=y)
-                
+
                 del dat, cat, x, y
 
             rval = self.dat1