changeset 854:62df19a86359

* added option for loading norb_small in mono instead of stereo (old mod) * added miniblocks.py dataset
author desjagui@opale.iro.umontreal.ca
date Mon, 09 Nov 2009 14:15:19 -0500
parents 972303bef0bf
children 553bf0861fb5
files pylearn/datasets/miniblocks.py pylearn/datasets/norb_small.py
diffstat 2 files changed, 62 insertions(+), 3 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/datasets/miniblocks.py	Mon Nov 09 14:15:19 2009 -0500
@@ -0,0 +1,55 @@
+# Interface to miniblocks dataset.
+
+import herding, numpy
+import herding.dataset
+
+from pylearn.datasets import Dataset
+
def miniblocks(reweight=None):
    """Return the miniblocks dataset wrapped in a pylearn ``Dataset``.

    :param reweight: if not None, an integer N; each sample is duplicated
        k times, with k drawn uniformly in {1, 2, ..., N}.  Duplication
        counts are then randomly decremented (never below 1) until the
        reweighted dataset size is a multiple of the original size.
    :return: ``Dataset`` whose ``train`` and ``test`` splits are identical,
        with ``x`` the input matrix and ``y`` a dummy all-zero target column.
    """
    # Load the whole dataset in one batch (batchsize=-1, forever=False).
    data = herding.dataset.Miniblocks(4, batchsize=-1, forever=False,
                                      zeroone=True)

    # next(iter(...)) works under both Python 2.6+ and Python 3, unlike the
    # Python-2-only .next() method.  The target yielded by the iterator is a
    # dummy, so it is discarded and replaced with zeros below.
    inputs, _ = next(iter(data))
    targets = numpy.zeros((len(inputs), 1))

    if reweight is not None:
        assert isinstance(reweight, int)
        # Fixed seed so the reweighted dataset is reproducible across runs.
        rgen = numpy.random.RandomState(1827)
        # One duplication count per sample, uniform in {1, ..., reweight}.
        weights = rgen.randint(1, reweight + 1, size=len(inputs))
        # Randomly prune duplicates (never dropping a sample entirely) until
        # the total size is a multiple of the original dataset size.
        while numpy.sum(weights) % len(inputs) > 0:
            to_prune = rgen.randint(len(inputs))
            if weights[to_prune] > 1:
                weights[to_prune] -= 1
        # numpy.repeat duplicates each row weights[i] times in a single
        # vectorized call, replacing the original hand-written copy loop.
        inputs = numpy.repeat(inputs, weights, axis=0)
        targets = numpy.repeat(targets, weights, axis=0)
        print('Dataset size after reweighting: %s' % (inputs.shape, ))

    dset = Dataset()
    dset.train = Dataset.Obj(x=inputs, y=targets)
    dset.test = Dataset.Obj(x=inputs, y=targets)

    return dset
+
--- a/pylearn/datasets/norb_small.py	Tue Nov 03 10:32:51 2009 -0500
+++ b/pylearn/datasets/norb_small.py	Mon Nov 09 14:15:19 2009 -0500
@@ -4,13 +4,14 @@
 from .config import data_root
 from .dataset import Dataset
 
-def load_file(info, normalize=True, downsample_amt=1, dtype='float64'):
+def load_file(info, normalize=True, mode='stereo', downsample_amt=1, dtype='float64'):
     """ Load the smallNorb data into numpy matrices.
 
     normalize_pixels True will divide the values by 255, which makes sense in conjunction
     with dtype=float32 or dtype=float64.
 
     """
+    assert mode in ('stereo','mono')
     # NotImplementedError: subtensor access not written yet
     #subt = [numpy.arange(self.dim[0]), 
             #numpy.arange(0,self.dim[1],downsample_amt),
@@ -23,6 +24,8 @@
         dat = numpy.asarray(dat, dtype=dtype)
     if normalize:
         dat  *= (1.0 / 255.0)
+    if mode == 'mono':
+        dat = dat[:,0,:,:]
 
     labels  = read(open(info['cat']))
 
@@ -77,6 +80,7 @@
         self.n_classes = 5
         self.nsamples = 24300
         self.img_shape = (2,96,96) if mode=='stereo' else (96,96)
+        self.mode = mode
 
         self.ntrain = ntrain
         self.nvalid = nvalid
@@ -132,7 +136,7 @@
 
                 print 'need to reload from train file'
                 dat, cat  = load_file(self.path.train, self.normalize,
-                                      self.downsample_amt, self.dtype)
+                                      self.mode, self.downsample_amt, self.dtype)
 
                 x = dat[self.itr,...].reshape(self.ntrain,-1)
                 y = cat[self.itr]
@@ -155,7 +159,7 @@
 
                 print 'need to reload from test file'
                 dat, cat = load_file(self.path.test, self.normalize,
-                                     self.downsample_amt, self.dtype)
+                                     self.mode, self.downsample_amt, self.dtype)
 
                 x = dat.reshape(self.nsamples,-1)
                 y = cat