changeset 854:62df19a86359
* added option for loading norb_small in mono instead of stereo (old mod)
* added miniblocks.py dataset
author:   desjagui@opale.iro.umontreal.ca
date:     Mon, 09 Nov 2009 14:15:19 -0500
parents:  972303bef0bf
children: 553bf0861fb5
files:    pylearn/datasets/miniblocks.py pylearn/datasets/norb_small.py
diffstat: 2 files changed, 62 insertions(+), 3 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/datasets/miniblocks.py	Mon Nov 09 14:15:19 2009 -0500
@@ -0,0 +1,55 @@
+# Interface to miniblocks dataset.
+
+import herding, numpy
+import herding.dataset
+
+from pylearn.datasets import Dataset
+
+def miniblocks(reweight=None):
+    # If 'reweight' is not None, then it is an integer N such that each
+    # sample is duplicated k times, with k taken uniformly in {1, 2, ..., N}.
+    # Some adjustment is made to ensure the dataset size is a multiple of its
+    # original size.
+    data = herding.dataset.Miniblocks(4, batchsize = -1, forever = False,
+            zeroone = True)
+
+    input, target = iter(data).next()
+
+    #from plearn.pyext import pl
+    #data = pl.AutoVMatrix(filename='/u/delallea/LisaPLearn/UserExp/delallea/perso/gen_compare/1DBall_12.amat').getMat()
+    #data = pl.AutoVMatrix(filename='/home/fringant2/lisa/delallea/python_modules/LeDeepNet/mnist_binarized.pmat').getMat()
+    #input = data
+
+    # Note that the target being returned seems to be a dummy target. So
+    # instead, we fill it with zeros.
+    target = numpy.zeros((len(input), 1))
+
+    if reweight is not None:
+        assert isinstance(reweight, int)
+        rgen = numpy.random.RandomState(1827)
+        weights = rgen.randint(1, reweight + 1, size = len(input))
+        new_length = numpy.sum(weights)
+        while new_length % len(input) > 0:
+            to_prune = rgen.randint(len(input))
+            if weights[to_prune] > 1:
+                weights[to_prune] -= 1
+                new_length -= 1
+        assert new_length == numpy.sum(weights)
+        new_input = numpy.zeros((new_length, input.shape[1]))
+        new_target = numpy.zeros((new_length, target.shape[1]))
+        idx = 0
+        for w, i, t in zip(weights, input, target):
+            for k in range(w):
+                new_input[idx, :] = i
+                new_target[idx, :] = t
+                idx += 1
+        input = new_input
+        target = new_target
+        print 'Dataset size after reweighting: %s' % (input.shape, )
+
+    set = Dataset()
+    set.train = Dataset.Obj(x = input, y = target)
+    set.test = Dataset.Obj(x = input, y = target)
+
+    return set
+
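For reference, a minimal usage sketch of the new loader (assuming the internal herding package is importable; reweight=4 is an illustrative value, and the printed shapes depend on what the Miniblocks generator yields):

    from pylearn.datasets.miniblocks import miniblocks

    # Plain dataset: each sample appears exactly once; train and test share
    # the same matrices.
    data = miniblocks()
    print data.train.x.shape, data.train.y.shape

    # Reweighted dataset: each sample is duplicated k times, with k drawn
    # uniformly from {1, ..., 4}, then weights are pruned until the total
    # row count is again a multiple of the original dataset size.
    data = miniblocks(reweight=4)
    print data.train.x.shape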
--- a/pylearn/datasets/norb_small.py	Tue Nov 03 10:32:51 2009 -0500
+++ b/pylearn/datasets/norb_small.py	Mon Nov 09 14:15:19 2009 -0500
@@ -4,13 +4,14 @@
 from .config import data_root
 from .dataset import Dataset
 
-def load_file(info, normalize=True, downsample_amt=1, dtype='float64'):
+def load_file(info, normalize=True, mode='stereo', downsample_amt=1, dtype='float64'):
     """ Load the smallNorb data into numpy matrices.
 
     normalize_pixels True will divide the values by 255, which makes sense in
     conjunction with dtype=float32 or dtype=float64.
 
     """
+    assert mode in ('stereo','mono')
     # NotImplementedError: subtensor access not written yet
     #subt = [numpy.arange(self.dim[0]),
     #numpy.arange(0,self.dim[1],downsample_amt),
@@ -23,6 +24,8 @@
     dat = numpy.asarray(dat, dtype=dtype)
     if normalize:
         dat *= (1.0 / 255.0)
+    if mode == 'mono':
+        dat = dat[:,0,:,:]
 
     labels = read(open(info['cat']))
 
@@ -77,6 +80,7 @@
         self.n_classes = 5
         self.nsamples = 24300
         self.img_shape = (2,96,96) if mode=='stereo' else (96,96)
+        self.mode = mode
 
         self.ntrain = ntrain
         self.nvalid = nvalid
@@ -132,7 +136,7 @@
 
                 print 'need to reload from train file'
                 dat, cat = load_file(self.path.train, self.normalize,
-                        self.downsample_amt, self.dtype)
+                        self.mode, self.downsample_amt, self.dtype)
                 x = dat[self.itr,...].reshape(self.ntrain,-1)
                 y = cat[self.itr]
 
@@ -155,7 +159,7 @@
 
                 print 'need to reload from test file'
                 dat, cat = load_file(self.path.test, self.normalize,
-                        self.downsample_amt, self.dtype)
+                        self.mode, self.downsample_amt, self.dtype)
                 x = dat.reshape(self.nsamples,-1)
                 y = cat
 
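For reference, a sketch of what the new mode argument does to load_file's output (the info dict below is hypothetical; in the dataset class it comes from self.path.train / self.path.test, and only the 'cat' key is visible in this diff):

    from pylearn.datasets.norb_small import load_file

    # Hypothetical file locations; the 'dat' key is assumed from context.
    info = {'dat': '/path/to/smallnorb-train-dat.mat',
            'cat': '/path/to/smallnorb-train-cat.mat'}

    dat, labels = load_file(info, normalize=True, mode='stereo')
    # dat.shape -> (nsamples, 2, 96, 96): both camera views

    dat, labels = load_file(info, normalize=True, mode='mono')
    # dat.shape -> (nsamples, 96, 96): first camera only, via dat[:,0,:,:]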