changeset 869:6298876b2b01

author James Bergstra <>
date Mon, 09 Nov 2009 14:55:08 -0500
parents 553bf0861fb5 (diff) 22e3a422653d (current diff)
children 2fffbfa41920
diffstat 7 files changed, 220 insertions(+), 8 deletions(-) [+]
line wrap: on
line diff
--- a/pylearn/algorithms/	Tue Nov 03 15:26:25 2009 -0500
+++ b/pylearn/algorithms/	Mon Nov 09 14:55:08 2009 -0500
@@ -16,7 +16,7 @@
         if input:
           self.input = input
- = target
+          self.input = T.dmatrix('input')
         if target:
  = target
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/datasets/	Mon Nov 09 14:55:08 2009 -0500
@@ -0,0 +1,61 @@
+"""Various routines to load/access CIFAR-10 data."""
+from __future__ import absolute_import
+import os
+import numpy
+import cPickle
+from pylearn.datasets.config import data_root # config
+from pylearn.datasets.dataset import Dataset
+def unpickle(file):
+    """Load one pickled CIFAR-10 batch file and return its content.
+
+    :param file: name of the batch file, looked up inside
+        ``<data_root()>/cifar10/cifar-10-batches-py``.
+    :returns: the object stored in the pickle (callers in this module index
+        it with the 'data' and 'labels' keys).
+    """
+    path = os.path.join(data_root(), 'cifar10', 'cifar-10-batches-py')
+    fname = os.path.join(path, file)
+    print 'loading file %s' % fname
+    # NOTE(review): unpickling can execute arbitrary code; data_root() must
+    # only point at trusted copies of the dataset.
+    fo = open(fname, 'rb')
+    try:
+        # try/finally so the handle is released even when loading fails
+        batch = cPickle.load(fo)
+    finally:
+        fo.close()
+    return batch
+class cifar10():
+    """In-memory train/valid/test split of the CIFAR-10 python batches.
+
+    Attributes set by the constructor:
+      img_shape -- (3, 32, 32)
+      img_size  -- number of values in one flattened image
+      n_classes -- 10
+      train, valid, test -- Dataset.Obj instances with `x` and `y` members
+    """
+    def __init__(self, dtype='uint8', ntrain=40000, nvalid=10000, ntest=10000):
+        """Load the requested number of examples from the batch files.
+
+        :param dtype: numpy dtype used for both the images and the labels.
+        :param ntrain: number of training examples (taken first).
+        :param nvalid: number of validation examples (taken right after the
+            training examples); ntrain + nvalid must not exceed 50000.
+        :param ntest: number of test examples, at most 10000.
+        """
+        assert ntrain + nvalid <= 50000
+        assert ntest <= 10000
+        self.img_shape = (3,32,32)
+        self.img_size = int(numpy.prod(self.img_shape))
+        self.n_classes = 10
+
+        # The loader assumes every training batch file holds 10000 rows.
+        # Only as many files as needed to cover ntrain + nvalid are read, and
+        # the buffers are sized to exactly that many whole batches, so a
+        # batch is never written past the end of the buffer.
+        batch_len = 10000
+        nbatches = int(numpy.ceil((ntrain + nvalid) / float(batch_len)))
+        lenx = nbatches * batch_len
+        x = numpy.zeros((lenx, self.img_size), dtype=dtype)
+        y = numpy.zeros(lenx, dtype=dtype)
+
+        # load train and validation data
+        fnames = ['data_batch_%i'%i for i in range(1, nbatches + 1)]
+        for i, fname in enumerate(fnames):
+            data = unpickle(fname)
+            x[i*batch_len:(i+1)*batch_len, :] = data['data']
+            y[i*batch_len:(i+1)*batch_len] = data['labels']
+
+        self.train = Dataset.Obj(x=x[0:ntrain], y=y[0:ntrain])
+        self.valid = Dataset.Obj(x=x[ntrain:ntrain+nvalid],
+                                 y=y[ntrain:ntrain+nvalid])
+
+        # load test data, converted to the same dtype as train/valid
+        data = unpickle('test_batch')
+        self.test = Dataset.Obj(
+                x=numpy.asarray(data['data'][0:ntest], dtype=dtype),
+                y=numpy.asarray(data['labels'][0:ntest], dtype=dtype))
+
+    def preprocess(self, x):
+        """Return `x` scaled by 1/255 as float64."""
+        return numpy.float64( x *1.0 / 255.0)
+def first_1k(dtype='uint8', ntrain=1000, nvalid=200, ntest=200):
+    """Return a small `cifar10` instance (by default 1000/200/200 examples)."""
+    small = cifar10(dtype, ntrain, nvalid, ntest)
+    return small
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/datasets/	Mon Nov 09 14:55:08 2009 -0500
@@ -0,0 +1,45 @@
+"""Various routines to load/access faces datasets."""
+from __future__ import absolute_import
+import os
+import numpy
+import pylab as pl
+from .config import data_root # config
+from .dataset import Dataset
+def att(path=None, randomize=True, normalize=True):
+    """Load the face images found under `path` into a Dataset.
+
+    The loader expects 40 sub-directories named s1..s40, each holding the
+    images 1.pgm..10.pgm of size 112x92.
+
+    :param path: directory holding the subject folders; defaults to
+        ``<data_root()>/faces/att/orl_faces``.
+    :param randomize: when True the examples are shuffled with
+        numpy.random.permutation; when False they keep their on-disk order
+        (subject by subject).
+    :param normalize: when True pixel values are multiplied by 1/255.
+    :returns: a Dataset with `n_classes`, `img_shape` and a `train` member
+        holding every image (one flattened image per row of x, subject index
+        in y); `valid` and `test` are None.
+    """
+    path = os.path.join(data_root(), 'faces','att','orl_faces')\
+           if path is None else path
+
+    h, w = 112, 92
+    nsubjects = 40
+    npics = 10
+
+    x = numpy.zeros((nsubjects * npics, h * w))
+    y = numpy.zeros(nsubjects * npics)
+
+    for sid in range(nsubjects):
+        sdir = os.path.join(path, 's%i'%(sid+1))
+        for n in range(npics):
+            img = pl.imread(os.path.join(sdir,'%i.pgm'%(n+1)))
+            # rows are stored in reverse order before flattening
+            # NOTE(review): presumably undoes a vertical flip from imread - confirm
+            x[sid*npics + n,:] = img[::-1,:].flatten()
+            y[sid*npics + n] = sid
+
+    if normalize:
+        x *= (1.0 / 255.0)
+
+    # Honour the `randomize` flag (the data used to be shuffled regardless).
+    if randomize:
+        perm = numpy.random.permutation(len(x))
+        x, y = x[perm,:], y[perm]
+
+    rval = Dataset()
+    rval.n_classes = nsubjects
+    rval.img_shape = (h, w)
+    rval.train = Dataset.Obj(x=x, y=y)
+
+    # Not sure how well dataset lends itself to classification (only 400 images!)
+    # therefore not too sure it makes sense to have a train/test split
+    rval.valid = None
+    rval.test = None
+
+    return rval
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/datasets/	Mon Nov 09 14:55:08 2009 -0500
@@ -0,0 +1,42 @@
+"""Routines to load/access Olshausen's image_patches."""
+import os
+import numpy
+from .config import data_root
+from .dataset import Dataset
+# Directory holding the patch files, resolved under data_root() at import time.
+dirpath = os.path.join(data_root(), 'image_patches','olshausen','smaller_patches')
+# Maps a variant name to (file name inside dirpath, shape of one patch).
+paths = {'20by20_whiten_01': ('natural_images_patches_whiten.amat',(20,20)),
+         '12by12_whiten_01': ('natural_images_patches_whiten_12_by_12_0_1.amat',(12,12))}
+def load_dataset(ntrain=70000, nvalid=15000, ntest=15000, variant='20by20_whiten_01'):
+    """Load one variant of the image patches and split it at random.
+
+    :param ntrain: number of training examples.
+    :param nvalid: number of validation examples.
+    :param ntest: number of test examples.
+    :param variant: key of the module-level `paths` table.
+    :returns: a Dataset with disjoint `train`, `valid` and `test` members
+        (each a Dataset.Obj with `x` and `y`), plus `n_classes` and
+        `img_shape`.
+    :raises ValueError: if `variant` is unknown or more than 100000 examples
+        are requested in total.
+    """
+    ndata = 100000
+
+    if variant not in paths:
+        raise ValueError('Unknown image_patches variant: %s' % variant)
+
+    # The three splits are carved out of one permutation of ndata indices,
+    # so together they cannot ask for more than ndata examples.
+    if ntrain + nvalid + ntest > ndata:
+        raise ValueError('ntrain + nvalid + ntest must not exceed %i' %ndata)
+
+    fname = os.path.join(dirpath, paths[variant][0])
+    data = numpy.loadtxt(fname)
+    # last column is used as the target, all other columns as the input
+    x = data[:,:-1]
+    y = data[:,-1]
+
+    perm = numpy.random.permutation(ndata)
+    train_idx = perm[:ntrain]
+    valid_idx = perm[ntrain:ntrain+nvalid]
+    # take the test indices right after the validation ones so that the test
+    # split never shares an example with train or valid
+    test_idx = perm[ntrain+nvalid:ntrain+nvalid+ntest]
+
+    rval = Dataset()
+    rval.train = Dataset.Obj(x = x[train_idx,:], y = y[train_idx])
+    rval.valid = Dataset.Obj(x = x[valid_idx,:], y = y[valid_idx])
+    rval.test  = Dataset.Obj(x = x[test_idx,:], y = y[test_idx])
+
+    # NOTE(review): n_classes is hard-coded to 10 - confirm it is meaningful
+    # for this dataset.
+    rval.n_classes = 10
+    rval.img_shape = paths[variant][1]
+
+    return rval
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/datasets/	Mon Nov 09 14:55:08 2009 -0500
@@ -0,0 +1,55 @@
+# Interface to miniblocks dataset.
+import herding, numpy
+import herding.dataset
+from pylearn.datasets import Dataset
+def miniblocks(reweight=None):
+    """Build a Dataset from the herding Miniblocks data.
+
+    If `reweight` is not None, it must be an integer N: each sample is then
+    duplicated k times, with k taken uniformly in {1, 2, ..., N} (drawn from
+    a generator with a fixed seed).  Duplicates are then pruned one at a
+    time until the dataset size is a multiple of its original size.
+
+    Returns a Dataset whose `train` and `test` members hold the same
+    inputs and all-zero targets.
+    """
+    source = herding.dataset.Miniblocks(4, batchsize = -1, forever = False,
+            zeroone = True)
+    inputs, _dummy_target = iter(source).next()
+
+    # Note that the target being returned seems to be a dummy target. So
+    # instead, we fill it with zeros.
+    targets = numpy.zeros((len(inputs), 1))
+
+    if reweight is not None:
+        assert isinstance(reweight, int)
+        rgen = numpy.random.RandomState(1827)
+        counts = rgen.randint(1, reweight + 1, size = len(inputs))
+        total = numpy.sum(counts)
+        # Remove one copy of a randomly picked sample (never its last copy)
+        # until the total is a multiple of the original size.
+        while total % len(inputs) > 0:
+            victim = rgen.randint(len(inputs))
+            if counts[victim] > 1:
+                counts[victim] -= 1
+                total -= 1
+        assert total == numpy.sum(counts)
+
+        # Row i of the original arrays is written counts[i] times in a row.
+        expanded_inputs = numpy.zeros((total, inputs.shape[1]))
+        expanded_targets = numpy.zeros((total, targets.shape[1]))
+        expanded_inputs[:] = numpy.repeat(inputs, counts, axis=0)
+        expanded_targets[:] = numpy.repeat(targets, counts, axis=0)
+        inputs = expanded_inputs
+        targets = expanded_targets
+        print 'Dataset size after reweighting: %s' % (inputs.shape, )
+
+    result = Dataset()
+    result.train = Dataset.Obj(x = inputs, y = targets)
+    result.test = Dataset.Obj(x = inputs, y = targets)
+    return result
--- a/pylearn/datasets/	Tue Nov 03 15:26:25 2009 -0500
+++ b/pylearn/datasets/	Mon Nov 09 14:55:08 2009 -0500
@@ -4,13 +4,14 @@
 from .config import data_root
 from .dataset import Dataset
-def load_file(info, normalize=True, downsample_amt=1, dtype='float64'):
+def load_file(info, normalize=True, mode='stereo', downsample_amt=1, dtype='float64'):
     """ Load the smallNorb data into numpy matrices.
     normalize_pixels True will divide the values by 255, which makes sense in conjunction
     with dtype=float32 or dtype=float64.
+    assert mode in ('stereo','mono')
     # NotImplementedError: subtensor access not written yet
     #subt = [numpy.arange(self.dim[0]), 
@@ -23,6 +24,8 @@
         dat = numpy.asarray(dat, dtype=dtype)
     if normalize:
         dat  *= (1.0 / 255.0)
+    if mode == 'mono':
+        dat = dat[:,0,:,:]
     labels  = read(open(info['cat']))
@@ -77,6 +80,7 @@
         self.n_classes = 5
         self.nsamples = 24300
         self.img_shape = (2,96,96) if mode=='stereo' else (96,96)
+        self.mode = mode
         self.ntrain = ntrain
         self.nvalid = nvalid
@@ -132,7 +136,7 @@
                 print 'need to reload from train file'
                 dat, cat  = load_file(self.path.train, self.normalize,
-                                      self.downsample_amt, self.dtype)
+                                      self.mode, self.downsample_amt, self.dtype)
                 x = dat[self.itr,...].reshape(self.ntrain,-1)
                 y = cat[self.itr]
@@ -155,7 +159,7 @@
                 print 'need to reload from test file'
                 dat, cat = load_file(self.path.test, self.normalize,
-                                     self.downsample_amt, self.dtype)
+                                     self.mode, self.downsample_amt, self.dtype)
                 x = dat.reshape(self.nsamples,-1)
                 y = cat
--- a/pylearn/sandbox/	Tue Nov 03 15:26:25 2009 -0500
+++ b/pylearn/sandbox/	Mon Nov 09 14:55:08 2009 -0500
@@ -691,7 +691,7 @@
     def c_headers(self):
-        return ['"Python.h"', '"numpy/noprefix.h"', '<math.h>']
+        return ['"Python.h"', '"numpy/noprefix.h"', '<math.h>', '<sstream>']
     def c_support_code(self):
         return """                      
@@ -776,7 +776,12 @@
+if(input->nd!=value->nd || input->nd!=mask->nd){
+   PyErr_Format(PyExc_ValueError,
+   "FillMissing input have %%d dims, the mask have %%d dims and the value have %%d dims. They should all be equals \\n",
+   input->nd, value->nd, mask->nd);
+   %(fail)s;
 #if %(self.fill_with_is_array)s
     %(type)s* value_  = (%(type)s*)(value->data);
@@ -809,7 +814,7 @@
   }else{//not implemented!
 //SHOULD not happen as c_code should revert to the python  version in that case
-    std:stringstream temp;
+    std::stringstream temp;
     temp << "In FillMissing, we try to fill with an array and the input ndim is implemented only for 1 and 2. This case is not implemented."<<endl;
     temp << " ndim="<<input->nd<<endl;;
     std::string param = temp.str();
@@ -866,7 +871,7 @@
   }else{//not implemented!
 //SHOULD not happen as c_code should revert to the python  version in that case
-    std:stringstream temp;
+    std::stringstream temp;
     temp << "In FillMissing, we try to fill with a constant and the input ndim is implemented only for 1, 2 and 3.";
     temp << " ndim="<<input->nd<<endl;;
     std::string param = temp.str();