changeset 859:3a68b6936303

merge
author Pascal Lamblin <lamblinp@iro.umontreal.ca>
date Thu, 12 Nov 2009 12:24:36 -0500
parents f11881a265ee (current diff) bd7d540db70d (diff)
children 5c7374bd127c
files
diffstat 8 files changed, 226 insertions(+), 14 deletions(-)
--- a/pylearn/algorithms/logistic_regression.py	Thu Nov 12 12:24:05 2009 -0500
+++ b/pylearn/algorithms/logistic_regression.py	Thu Nov 12 12:24:36 2009 -0500
@@ -101,7 +101,7 @@
                     nnet.crossentropy_softmax_max_and_argmax_1hot(
                     self.linear_output, self.target)
 
-        self.unregularized_cost = T.sum(self._xent)
+        self.unregularized_cost = T.mean(self._xent)
         self.l1_cost = self.l1 * T.sum(abs(self.w))
         self.l2_cost = self.l2 * T.sum(self.w**2)
         self.regularized_cost = self.unregularized_cost + self.l1_cost + self.l2_cost
@@ -245,12 +245,12 @@
 
         output = nnet.sigmoid(T.dot(self.x, self.w) + self.b)
         xent = -self.targ * T.log(output) - (1.0 - self.targ) * T.log(1.0 - output)
-        sum_xent = T.sum(xent)
+        mean_xent = T.mean(xent)
 
         self.output = output
         self.xent = xent
-        self.sum_xent = sum_xent
-        self.cost = sum_xent
+        self.mean_xent = mean_xent
+        self.cost = mean_xent
 
         #define the apply method
         self.pred = (T.dot(self.input, self.w) + self.b) > 0.0
@@ -258,8 +258,8 @@
 
         #if this module has any internal parameters, define an update function for them
         if self.params:
-            gparams = T.grad(sum_xent, self.params)
-            self.update = module.Method([self.input, self.targ], sum_xent,
+            gparams = T.grad(mean_xent, self.params)
+            self.update = module.Method([self.input, self.targ], mean_xent,
                                         updates = dict((p, p - self.lr * g) for p, g in zip(self.params, gparams)))
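
Switching the cost from a sum to a mean over the batch (T.sum -> T.mean above) makes the cost, and therefore the update `p - self.lr * g`, independent of how many examples are in the batch, so the learning rate does not need retuning when the batch size changes. A minimal numpy sketch of the effect (illustrative only, not part of the changeset):

    import numpy

    rng = numpy.random.RandomState(0)
    w = numpy.zeros(5)
    for n in (10, 1000):
        x = rng.rand(n, 5)
        t = numpy.zeros(n)                              # dummy all-zero targets
        p = 1.0 / (1.0 + numpy.exp(-numpy.dot(x, w)))   # sigmoid outputs
        g_sum = numpy.dot(x.T, p - t)    # gradient of summed xent: grows with n
        g_mean = g_sum / n               # gradient of mean xent: stable scale
        print n, abs(g_sum).mean(), abs(g_mean).mean()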
 
 
--- a/pylearn/algorithms/regressor.py	Thu Nov 12 12:24:05 2009 -0500
+++ b/pylearn/algorithms/regressor.py	Thu Nov 12 12:24:36 2009 -0500
@@ -16,7 +16,7 @@
         if input:
           self.input = input
         else:
-          self.target = target
+          self.input = T.dmatrix('input')
 
         if target:
           self.target = target
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/datasets/cifar10.py	Thu Nov 12 12:24:36 2009 -0500
@@ -0,0 +1,61 @@
+"""
+Various routines to load/access the CIFAR-10 dataset.
+"""
+from __future__ import absolute_import
+
+import os
+import numpy
+import cPickle
+
+from pylearn.datasets.config import data_root # config
+from pylearn.datasets.dataset import Dataset
+
+def unpickle(filename):
+    path = os.path.join(data_root(), 'cifar10', 'cifar-10-batches-py')
+    fname = os.path.join(path, filename)
+    print 'loading file %s' % fname
+    fo = open(fname, 'rb')
+    data = cPickle.load(fo)
+    fo.close()
+    return data
+
+class cifar10(object):
+
+    def __init__(self, dtype='uint8', ntrain=40000, nvalid=10000, ntest=10000):
+        assert ntrain + nvalid <= 50000
+        assert ntest <= 10000
+
+        self.img_shape = (3,32,32)
+        self.img_size = numpy.prod(self.img_shape)
+        self.n_classes = 10
+
+        lenx = int(numpy.ceil((ntrain + nvalid) / 10000.) * 10000)
+        x = numpy.zeros((lenx,self.img_size), dtype=dtype)
+        y = numpy.zeros(lenx, dtype=dtype)
+        
+        fnames = ['data_batch_%i'%i for i in range(1,6)]
+
+        # load train and validation data
+        nloaded = 0
+        for i, fname in enumerate(fnames):
+            data = unpickle(fname)
+            x[i*10000:(i+1)*10000, :] = data['data']
+            y[i*10000:(i+1)*10000] = data['labels']
+
+            nloaded += 10000
+            if nloaded >= ntrain + nvalid: break
+        
+        self.train = Dataset.Obj(x=x[0:ntrain], y=y[0:ntrain])
+        self.valid = Dataset.Obj(x=x[ntrain:ntrain+nvalid],
+                                 y=y[ntrain:ntrain+nvalid])
+       
+        # load test data
+        data = unpickle('test_batch')
+        self.test = Dataset.Obj(x=data['data'][0:ntest],
+                                y=data['labels'][0:ntest])
+
+    def preprocess(self, x):
+        return numpy.float64(x * 1.0 / 255.0)
+
+def first_1k(dtype='uint8', ntrain=1000, nvalid=200, ntest=200):
+    return cifar10(dtype, ntrain, nvalid, ntest)
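
A usage sketch for the new loader, assuming the pickled CIFAR-10 batches are installed under data_root()/cifar10/cifar-10-batches-py:

    from pylearn.datasets import cifar10

    data = cifar10.cifar10(ntrain=40000, nvalid=10000, ntest=10000)
    print data.train.x.shape              # (40000, 3072): flattened 3x32x32 images
    print data.n_classes                  # 10
    x01 = data.preprocess(data.train.x)   # uint8 in [0, 255] -> float64 in [0, 1]
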
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/datasets/faces.py	Thu Nov 12 12:24:36 2009 -0500
@@ -0,0 +1,45 @@
+"""
+Various routines to load/access faces datasets.
+"""
+from __future__ import absolute_import
+
+import os
+import numpy
+import pylab as pl
+from .config import data_root # config
+from .dataset import Dataset
+
+def att(path=None, randomize=True, normalize=True):
+    path = os.path.join(data_root(), 'faces','att','orl_faces')\
+           if path is None else path
+    
+    h, w = 112, 92
+    nsubjects = 40
+    npics = 10
+
+    x = numpy.zeros((nsubjects * npics, h * w))
+    y = numpy.zeros(nsubjects * npics)
+
+    for sid in range(nsubjects):
+        sdir = os.path.join(path, 's%i'%(sid+1))
+        for n in range(npics):
+            img = pl.imread(os.path.join(sdir,'%i.pgm'%(n+1)))
+            x[sid*npics + n,:] = img[::-1,:].flatten()
+            y[sid*npics + n] = sid
+
+    if normalize:
+        x *= (1.0 / 255.0)
+
+    perm = numpy.random.permutation(len(x)) if randomize else numpy.arange(len(x))
+
+    rval = Dataset()
+    rval.n_classes = nsubjects
+    rval.img_shape = (112,92)
+    rval.train = Dataset.Obj(x=x[perm,:], y=y[perm])
+
+    # Not sure how well this dataset lends itself to classification (only 400
+    # images!), so it is not clear that a train/test split makes sense.
+    rval.valid = None
+    rval.test = None
+
+    return rval
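
A usage sketch, assuming the ORL face images are installed under data_root()/faces/att/orl_faces (directories s1..s40, each holding 1.pgm..10.pgm):

    from pylearn.datasets import faces

    data = faces.att()
    print data.train.x.shape   # (400, 10304): 40 subjects x 10 images of 112x92
    print data.n_classes       # 40
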
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/datasets/image_patches.py	Thu Nov 12 12:24:36 2009 -0500
@@ -0,0 +1,42 @@
+"""
+Routines to load/access Olshausen's image_patches
+"""
+
+import os
+import numpy
+
+from .config import data_root
+from .dataset import Dataset
+
+dirpath = os.path.join(data_root(), 'image_patches','olshausen','smaller_patches')
+
+paths = {'20by20_whiten_01': ('natural_images_patches_whiten.amat',(20,20)),
+         '12by12_whiten_01': ('natural_images_patches_whiten_12_by_12_0_1.amat',(12,12))}
+
+def load_dataset(ntrain=70000, nvalid=15000, ntest=15000, variant='20by20_whiten_01'):
+    
+    ndata = 100000
+
+    if variant not in paths:
+        raise ValueError('Unknown image_patches variant: %s' % variant)
+    if ntrain + nvalid + ntest > ndata:
+        raise ValueError('ntrain + nvalid + ntest must not exceed %i' % ndata)
+
+    fname = os.path.join(dirpath, paths[variant][0])
+    data = numpy.loadtxt(fname)
+    x = data[:,:-1]
+    y = data[:,-1]
+
+    perm = numpy.random.permutation(ndata)
+ 
+    rval = Dataset()
+    rval.train = Dataset.Obj(x = x[perm[:ntrain],:], y = y[perm[:ntrain]])
+    rval.valid = Dataset.Obj(x = x[perm[ntrain:ntrain+nvalid],:],
+                             y = y[perm[ntrain:ntrain+nvalid]])
+    rval.test  = Dataset.Obj(x = x[perm[-ntest:],:],
+                             y = y[perm[-ntest:]])
+
+    rval.n_classes = 10
+    rval.img_shape = paths[variant][1]
+
+    return rval
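
The three splits partition a single permutation of the 100000 rows: the first ntrain indices go to train, the next nvalid to valid, and the last ntest to test. A minimal sketch of the same indexing on toy data:

    import numpy

    ntrain, nvalid, ntest = 6, 2, 2
    perm = numpy.random.permutation(ntrain + nvalid + ntest)
    train = perm[:ntrain]
    valid = perm[ntrain:ntrain + nvalid]
    test = perm[-ntest:]
    # the three index sets are disjoint and cover every row exactly once
    assert len(set(train) | set(valid) | set(test)) == len(perm)
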
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/datasets/miniblocks.py	Thu Nov 12 12:24:36 2009 -0500
@@ -0,0 +1,55 @@
+# Interface to miniblocks dataset.
+
+import herding, numpy
+import herding.dataset
+
+from pylearn.datasets import Dataset
+
+def miniblocks(reweight=None):
+    # If 'reweight' is not None, then it is an integer N such that each
+    # sample is duplicated k times, with k taken uniformly in {1, 2, ..., N}.
+    # Some adjustment is made to ensure the dataset size is a multiple of its
+    # original size.
+    data = herding.dataset.Miniblocks(4, batchsize = -1, forever = False,
+            zeroone = True)
+
+    input, target = iter(data).next()
+
+    #from plearn.pyext import pl
+    #data = pl.AutoVMatrix(filename='/u/delallea/LisaPLearn/UserExp/delallea/perso/gen_compare/1DBall_12.amat').getMat()
+    #data = pl.AutoVMatrix(filename='/home/fringant2/lisa/delallea/python_modules/LeDeepNet/mnist_binarized.pmat').getMat()
+    #input = data
+
+    # Note that the target being returned seems to be a dummy target. So
+    # instead, we fill it with zeros.
+    target = numpy.zeros((len(input), 1))
+
+    if reweight is not None:
+        assert isinstance(reweight, int)
+        rgen = numpy.random.RandomState(1827)
+        weights = rgen.randint(1, reweight + 1, size = len(input))
+        new_length = numpy.sum(weights)
+        while new_length % len(input) > 0:
+            to_prune = rgen.randint(len(input))
+            if weights[to_prune] > 1:
+                weights[to_prune] -= 1
+                new_length -= 1
+        assert new_length == numpy.sum(weights)
+        new_input = numpy.zeros((new_length, input.shape[1]))
+        new_target = numpy.zeros((new_length, target.shape[1]))
+        idx = 0
+        for w, i, t in zip(weights, input, target):
+            for k in range(w):
+                new_input[idx, :] = i
+                new_target[idx, :] = t
+                idx += 1
+        input = new_input
+        target = new_target
+        print 'Dataset size after reweighting: %s' % (input.shape, )
+
+    rval = Dataset()
+    rval.train = Dataset.Obj(x = input, y = target)
+    rval.test = Dataset.Obj(x = input, y = target)
+
+    return rval
+
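
The reweighting logic does not depend on the herding package and can be exercised on its own; a self-contained sketch of the same idea on toy data, using numpy.repeat in place of the explicit copy loop:

    import numpy

    rng = numpy.random.RandomState(1827)
    input = numpy.arange(8).reshape(4, 2)          # 4 toy samples
    weights = rng.randint(1, 4, size=len(input))   # duplicate each sample 1..3 times
    # prune duplicates until the new size is a multiple of the original size
    while weights.sum() % len(input) > 0:
        i = rng.randint(len(input))
        if weights[i] > 1:
            weights[i] -= 1
    new_input = numpy.repeat(input, weights, axis=0)
    assert len(new_input) % len(input) == 0
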
--- a/pylearn/datasets/norb_small.py	Thu Nov 12 12:24:05 2009 -0500
+++ b/pylearn/datasets/norb_small.py	Thu Nov 12 12:24:36 2009 -0500
@@ -4,13 +4,14 @@
 from .config import data_root
 from .dataset import Dataset
 
-def load_file(info, normalize=True, downsample_amt=1, dtype='float64'):
+def load_file(info, normalize=True, mode='stereo', downsample_amt=1, dtype='float64'):
     """ Load the smallNorb data into numpy matrices.
 
     normalize True will divide the values by 255, which makes sense in conjunction
     with dtype=float32 or dtype=float64.
 
     """
+    assert mode in ('stereo','mono')
     # NotImplementedError: subtensor access not written yet
     #subt = [numpy.arange(self.dim[0]), 
             #numpy.arange(0,self.dim[1],downsample_amt),
@@ -23,6 +24,8 @@
         dat = numpy.asarray(dat, dtype=dtype)
     if normalize:
         dat  *= (1.0 / 255.0)
+    if mode == 'mono':
+        dat = dat[:,0,:,:] # keep only the first camera of each stereo pair
 
     labels  = read(open(info['cat']))
 
@@ -77,6 +80,7 @@
         self.n_classes = 5
         self.nsamples = 24300
         self.img_shape = (2,96,96) if mode=='stereo' else (96,96)
+        self.mode = mode
 
         self.ntrain = ntrain
         self.nvalid = nvalid
@@ -132,7 +136,7 @@
 
                 print 'need to reload from train file'
                 dat, cat  = load_file(self.path.train, self.normalize,
-                                      self.downsample_amt, self.dtype)
+                                      self.mode, self.downsample_amt, self.dtype)
 
                 x = dat[self.itr,...].reshape(self.ntrain,-1)
                 y = cat[self.itr]
@@ -155,7 +159,7 @@
 
                 print 'need to reload from test file'
                 dat, cat = load_file(self.path.test, self.normalize,
-                                     self.downsample_amt, self.dtype)
+                                     self.mode, self.downsample_amt, self.dtype)
 
                 x = dat.reshape(self.nsamples,-1)
                 y = cat
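
In 'mono' mode the loader keeps only the first camera of each stereo pair, turning the (n, 2, 96, 96) array into (n, 96, 96) before the reshape to a design matrix. A toy illustration of the slicing (shapes only, no NORB files needed):

    import numpy

    dat = numpy.zeros((5, 2, 96, 96))   # 5 stereo pairs
    mono = dat[:, 0, :, :]              # first camera only
    print mono.shape                    # (5, 96, 96)
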
--- a/pylearn/sandbox/scan_inputs_groups.py	Thu Nov 12 12:24:05 2009 -0500
+++ b/pylearn/sandbox/scan_inputs_groups.py	Thu Nov 12 12:24:36 2009 -0500
@@ -691,7 +691,7 @@
                 ]
 
     def c_headers(self):
-        return ['"Python.h"', '"numpy/noprefix.h"', '<math.h>']
+        return ['"Python.h"', '"numpy/noprefix.h"', '<math.h>', '<sstream>']
 
     def c_support_code(self):
         return """                      
@@ -776,7 +776,12 @@
   }
 }
 
-assert(input->nd==value->nd==mask->nd);
+if(input->nd!=value->nd || input->nd!=mask->nd){
+   PyErr_Format(PyExc_ValueError,
+   "FillMissing input have %%d dims, the mask have %%d dims and the value have %%d dims. They should all be equals \\n",
+   input->nd, value->nd, mask->nd);
+   %(fail)s;
+}
 #if %(self.fill_with_is_array)s
   if(input->nd==1){
     %(type)s* value_  = (%(type)s*)(value->data);
@@ -809,7 +814,7 @@
     }
   }else{//not implemented!
 //SHOULD not happen as c_code should revert to the python  version in that case
-    std:stringstream temp;
+    std::stringstream temp;
     temp << "In FillMissing, we try to fill with an array and the input ndim is implemented only for 1 and 2. This case is not implemented."<<endl;
     temp << " ndim="<<input->nd<<endl;;
     std::string param = temp.str();
@@ -866,7 +871,7 @@
     }
   }else{//not implemented!
 //SHOULD not happen as c_code should revert to the python  version in that case
-    std:stringstream temp;
+    std::stringstream temp;
     temp << "In FillMissing, we try to fill with a constant and the input ndim is implemented only for 1, 2 and 3.";
     temp << " ndim="<<input->nd<<endl;;
     std::string param = temp.str();