# HG changeset patch
# User Pascal Lamblin
# Date 1258046676 18000
# Node ID 3a68b6936303f994854792efe2d85a7631b42998
# Parent f11881a265ee22afb7ba9db62548e86aeea5d63b
# Parent bd7d540db70d990a03b1961875c5a6baa0ccd226
merge

diff -r f11881a265ee -r 3a68b6936303 pylearn/algorithms/logistic_regression.py
--- a/pylearn/algorithms/logistic_regression.py	Thu Nov 12 12:24:05 2009 -0500
+++ b/pylearn/algorithms/logistic_regression.py	Thu Nov 12 12:24:36 2009 -0500
@@ -101,7 +101,7 @@
                 nnet.crossentropy_softmax_max_and_argmax_1hot(
                     self.linear_output, self.target)
 
-        self.unregularized_cost = T.sum(self._xent)
+        self.unregularized_cost = T.mean(self._xent)
         self.l1_cost = self.l1 * T.sum(abs(self.w))
         self.l2_cost = self.l2 * T.sum(self.w**2)
         self.regularized_cost = self.unregularized_cost + self.l1_cost + self.l2_cost
@@ -245,12 +245,12 @@
 
         output = nnet.sigmoid(T.dot(self.x, self.w) + self.b)
         xent = -self.targ * T.log(output) - (1.0 - self.targ) * T.log(1.0 - output)
-        sum_xent = T.sum(xent)
+        mean_xent = T.mean(xent)
 
         self.output = output
         self.xent = xent
-        self.sum_xent = sum_xent
-        self.cost = sum_xent
+        self.mean_xent = mean_xent
+        self.cost = mean_xent
 
         #define the apply method
         self.pred = (T.dot(self.input, self.w) + self.b) > 0.0
@@ -258,8 +258,8 @@
 
         #if this module has any internal parameters, define an update function for them
         if self.params:
-            gparams = T.grad(sum_xent, self.params)
-            self.update = module.Method([self.input, self.targ], sum_xent,
+            gparams = T.grad(mean_xent, self.params)
+            self.update = module.Method([self.input, self.targ], mean_xent,
                     updates = dict((p, p - self.lr * g) for p, g in zip(self.params, gparams)))
 
 
diff -r f11881a265ee -r 3a68b6936303 pylearn/algorithms/regressor.py
--- a/pylearn/algorithms/regressor.py	Thu Nov 12 12:24:05 2009 -0500
+++ b/pylearn/algorithms/regressor.py	Thu Nov 12 12:24:36 2009 -0500
@@ -16,7 +16,7 @@
         if input:
             self.input = input
         else:
-            self.target = target
+            self.input = T.dmatrix('input')
 
         if target:
             self.target = target
diff -r f11881a265ee -r 3a68b6936303 pylearn/datasets/cifar10.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/datasets/cifar10.py	Thu Nov 12 12:24:36 2009 -0500
@@ -0,0 +1,61 @@
+"""
+Various routines to load/access MNIST data.
+""" +from __future__ import absolute_import + +import os +import numpy +import cPickle + +from pylearn.datasets.config import data_root # config +from pylearn.datasets.dataset import Dataset + +def unpickle(file): + path = os.path.join(data_root(), 'cifar10', 'cifar-10-batches-py') + fname = os.path.join(path, file) + print 'loading file %s' % fname + fo = open(fname, 'rb') + dict = cPickle.load(fo) + fo.close() + return dict + +class cifar10(): + + def __init__(self, dtype='uint8', ntrain=40000, nvalid=10000, ntest=10000): + assert ntrain + nvalid <= 50000 + assert ntest <= 10000 + + self.img_shape = (3,32,32) + self.img_size = numpy.prod(self.img_shape) + self.n_classes = 10 + + lenx = numpy.ceil((ntrain + nvalid) / 10000.)*10000 + x = numpy.zeros((lenx,self.img_size), dtype=dtype) + y = numpy.zeros(lenx, dtype=dtype) + + fnames = ['data_batch_%i'%i for i in range(1,6)] + + # load train and validation data + nloaded = 0 + for i, fname in enumerate(fnames): + data = unpickle(fname) + x[i*10000:(i+1)*10000, :] = data['data'] + y[i*10000:(i+1)*10000] = data['labels'] + + nloaded += 10000 + if nloaded >= ntrain + nvalid + ntest: break; + + self.train = Dataset.Obj(x=x[0:ntrain], y=y[0:ntrain]) + self.valid = Dataset.Obj(x=x[ntrain:ntrain+nvalid], + y=y[ntrain:ntrain+nvalid]) + + # load test data + data = unpickle('test_batch') + self.test = Dataset.Obj(x=data['data'][0:ntest], + y=data['labels'][0:ntest]) + + def preprocess(self, x): + return numpy.float64( x *1.0 / 255.0) + +def first_1k(dtype='uint8', ntrain=1000, nvalid=200, ntest=200): + return cifar10(dtype, ntrain, nvalid, ntest) diff -r f11881a265ee -r 3a68b6936303 pylearn/datasets/faces.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/datasets/faces.py Thu Nov 12 12:24:36 2009 -0500 @@ -0,0 +1,45 @@ +""" +Various routines to load/access faces datasets. +""" +from __future__ import absolute_import + +import os +import numpy +import pylab as pl +from .config import data_root # config +from .dataset import Dataset + +def att(path=None, randomize=True, normalize=True): + path = os.path.join(data_root(), 'faces','att','orl_faces')\ + if path is None else path + + h, w = 112, 92 + nsubjects = 40 + npics = 10 + + x = numpy.zeros((nsubjects * npics, h * w)) + y = numpy.zeros(nsubjects * npics) + + for sid in range(nsubjects): + sdir = os.path.join(path, 's%i'%(sid+1)) + for n in range(npics): + img = pl.imread(os.path.join(sdir,'%i.pgm'%(n+1))) + x[sid*npics + n,:] = img[::-1,:].flatten() + y[sid*npics + n] = sid + + if normalize: + x *= (1.0 / 255.0) + + perm = numpy.random.permutation(len(x)) + + rval = Dataset() + rval.n_classes = nsubjects + rval.img_shape = (112,92) + rval.train = Dataset.Obj(x=x[perm,:], y=y[perm]) + + # Not sure how well dataset lends itself to classification (only 400 images!) 
+    # therefore not too sure it makes sense to have a train/test split
+    rval.valid = None
+    rval.test = None
+
+    return rval
diff -r f11881a265ee -r 3a68b6936303 pylearn/datasets/image_patches.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/datasets/image_patches.py	Thu Nov 12 12:24:36 2009 -0500
@@ -0,0 +1,42 @@
+"""
+Routines to load/access Olshausen's image_patches
+"""
+
+import os
+import numpy
+
+from .config import data_root
+from .dataset import Dataset
+
+dirpath = os.path.join(data_root(), 'image_patches','olshausen','smaller_patches')
+
+paths = {'20by20_whiten_01': ('natural_images_patches_whiten.amat',(20,20)),
+         '12by12_whiten_01': ('natural_images_patches_whiten_12_by_12_0_1.amat',(12,12))}
+
+def load_dataset(ntrain=70000, nvalid=15000, ntest=15000, variant='20by20_whiten_01'):
+
+    ndata = 100000
+
+    if not paths.get(variant, None):
+        raise ValueError('Unknown image_patches variant: %s' % variant)
+    if ntrain + nvalid + ntest < ndata:
+        raise ValueError('ntrain + nvalid + ntest must be smaller than %i' %ndata)
+
+    fname = os.path.join(dirpath, paths[variant][0])
+    data = numpy.loadtxt(fname)
+    x = data[:,:-1]
+    y = data[:,-1]
+
+    perm = numpy.random.permutation(ndata)
+
+    rval = Dataset()
+    rval.train = Dataset.Obj(x = x[perm[:ntrain],:], y = y[perm[:ntrain]])
+    rval.valid = Dataset.Obj(x = x[perm[ntrain:ntrain+nvalid],:],
+                             y = y[perm[ntrain:ntrain+nvalid]])
+    rval.test = Dataset.Obj(x = x[perm[:-ntest],:],
+                            y = y[perm[:-ntest]])
+
+    rval.n_classes = 10
+    rval.img_shape = paths[variant][1]
+
+    return rval
diff -r f11881a265ee -r 3a68b6936303 pylearn/datasets/miniblocks.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/datasets/miniblocks.py	Thu Nov 12 12:24:36 2009 -0500
@@ -0,0 +1,55 @@
+# Interface to miniblocks dataset.
+
+import herding, numpy
+import herding.dataset
+
+from pylearn.datasets import Dataset
+
+def miniblocks(reweight=None):
+    # If 'reweight' is not None, then it is an integer N such that each
+    # sample is duplicated k times, with k taken uniformly in {1, 2, ..., N}.
+    # Some adjustment is made to ensure the dataset size is a multiple of its
+    # original size.
+    data = herding.dataset.Miniblocks(4, batchsize = -1, forever = False,
+            zeroone = True)
+
+    input, target = iter(data).next()
+
+    #from plearn.pyext import pl
+    #data = pl.AutoVMatrix(filename='/u/delallea/LisaPLearn/UserExp/delallea/perso/gen_compare/1DBall_12.amat').getMat()
+    #data = pl.AutoVMatrix(filename='/home/fringant2/lisa/delallea/python_modules/LeDeepNet/mnist_binarized.pmat').getMat()
+    #input = data
+
+    # Note that the target being returned seems to be a dummy target. So
+    # instead, we fill it with zeros.
+    target = numpy.zeros((len(input), 1))
+
+    if reweight is not None:
+        assert isinstance(reweight, int)
+        rgen = numpy.random.RandomState(1827)
+        weights = rgen.randint(1, reweight + 1, size = len(input))
+        new_length = numpy.sum(weights)
+        while new_length % len(input) > 0:
+            to_prune = rgen.randint(len(input))
+            if weights[to_prune] > 1:
+                weights[to_prune] -= 1
+                new_length -= 1
+        assert new_length == numpy.sum(weights)
+        new_input = numpy.zeros((new_length, input.shape[1]))
+        new_target = numpy.zeros((new_length, target.shape[1]))
+        idx = 0
+        for w, i, t in zip(weights, input, target):
+            for k in range(w):
+                new_input[idx, :] = i
+                new_target[idx, :] = t
+                idx += 1
+        input = new_input
+        target = new_target
+        print 'Dataset size after reweighting: %s' % (input.shape, )
+
+    set = Dataset()
+    set.train = Dataset.Obj(x = input, y = target)
+    set.test = Dataset.Obj(x = input, y = target)
+
+    return set
+
diff -r f11881a265ee -r 3a68b6936303 pylearn/datasets/norb_small.py
--- a/pylearn/datasets/norb_small.py	Thu Nov 12 12:24:05 2009 -0500
+++ b/pylearn/datasets/norb_small.py	Thu Nov 12 12:24:36 2009 -0500
@@ -4,13 +4,14 @@
 from .config import data_root
 from .dataset import Dataset
 
-def load_file(info, normalize=True, downsample_amt=1, dtype='float64'):
+def load_file(info, normalize=True, mode='stereo', downsample_amt=1, dtype='float64'):
     """ Load the smallNorb data into numpy matrices.
 
     normalize_pixels True will divide the values by 255, which makes sense in conjunction
     with dtype=float32 or dtype=float64.
     """
+    assert mode in ('stereo','mono')
 
     # NotImplementedError: subtensor access not written yet
     #subt = [numpy.arange(self.dim[0]),
             #numpy.arange(0,self.dim[1],downsample_amt),
@@ -23,6 +24,8 @@
     dat = numpy.asarray(dat, dtype=dtype)
     if normalize:
         dat *= (1.0 / 255.0)
+    if mode == 'mono':
+        dat = dat[:,0,:,:]
 
     labels = read(open(info['cat']))
 
@@ -77,6 +80,7 @@
         self.n_classes = 5
        self.nsamples = 24300
         self.img_shape = (2,96,96) if mode=='stereo' else (96,96)
+        self.mode = mode
 
         self.ntrain = ntrain
         self.nvalid = nvalid
@@ -132,7 +136,7 @@
 
                 print 'need to reload from train file'
                 dat, cat = load_file(self.path.train, self.normalize,
-                                     self.downsample_amt, self.dtype)
+                                     self.mode, self.downsample_amt, self.dtype)
 
                 x = dat[self.itr,...].reshape(self.ntrain,-1)
                 y = cat[self.itr]
@@ -155,7 +159,7 @@
 
                 print 'need to reload from test file'
                 dat, cat = load_file(self.path.test, self.normalize,
-                                     self.downsample_amt, self.dtype)
+                                     self.mode, self.downsample_amt, self.dtype)
 
                 x = dat.reshape(self.nsamples,-1)
                 y = cat
diff -r f11881a265ee -r 3a68b6936303 pylearn/sandbox/scan_inputs_groups.py
--- a/pylearn/sandbox/scan_inputs_groups.py	Thu Nov 12 12:24:05 2009 -0500
+++ b/pylearn/sandbox/scan_inputs_groups.py	Thu Nov 12 12:24:36 2009 -0500
@@ -691,7 +691,7 @@
             ]
 
     def c_headers(self):
-        return ['"Python.h"', '"numpy/noprefix.h"', '']
+        return ['"Python.h"', '"numpy/noprefix.h"', '', '']
 
     def c_support_code(self):
         return """
@@ -776,7 +776,12 @@
     }
 }
 
-assert(input->nd==value->nd==mask->nd);
+if(input->nd!=value->nd || input->nd!=mask->nd){
+    PyErr_Format(PyExc_ValueError,
+        "FillMissing input have %%d dims, the mask have %%d dims and the value have %%d dims. They should all be equals \\n",
+        input->nd, value->nd, mask->nd);
+    %(fail)s;
+}
 #if %(self.fill_with_is_array)s
 if(input->nd==1){
     %(type)s* value_ = (%(type)s*)(value->data);
@@ -809,7 +814,7 @@
                 }
             }else{//not implemented!
                //SHOULD not happen as c_code should revert to the python version in that case
-                std:stringstream temp;
+                std::stringstream temp;
                 temp << "In FillMissing, we try to fill with an array and the input ndim is implemented only for 1 and 2. This case is not implemented."<