Mercurial > pylearn
changeset 431:0f8c81b0776d
Adding file make_test_datasets to host simple data-generating processes
to create artificial datasets meant to test various learning algorithms.
author | Yoshua Bengio <bengioy@iro.umontreal.ca> |
---|---|
date | Tue, 29 Jul 2008 10:19:25 -0400 |
parents | c096e2820131 |
children | 8e4d2ebd816a |
files | kernel_regression.py linear_regression.py make_test_datasets.py |
diffstat | 3 files changed, 91 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- a/kernel_regression.py Tue Jul 29 09:36:09 2008 -0400 +++ b/kernel_regression.py Tue Jul 29 10:19:25 2008 -0400 @@ -225,3 +225,7 @@ return ds +def kernel_predictor(inputs,params,*otherargs): + p = KernelPredictor(params,*otherargs[0]) + return p.compute_outputs(inputs) +
--- a/linear_regression.py Tue Jul 29 09:36:09 2008 -0400 +++ b/linear_regression.py Tue Jul 29 10:19:25 2008 -0400 @@ -181,6 +181,10 @@ return ds +def linear_predictor(inputs,params,*otherargs): + p = LinearPredictor(params) + return p.compute_outputs(inputs) + #TODO : an online version class OnlineLinearRegression(OnlineLearningAlgorithm): """
"""
General-purpose code to generate artificial datasets that can be used
to test different learning algorithms.
"""

from numpy import arange, hstack, ndarray, random, vstack

from pylearn import dataset
from pylearn.dataset import ArrayDataSet

# NOTE(review): `Polygons` and `save_polygon_data` are used by
# make_triangles_rectangles_datasets but no import for them is visible in
# this file; the module that defines them must be imported here before that
# function can run.


def make_triangles_rectangles_datasets(n_examples=600, train_frac=0.5, image_size=(10, 10)):
    """
    Make a binary classification dataset to discriminate triangle images
    from rectangle images.

    n_examples -- total number of examples taken from the polygon generator
    train_frac -- fraction of n_examples that goes into the training set
    image_size -- image size handed to the Polygons constructor

    The generated data is also passed to save_polygon_data(data, "shapes").

    Return (trainset, testset).  Both are cached datasets with two fields:
    "input" (every image flattened into one float64 row) and "target"
    (a float64 column holding 0. for 3 vertices and 1. otherwise).
    """
    def convert_dataset(dset):
        # convert the n_vert==3 into target==0 and n_vert==4 into target==1
        def mapf(images, n_vertices):
            n = len(n_vertices)
            targets = ndarray((n, 1), dtype='float64')
            for i, n_vert in enumerate(n_vertices):
                targets[i, 0] = 0. if n_vert == 3 else 1.
            flat_images = images.reshape(len(images), images[0].size)
            return flat_images.astype('float64'), targets
        mapped = dataset.ApplyFunctionDataSet(dset("image", "nvert"), mapf,
                                              ["input", "target"])
        return dataset.CachedDataSet(mapped, True)

    p = Polygons(image_size, [3, 4], fg_min=1./255, fg_max=1./255, rot_max=1.,
                 scale_min=0.35, scale_max=0.9, pos_min=0.1, pos_max=0.9)
    data = p.subset[0:n_examples]
    save_polygon_data(data, "shapes")
    n_train = int(n_examples * train_frac)
    trainset = convert_dataset(data.subset[0:n_train])
    testset = convert_dataset(data.subset[n_train:n_examples])
    return trainset, testset


def _regular_grid(n_points, shift):
    """
    Return an (n_points,1) column of regularly spaced locations in [-1,1].

    Point i sits at (i+shift)*(2/n_points) - 1.  An empty column is returned
    for n_points==0 so that an empty train or test split does not divide by zero.
    """
    if n_points == 0:
        return ndarray((0, 1), dtype='float64')
    delta = 2. / n_points
    return shift * delta + arange(n_points).reshape(n_points, 1) * delta - 1


def make_artificial_datasets_from_function(n_inputs=1,
                                           n_targets=1,
                                           n_examples=20,
                                           train_frac=0.5,
                                           noise_level=0.1,  # add Gaussian noise, noise_level=sigma
                                           params_shape=None,
                                           f=None,  # function computing E[Y|X]
                                           otherargs=None,  # extra args to f
                                           b=None):  # force theta[0] with this value
    """
    Make regression data of the form
      Y | X ~ Normal(f(X,theta,otherargs),noise_level^2)
    If n_inputs==1 then X is chosen at regular locations on the [-1,1] interval
    (the test locations are shifted by half a step relative to the grid).
    Otherwise X is sampled according to a Normal(0,1) on all dimensions (independently).
    The parameter matrix theta has shape params_shape and is sampled from Normal(0,1).
    Optionally theta[0] is set to the argument 'b', if b is provided (b=0 included).

    f defaults to linear_predictor.  When params_shape is not given it is
    derived for linear_predictor and kernel_predictor only; for any other f
    theta is None (and b must then be left at None).
    When f is kernel_predictor, otherargs must be a pair whose second element
    is the array of support inputs; if that element is None or empty, the
    training inputs are used instead.

    Return a training set and a test set, by splitting the generated n_examples
    according to the 'train_frac'tion, followed by theta:
      (trainset, testset, theta)
    """
    # Imported locally so that merely importing this module does not pull in
    # the regression modules.
    from pylearn.kernel_regression import kernel_predictor
    from pylearn.linear_regression import linear_predictor

    n_train = int(train_frac * n_examples)
    n_test = n_examples - n_train
    if n_inputs == 1:
        inputs = vstack((_regular_grid(n_train, 0.), _regular_grid(n_test, 0.5)))
    else:
        inputs = random.normal(size=(n_examples, n_inputs))
    if f is None:
        f = linear_predictor
    if f == kernel_predictor:
        support = otherargs[1]
        # Explicit None/empty test: the truth value of a multi-row array is ambiguous.
        if support is None or len(support) == 0:
            otherargs = (otherargs[0], inputs[0:n_train])
    if not params_shape:
        if f == linear_predictor:
            params_shape = (n_inputs + 1, n_targets)
        elif f == kernel_predictor:
            params_shape = (otherargs[1].shape[0] + 1, n_targets)
    theta = random.normal(size=params_shape) if params_shape else None
    # `is not None` rather than truthiness, so that b=0 or an array-valued b is honoured.
    if b is not None:
        theta[0] = b
    outputs = f(inputs, theta, otherargs)
    targets = outputs + random.normal(scale=noise_level, size=(n_examples, n_targets))
    # Inputs and targets are stacked into a single array because combining two
    # ArrayDataSets with `|` creates a strange bug in the LookupList constructor.
    data = hstack((inputs, targets))
    input_cols = slice(0, n_inputs)
    target_cols = slice(n_inputs, n_inputs + n_targets)
    trainset = ArrayDataSet(data[0:n_train],
                            {'input': input_cols, 'target': target_cols})
    testset = ArrayDataSet(data[n_train:],
                           {'input': input_cols, 'target': target_cols})
    return trainset, testset, theta