Mercurial > pylearn
view make_test_datasets.py @ 505:74b3e65f5f24
added smallNorb dataset, switched to PYLEARN_DATA_ROOT
author | James Bergstra <bergstrj@iro.umontreal.ca> |
---|---|
date | Wed, 29 Oct 2008 17:09:04 -0400 |
parents | 2d8490d76b3e |
children | 3eb59514b534 |
line wrap: on
line source
import dataset from shapeset.dset import Polygons from linear_regression import linear_predictor from kernel_regression import kernel_predictor from numpy import * """ General-purpose code to generate artificial datasets that can be used to test different learning algorithms. """ def make_triangles_rectangles_online_dataset(image_size=(10,10)): """ Make a binary classification dataset to discriminate triangle images from rectangle images. """ def convert_dataset(dset): # convert the n_vert==3 into target==0 and n_vert==4 into target==1 def mapf(images,n_vertices): n=len(n_vertices) targets = ndarray((n,1),dtype='float64') for i in xrange(n): targets[i,0] = array([0. if n_vertices[i]==3 else 1.],dtype='float64') return images.reshape(len(images),images[0].size).astype('float64'),targets return dataset.ApplyFunctionDataSet(dset("image","nvert"),mapf,["input","target"]) p=Polygons(image_size,[3,4],fg_min=1./255,fg_max=1./255,rot_max=1.,scale_min=0.35,scale_max=0.9,pos_min=0.1, pos_max=0.9) trainset=convert_dataset(p) return trainset def make_triangles_rectangles_dataset(n_examples=600,image_size=(10,10), cache = True): """ Make a binary classification dataset to discriminate triangle images from rectangle images. """ def convert_dataset(dset): # convert the n_vert==3 into target==0 and n_vert==4 into target==1 def mapf(images,n_vertices): n=len(n_vertices) targets = ndarray((n,1),dtype='float64') for i in xrange(n): targets[i,0] = array([0. if n_vertices[i]==3 else 1.],dtype='float64') return images.reshape(len(images),images[0].size).astype('float64'),targets return dataset.CachedDataSet(dataset.ApplyFunctionDataSet(dset("image","nvert"),mapf,["input","target"]),cache) p=Polygons(image_size,[3,4],fg_min=1./255,fg_max=1./255,rot_max=1.,scale_min=0.35,scale_max=0.9,pos_min=0.1, pos_max=0.9) data = p.subset[0:n_examples] trainset=convert_dataset(data.subset[0:n_examples]) return trainset def make_triangles_rectangles_datasets(n_examples=600,train_frac=0.5,image_size=(10,10), cache = True): """ Make two binary classification datasets to discriminate triangle images from rectangle images. The first one is the training set, the second is the test set. """ data = make_triangles_rectangles_dataset(n_examples=n_examples,image_size=image_size, cache = cache) n_train = int(n_examples*train_frac) trainset=convert_dataset(data.subset[0:n_train]) testset=convert_dataset(data.subset[n_train:n_examples]) return trainset,testset def make_artificial_datasets_from_function(n_inputs=1, n_targets=1, n_examples=20, train_frac=0.5, noise_level=0.1, # add Gaussian noise, noise_level=sigma params_shape=None, f=None, # function computing E[Y|X] otherargs=None, # extra args to f b=None): # force theta[0] with this value """ Make regression data of the form Y | X ~ Normal(f(X,theta,otherargs),noise_level^2) If n_inputs==1 then X is chosen at regular locations on the [-1,1] interval. Otherwise X is sampled according to a Normal(0,1) on all dimensions (independently). The parameters theta is a matrix of shape params_shape that is sampled from Normal(0,1). Optionally theta[0] is set to the argument 'b', if b is provided. Return a training set and a test set, by splitting the generated n_examples according to the 'train_frac'tion. """ n_train=int(train_frac*n_examples) n_test=n_examples-n_train if n_inputs==1: delta1=2./n_train delta2=2./n_test inputs = vstack((array(zip(range(n_train)))*delta1-1, 0.5*delta2+array(zip(range(n_test)))*delta2-1)) else: inputs = random.normal(size=(n_examples,n_inputs)) if not f: f = linear_predictor if f==kernel_predictor and not otherargs[1]: otherargs=(otherargs[0],inputs[0:n_train]) if not params_shape: if f==linear_predictor: params_shape = (n_inputs+1,n_targets) elif f==kernel_predictor: params_shape = (otherargs[1].shape[0]+1,n_targets) theta = random.normal(size=params_shape) if params_shape else None if b: theta[0]=b outputs = f(inputs,theta,otherargs) targets = outputs + random.normal(scale=noise_level,size=(n_examples,n_targets)) # the | stacking creates a strange bug in LookupList constructor: # trainset = ArrayDataSet(inputs[0:n_examples/2],{'input':slice(0,n_inputs)}) | \ # ArrayDataSet(targets[0:n_examples/2],{'target':slice(0,n_targets)}) # testset = ArrayDataSet(inputs[n_examples/2:],{'input':slice(0,n_inputs)}) | \ # ArrayDataSet(targets[n_examples/2:],{'target':slice(0,n_targets)}) data = hstack((inputs,targets)) trainset = ArrayDataSet(data[0:n_train], {'input':slice(0,n_inputs),'target':slice(n_inputs,n_inputs+n_targets)}) testset = ArrayDataSet(data[n_train:], {'input':slice(0,n_inputs),'target':slice(n_inputs,n_inputs+n_targets)}) return trainset,testset,theta