view make_test_datasets.py @ 496:f13847478c6d

A few more ideas, in comments
author Joseph Turian <turian@gmail.com>
date Tue, 28 Oct 2008 12:09:49 -0400
parents 2d8490d76b3e
children 3eb59514b534
line wrap: on
line source

import dataset
from shapeset.dset import Polygons
from linear_regression import linear_predictor
from kernel_regression import kernel_predictor
from numpy import *

"""
General-purpose code to generate artificial datasets that can be used
to test different learning algorithms.
"""


def make_triangles_rectangles_online_dataset(image_size=(10,10)):
    """
    Build a binary classification dataset whose task is to tell triangle
    images apart from rectangle images.

    A Polygons source is created for 3- and 4-vertex shapes of the given
    image_size, and its "image" and "nvert" fields are mapped to the fields
    "input" (each image flattened into one float64 row) and "target"
    (a float64 column: 0. for 3 vertices, 1. otherwise).
    The source is wrapped as is, without taking a subset of it.
    """
    def to_inputs_and_targets(images, n_vertices):
        # One label per example: 0. when the shape has 3 vertices, 1. otherwise.
        n_rows = len(n_vertices)
        labels = ndarray((n_rows, 1), dtype='float64')
        for row in range(n_rows):
            if n_vertices[row] == 3:
                labels[row, 0] = 0.
            else:
                labels[row, 0] = 1.
        # Flatten every image into a single row of pixel values.
        flat_images = images.reshape(len(images), images[0].size)
        return flat_images.astype('float64'), labels

    polygons = Polygons(image_size, [3, 4], fg_min=1./255, fg_max=1./255,
                        rot_max=1., scale_min=0.35, scale_max=0.9,
                        pos_min=0.1, pos_max=0.9)
    raw_fields = polygons("image", "nvert")
    return dataset.ApplyFunctionDataSet(raw_fields, to_inputs_and_targets,
                                        ["input", "target"])


def make_triangles_rectangles_dataset(n_examples=600,image_size=(10,10), cache = True):
    """
    Build a binary classification dataset of n_examples examples whose task
    is to tell triangle images apart from rectangle images.

    A Polygons source is created for 3- and 4-vertex shapes of the given
    image_size and restricted to its first n_examples examples. Its "image"
    and "nvert" fields are mapped to the fields "input" (each image flattened
    into one float64 row) and "target" (a float64 column: 0. for 3 vertices,
    1. otherwise). The result is wrapped in a dataset.CachedDataSet, which
    receives the 'cache' argument.
    """
    def to_inputs_and_targets(images, n_vertices):
        # One label per example: 0. when the shape has 3 vertices, 1. otherwise.
        n_rows = len(n_vertices)
        labels = ndarray((n_rows, 1), dtype='float64')
        for row in range(n_rows):
            if n_vertices[row] == 3:
                labels[row, 0] = 0.
            else:
                labels[row, 0] = 1.
        # Flatten every image into a single row of pixel values.
        flat_images = images.reshape(len(images), images[0].size)
        return flat_images.astype('float64'), labels

    polygons = Polygons(image_size, [3, 4], fg_min=1./255, fg_max=1./255,
                        rot_max=1., scale_min=0.35, scale_max=0.9,
                        pos_min=0.1, pos_max=0.9)
    first_examples = polygons.subset[0:n_examples]
    # NOTE(review): the second subset over the same range looks redundant;
    # kept as is until the dataset API is checked.
    selected = first_examples.subset[0:n_examples]
    converted = dataset.ApplyFunctionDataSet(selected("image", "nvert"),
                                             to_inputs_and_targets,
                                             ["input", "target"])
    return dataset.CachedDataSet(converted, cache)


def make_triangles_rectangles_datasets(n_examples=600,train_frac=0.5,image_size=(10,10), cache = True):
    """
    Make two binary classification datasets to discriminate triangle images from rectangle images.
    The first one is the training set, the second is the test set.

    n_examples examples are generated with make_triangles_rectangles_dataset
    (image_size and cache are passed on to it). The first
    int(n_examples*train_frac) examples form the training set and the
    remaining ones form the test set.

    Return the pair (trainset, testset).
    """
    data = make_triangles_rectangles_dataset(n_examples=n_examples,
                                             image_size=image_size,
                                             cache=cache)
    n_train = int(n_examples * train_frac)
    # 'data' already holds the converted "input"/"target" fields, so it only
    # has to be split. (The conversion helper is local to the functions that
    # build the dataset and is not visible from here.)
    trainset = data.subset[0:n_train]
    testset = data.subset[n_train:n_examples]
    return trainset, testset


def make_artificial_datasets_from_function(n_inputs=1,
                                           n_targets=1,
                                           n_examples=20,
                                           train_frac=0.5,
                                           noise_level=0.1, # add Gaussian noise, noise_level=sigma
                                           params_shape=None,
                                           f=None, # function computing E[Y|X]
                                           otherargs=None, # extra args to f
                                           b=None): # force theta[0] with this value
    """
    Make regression data of the form
      Y | X ~ Normal(f(X,theta,otherargs),noise_level^2)
    If n_inputs==1 then X is chosen at regular locations on the [-1,1] interval:
    the training inputs are i*(2/n_train)-1 and the test inputs are
    (i+0.5)*(2/n_test)-1. An empty split contributes no inputs.
    Otherwise X is sampled according to a Normal(0,1) on all dimensions (independently).
    The parameters theta is a matrix of shape params_shape that is sampled from Normal(0,1).
    When params_shape is not given it is derived from f for linear_predictor
    and kernel_predictor; for any other f, theta is None.
    Optionally theta[0] is set to the argument 'b', if b is provided
    (any value other than None, including 0).

    f defaults to linear_predictor. When f is kernel_predictor and
    otherargs[1] is missing, the training inputs are used in its place.

    Return a training set and a test set, by splitting the generated n_examples
    according to the 'train_frac'tion, followed by the parameters theta:
    (trainset, testset, theta).
    """
    def regular_inputs(n_points, shift):
        # Column of n_points values (shift + i) * (2/n_points) - 1.
        # An empty split yields an empty column instead of dividing by zero.
        if n_points == 0:
            return zeros((0, 1))
        step = 2. / n_points
        return shift * step + arange(n_points).reshape(n_points, 1) * step - 1

    n_train = int(train_frac * n_examples)
    n_test = n_examples - n_train
    if n_inputs == 1:
        # Test locations are shifted by half a step relative to the grid start.
        inputs = vstack((regular_inputs(n_train, 0.),
                         regular_inputs(n_test, 0.5)))
    else:
        inputs = random.normal(size=(n_examples, n_inputs))
    if f is None:
        f = linear_predictor
    if f == kernel_predictor:
        second_arg = otherargs[1]
        # Truth-testing a multi-element array raises, so "missing" is checked
        # explicitly: None, or a falsy value that is not an array.
        if second_arg is None or (not hasattr(second_arg, 'shape') and not second_arg):
            otherargs = (otherargs[0], inputs[0:n_train])
    if not params_shape:
        if f == linear_predictor:
            params_shape = (n_inputs + 1, n_targets)
        elif f == kernel_predictor:
            params_shape = (otherargs[1].shape[0] + 1, n_targets)
    theta = random.normal(size=params_shape) if params_shape else None
    # 'is not None' so that b=0 (or an array-valued b) is honoured.
    if b is not None:
        theta[0] = b
    outputs = f(inputs, theta, otherargs)
    targets = outputs + random.normal(scale=noise_level, size=(n_examples, n_targets))
    # Inputs and targets are put side by side in one array and exposed as two
    # fields, because building the sets by stacking two ArrayDataSets with |
    # creates a strange bug in the LookupList constructor.
    data = hstack((inputs, targets))
    trainset = dataset.ArrayDataSet(data[0:n_train],
                                    {'input': slice(0, n_inputs),
                                     'target': slice(n_inputs, n_inputs + n_targets)})
    testset = dataset.ArrayDataSet(data[n_train:],
                                   {'input': slice(0, n_inputs),
                                    'target': slice(n_inputs, n_inputs + n_targets)})
    return trainset, testset, theta