annotate make_test_datasets.py @ 459:f400f62e7f9e

Fixed embedding preprocessing
author Joseph Turian <turian@iro.umontreal.ca>
date Tue, 07 Oct 2008 23:00:10 -0400
parents 2d8490d76b3e
children 3eb59514b534
rev   line source
437
2d8490d76b3e added two methods to make_test_datasets
Olivier Breuleux <breuleuo@iro.umontreal.ca>
parents: 432
diff changeset
1 import dataset
432
8e4d2ebd816a added a test for LinearRegression
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 431
diff changeset
2 from shapeset.dset import Polygons
8e4d2ebd816a added a test for LinearRegression
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 431
diff changeset
3 from linear_regression import linear_predictor
8e4d2ebd816a added a test for LinearRegression
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 431
diff changeset
4 from kernel_regression import kernel_predictor
8e4d2ebd816a added a test for LinearRegression
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents: 431
diff changeset
5 from numpy import *
431
0f8c81b0776d Adding file make_test_datasets to host simple data-generating processes
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
diff changeset
6
0f8c81b0776d Adding file make_test_datasets to host simple data-generating processes
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
diff changeset
7 """
0f8c81b0776d Adding file make_test_datasets to host simple data-generating processes
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
diff changeset
8 General-purpose code to generate artificial datasets that can be used
0f8c81b0776d Adding file make_test_datasets to host simple data-generating processes
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
diff changeset
9 to test different learning algorithms.
0f8c81b0776d Adding file make_test_datasets to host simple data-generating processes
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
diff changeset
10 """
0f8c81b0776d Adding file make_test_datasets to host simple data-generating processes
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
diff changeset
11
437
2d8490d76b3e added two methods to make_test_datasets
Olivier Breuleux <breuleuo@iro.umontreal.ca>
parents: 432
diff changeset
12
2d8490d76b3e added two methods to make_test_datasets
Olivier Breuleux <breuleuo@iro.umontreal.ca>
parents: 432
diff changeset
def make_triangles_rectangles_online_dataset(image_size=(10,10)):
    """
    Make a binary classification dataset to discriminate triangle images from rectangle images.

    The Polygons generator is wrapped directly: no subset is taken and no
    caching layer is added.

    image_size: forwarded unchanged as the first argument of Polygons.

    Returns a dataset.ApplyFunctionDataSet exposing two fields:
      "input"  -- the images, reshaped to one float64 row per image
      "target" -- a float64 column, 0. where nvert==3 and 1. otherwise
    """
    def convert_dataset(dset):
        # convert the n_vert==3 into target==0 and n_vert==4 into target==1
        def mapf(images, n_vertices):
            n = len(n_vertices)
            # Build the whole target column in one vectorized comparison
            # instead of filling it element by element in a Python loop.
            targets = (asarray(n_vertices) != 3).astype('float64').reshape(n, 1)
            flat_images = images.reshape(len(images), images[0].size).astype('float64')
            return flat_images, targets
        return dataset.ApplyFunctionDataSet(dset("image", "nvert"), mapf, ["input", "target"])

    # [3,4] is presumably the list of allowed vertex counts (triangles and
    # rectangles) -- confirm against shapeset.dset.Polygons.
    p = Polygons(image_size, [3,4], fg_min=1./255, fg_max=1./255, rot_max=1.,
                 scale_min=0.35, scale_max=0.9, pos_min=0.1, pos_max=0.9)
    trainset = convert_dataset(p)
    return trainset
2d8490d76b3e added two methods to make_test_datasets
Olivier Breuleux <breuleuo@iro.umontreal.ca>
parents: 432
diff changeset
30
2d8490d76b3e added two methods to make_test_datasets
Olivier Breuleux <breuleuo@iro.umontreal.ca>
parents: 432
diff changeset
31
2d8490d76b3e added two methods to make_test_datasets
Olivier Breuleux <breuleuo@iro.umontreal.ca>
parents: 432
diff changeset
def make_triangles_rectangles_dataset(n_examples=600,image_size=(10,10), cache = True):
    """
    Make a binary classification dataset to discriminate triangle images from rectangle images.

    n_examples: number of leading examples taken from the Polygons generator
                (via p.subset[0:n_examples]).
    image_size: forwarded unchanged as the first argument of Polygons.
    cache:      forwarded unchanged as the second argument of
                dataset.CachedDataSet.

    Returns a dataset.CachedDataSet wrapping a dataset.ApplyFunctionDataSet
    that exposes two fields:
      "input"  -- the images, reshaped to one float64 row per image
      "target" -- a float64 column, 0. where nvert==3 and 1. otherwise
    """
    def convert_dataset(dset):
        # convert the n_vert==3 into target==0 and n_vert==4 into target==1
        def mapf(images, n_vertices):
            n = len(n_vertices)
            # Build the whole target column in one vectorized comparison
            # instead of filling it element by element in a Python loop.
            targets = (asarray(n_vertices) != 3).astype('float64').reshape(n, 1)
            flat_images = images.reshape(len(images), images[0].size).astype('float64')
            return flat_images, targets
        converted = dataset.ApplyFunctionDataSet(dset("image", "nvert"), mapf, ["input", "target"])
        return dataset.CachedDataSet(converted, cache)

    # [3,4] is presumably the list of allowed vertex counts (triangles and
    # rectangles) -- confirm against shapeset.dset.Polygons.
    p = Polygons(image_size, [3,4], fg_min=1./255, fg_max=1./255, rot_max=1.,
                 scale_min=0.35, scale_max=0.9, pos_min=0.1, pos_max=0.9)
    # A single slice is enough: re-slicing the result with the same bounds
    # selects the same examples again.
    data = p.subset[0:n_examples]
    trainset = convert_dataset(data)
    return trainset
2d8490d76b3e added two methods to make_test_datasets
Olivier Breuleux <breuleuo@iro.umontreal.ca>
parents: 432
diff changeset
50
2d8490d76b3e added two methods to make_test_datasets
Olivier Breuleux <breuleuo@iro.umontreal.ca>
parents: 432
diff changeset
51
2d8490d76b3e added two methods to make_test_datasets
Olivier Breuleux <breuleuo@iro.umontreal.ca>
parents: 432
diff changeset
def make_triangles_rectangles_datasets(n_examples=600,train_frac=0.5,image_size=(10,10), cache = True):
    """
    Make two binary classification datasets to discriminate triangle images from rectangle images.
    The first one is the training set, the second is the test set.

    n_examples: total number of examples generated.
    train_frac: fraction of the examples assigned to the training set; the
                training set holds int(n_examples*train_frac) examples and
                the test set holds the remaining ones.
    image_size, cache: forwarded to make_triangles_rectangles_dataset.

    Returns the pair (trainset, testset), both subsets of the dataset built
    by make_triangles_rectangles_dataset.
    """
    data = make_triangles_rectangles_dataset(n_examples=n_examples,image_size=image_size, cache = cache)
    n_train = int(n_examples*train_frac)
    # make_triangles_rectangles_dataset already returns the converted
    # ("input","target") dataset, so it only needs to be split here.
    # convert_dataset is a helper nested inside the builder functions and is
    # not visible from this scope, so calling it here raised NameError.
    trainset = data.subset[0:n_train]
    testset = data.subset[n_train:n_examples]
    return trainset,testset
0f8c81b0776d Adding file make_test_datasets to host simple data-generating processes
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
diff changeset
62
437
2d8490d76b3e added two methods to make_test_datasets
Olivier Breuleux <breuleuo@iro.umontreal.ca>
parents: 432
diff changeset
63
431
0f8c81b0776d Adding file make_test_datasets to host simple data-generating processes
Yoshua Bengio <bengioy@iro.umontreal.ca>
parents:
diff changeset
def make_artificial_datasets_from_function(n_inputs=1,
                                           n_targets=1,
                                           n_examples=20,
                                           train_frac=0.5,
                                           noise_level=0.1, # add Gaussian noise, noise_level=sigma
                                           params_shape=None,
                                           f=None, # function computing E[Y|X]
                                           otherargs=None, # extra args to f
                                           b=None): # force theta[0] with this value
    """
    Make regression data of the form
      Y | X ~ Normal(f(X,theta,otherargs),noise_level^2)
    If n_inputs==1 then X is chosen at regular locations on the [-1,1] interval
    (the test locations are shifted by half a step relative to their own grid).
    Otherwise X is sampled according to a Normal(0,1) on all dimensions (independently).
    The parameter matrix theta has shape params_shape and is sampled from Normal(0,1).
    Optionally theta[0] is set to the argument 'b', if b is provided
    (any value other than None, including 0).

    f defaults to linear_predictor. When params_shape is not given it is
    deduced for the two known predictors:
      linear_predictor -> (n_inputs+1, n_targets)
      kernel_predictor -> (otherargs[1].shape[0]+1, n_targets)
    For any other f without params_shape, theta is None.

    When f is kernel_predictor, otherargs must be a 2-element sequence; if
    its second element is missing (None or another empty placeholder) it is
    replaced by the training inputs.

    Return a training set and a test set, by splitting the generated n_examples
    according to the 'train_frac'tion, followed by theta:
      (trainset, testset, theta)
    Both sets are dataset.ArrayDataSet objects with fields 'input' and 'target'.

    Raises TypeError if f is kernel_predictor and otherargs is None.
    """
    n_train = int(train_frac*n_examples)
    n_test = n_examples-n_train
    if n_inputs==1:
        # One column of regularly spaced points per split. arange/reshape
        # gives the same (n,1) column as array(zip(range(n))) did under
        # Python 2, and also copes with an empty split.
        train_grid = arange(n_train).reshape(n_train,1).astype('float64')
        test_grid = arange(n_test).reshape(n_test,1).astype('float64')
        if n_train:
            delta1 = 2./n_train
            train_grid = train_grid*delta1-1
        if n_test:
            delta2 = 2./n_test
            test_grid = 0.5*delta2+test_grid*delta2-1
        inputs = vstack((train_grid,test_grid))
    else:
        inputs = random.normal(size=(n_examples,n_inputs))
    if f is None:
        f = linear_predictor
    if f==kernel_predictor:
        if otherargs is None:
            raise TypeError("otherargs must be a 2-element sequence when f is kernel_predictor")
        second = otherargs[1]
        # Only None / empty placeholders count as "missing"; an actual array
        # must not be truth-tested (ambiguous for more than one element).
        if second is None or (not hasattr(second,'shape') and not second):
            otherargs = (otherargs[0],inputs[0:n_train])
    if not params_shape:
        if f==linear_predictor:
            params_shape = (n_inputs+1,n_targets)
        elif f==kernel_predictor:
            params_shape = (otherargs[1].shape[0]+1,n_targets)
    theta = random.normal(size=params_shape) if params_shape else None
    # 'is not None' so that an explicit b=0 is honoured.
    if b is not None:
        theta[0] = b
    outputs = f(inputs,theta,otherargs)
    targets = outputs + random.normal(scale=noise_level,size=(n_examples,n_targets))
    # Inputs and targets are stored side by side in one array and exposed as
    # two fields; building separate input/target ArrayDataSets and joining
    # them with | created a strange bug in the LookupList constructor.
    data = hstack((inputs,targets))
    field_slices = {'input':slice(0,n_inputs),
                    'target':slice(n_inputs,n_inputs+n_targets)}
    # Each dataset gets its own copy of the field mapping.
    trainset = dataset.ArrayDataSet(data[0:n_train],dict(field_slices))
    testset = dataset.ArrayDataSet(data[n_train:],dict(field_slices))
    return trainset,testset,theta