changeset 431:0f8c81b0776d

Add make_test_datasets.py, hosting simple data-generating processes that create artificial datasets for testing various learning algorithms.
author Yoshua Bengio <bengioy@iro.umontreal.ca>
date Tue, 29 Jul 2008 10:19:25 -0400
parents c096e2820131
children 8e4d2ebd816a
files kernel_regression.py linear_regression.py make_test_datasets.py
diffstat 3 files changed, 91 insertions(+), 0 deletions(-)
--- a/kernel_regression.py	Tue Jul 29 09:36:09 2008 -0400
+++ b/kernel_regression.py	Tue Jul 29 10:19:25 2008 -0400
@@ -225,3 +225,7 @@
             return ds
         
 
+def kernel_predictor(inputs,params,*otherargs):
+    # Functional wrapper around KernelPredictor: otherargs[0] is expected to
+    # hold the remaining KernelPredictor constructor arguments as a tuple.
+    p = KernelPredictor(params,*otherargs[0])
+    return p.compute_outputs(inputs)
+
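+# Usage sketch (argument names are illustrative; the exact KernelPredictor
+# constructor signature is assumed, not confirmed here):
+#   outputs = kernel_predictor(x, theta, (gamma, train_inputs))
+#   # equivalent to KernelPredictor(theta, gamma, train_inputs).compute_outputs(x)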
--- a/linear_regression.py	Tue Jul 29 09:36:09 2008 -0400
+++ b/linear_regression.py	Tue Jul 29 10:19:25 2008 -0400
@@ -181,6 +181,10 @@
             return ds
         
 
+def linear_predictor(inputs,params,*otherargs):
+    # Functional wrapper around LinearPredictor; extra positional arguments
+    # are accepted but ignored, so it shares kernel_predictor's interface.
+    p = LinearPredictor(params)
+    return p.compute_outputs(inputs)
+
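+# Usage sketch: with this wrapper, LinearPredictor can be passed wherever a
+# plain prediction function f(inputs,params,...) is expected, e.g. as the
+# 'f' argument of make_artificial_datasets_from_function in make_test_datasets.py:
+#   outputs = linear_predictor(x, theta)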
 #TODO : an online version
 class OnlineLinearRegression(OnlineLearningAlgorithm):
     """
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/make_test_datasets.py	Tue Jul 29 10:19:25 2008 -0400
@@ -0,0 +1,83 @@
+"""
+General-purpose code to generate artificial datasets that can be used
+to test different learning algorithms.
+"""
+
+from numpy import arange, hstack, random, vstack, zeros
+
+from pylearn import dataset
+from pylearn.dataset import ArrayDataSet
+from pylearn.linear_regression import linear_predictor
+from pylearn.kernel_regression import kernel_predictor
+# The Polygons generator and save_polygon_data are assumed to come from the
+# shapeset package used with pylearn; adjust the import to your installation.
+from shapeset.dset import Polygons, save_polygon_data
+
+def make_triangles_rectangles_datasets(n_examples=600,train_frac=0.5,image_size=(10,10)):
+    """
+    Make a binary classification dataset to discriminate triangle images from rectangle images.
+    """
+    def convert_dataset(dset):
+        # convert n_vert==3 into target==0 and n_vert==4 into target==1
+        def mapf(images,n_vertices):
+            n=len(n_vertices)
+            targets = zeros((n,1),dtype='float64')
+            for i in xrange(n):
+                targets[i,0] = 0. if n_vertices[i]==3 else 1.
+            return images.reshape(len(images),images[0].size).astype('float64'),targets
+        return dataset.CachedDataSet(dataset.ApplyFunctionDataSet(dset("image","nvert"),mapf,["input","target"]),True)
+
+    p=Polygons(image_size,[3,4],fg_min=1./255,fg_max=1./255,rot_max=1.,
+               scale_min=0.35,scale_max=0.9,pos_min=0.1,pos_max=0.9)
+    data = p.subset[0:n_examples]
+    save_polygon_data(data,"shapes")
+    n_train=int(n_examples*train_frac)
+    trainset=convert_dataset(data.subset[0:n_train])
+    testset=convert_dataset(data.subset[n_train:n_examples])
+    return trainset,testset
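+
+# Usage sketch (left as a comment because it depends on the external
+# Polygons generator being installed):
+#   trainset, testset = make_triangles_rectangles_datasets(n_examples=600,
+#                                                          image_size=(10,10))
+#   # each example maps a flattened 10x10 image to a 0/1 target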
+
+def make_artificial_datasets_from_function(n_inputs=1,
+                                           n_targets=1,
+                                           n_examples=20,
+                                           train_frac=0.5,
+                                           noise_level=0.1, # add Gaussian noise, noise_level=sigma
+                                           params_shape=None,
+                                           f=None, # function computing E[Y|X]
+                                           otherargs=None, # extra args to f
+                                           b=None): # if provided, force theta[0] to this value
+    """
+    Make regression data of the form
+      Y | X ~ Normal(f(X,theta,otherargs),noise_level^2)
+    If n_inputs==1 then X is chosen at regular locations on the [-1,1] interval.
+    Otherwise X is sampled according to a Normal(0,1) on all dimensions (independently).
+    The parameters theta is a matrix of shape params_shape that is sampled from Normal(0,1).
+    Optionally theta[0] is set to the argument 'b', if b is provided.
+
+    Return a training set and a test set, by splitting the generated n_examples
+    according to the 'train_frac'tion.
+    """
+  n_train=int(train_frac*n_examples)
+  n_test=n_examples-n_train
+  if n_inputs==1:
+    delta1=2./n_train
+    delta2=2./n_test
+    inputs = vstack((array(zip(range(n_train)))*delta1-1,
+                     0.5*delta2+array(zip(range(n_test)))*delta2-1))
+  else:
+    inputs = random.normal(size=(n_examples,n_inputs))
+  if not f:
+    f = linear_predictor
+  if f==kernel_predictor and not otherargs[1]:
+    otherargs=(otherargs[0],inputs[0:n_train])
+  if not params_shape:
+    if f==linear_predictor:
+      params_shape = (n_inputs+1,n_targets)
+    elif f==kernel_predictor:
+      params_shape = (otherargs[1].shape[0]+1,n_targets)
+  theta = random.normal(size=params_shape) if params_shape else None
+  if b:
+    theta[0]=b
+  outputs = f(inputs,theta,otherargs)
+  targets = outputs + random.normal(scale=noise_level,size=(n_examples,n_targets))
+# the | stacking creates a strange bug in LookupList constructor:  
+#  trainset = ArrayDataSet(inputs[0:n_examples/2],{'input':slice(0,n_inputs)}) | \
+#             ArrayDataSet(targets[0:n_examples/2],{'target':slice(0,n_targets)}) 
+#  testset = ArrayDataSet(inputs[n_examples/2:],{'input':slice(0,n_inputs)}) | \
+#            ArrayDataSet(targets[n_examples/2:],{'target':slice(0,n_targets)})
+  data = hstack((inputs,targets))
+  trainset = ArrayDataSet(data[0:n_train],
+                          {'input':slice(0,n_inputs),'target':slice(n_inputs,n_inputs+n_targets)})
+  testset = ArrayDataSet(data[n_train:],
+                          {'input':slice(0,n_inputs),'target':slice(n_inputs,n_inputs+n_targets)})
+  return trainset,testset,theta
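+
+if __name__ == '__main__':
+    # Minimal smoke test (an illustrative sketch): build a small
+    # linear-regression problem and check the train/test split sizes.
+    # It assumes ArrayDataSet supports len(), as in pylearn's dataset API.
+    trainset, testset, theta = make_artificial_datasets_from_function(
+        n_inputs=2, n_targets=1, n_examples=40, noise_level=0.05)
+    print len(trainset), len(testset), theta.shape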