# HG changeset patch
# User Joseph Turian
# Date 1226646563 18000
# Node ID 82bafb80ba653b1270803579a65ed996cff2d9a0
# Parent  b267a8000f92388cd1bbc8471bbdde53f6f751e7
# Parent  4aa7f74ea93fb404122c8889b11045244939b80e
merge

diff -r b267a8000f92 -r 82bafb80ba65 datasets/__init__.py
--- a/datasets/__init__.py	Fri Nov 14 02:07:20 2008 -0500
+++ b/datasets/__init__.py	Fri Nov 14 02:09:23 2008 -0500
@@ -0,0 +1,1 @@
+from dataset import dataset, Dataset
diff -r b267a8000f92 -r 82bafb80ba65 datasets/dataset.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/datasets/dataset.py	Fri Nov 14 02:09:23 2008 -0500
@@ -0,0 +1,118 @@
+"""The dataset-from-descriptor mechanism."""
+
+_factory = {}
+
+def add_dataset_factory(tok0, fn):
+    """Add `fn` as the handler for descriptors whose first token is `tok0`.
+
+    :returns: None
+
+    """
+    if tok0 in _factory:
+        raise Exception('Identifier already in use:', tok0)
+    else:
+        _factory[tok0] = fn
+
+def dataset_factory(tok0):
+    """Register a function as the handler for a given kind of dataset, identified by `tok0`.
+
+    When someone calls dataset('kind_of_dataset option1 option2', approx=1), the handler
+    registered for 'kind_of_dataset' will be called with the same arguments as `dataset`.
+
+    .. code-block:: python
+
+        @dataset_factory('MNIST')
+        def mnist_related_dataset(descr, **kwargs):
+            ...
+
+    :returns: `decorator`
+    """
+    def decorator(fn):
+        add_dataset_factory(tok0, fn)
+        return fn
+    return decorator
+
+def dataset(descr, **kwargs):
+    """Return the dataset described by `descr`.
+
+    :param descr: a dataset identifier
+    :type descr: str
+    :returns: `Dataset`
+
+    """
+    tok0 = descr.split()[0]
+    fn = _factory[tok0]
+    return fn(descr, **kwargs)
+
+
+class Dataset(object):
+    """Dataset is a generic container for pylearn datasets.
+
+    It is not intended to put any restriction whatsoever on its contents.
+
+    It is intended to encourage certain conventions, described below.  Conventions should arise
+    naturally among datasets in PyLearn.  When a few datasets adhere to a new convention,
+    describe it here and make it more official.
+
+    If no particular convention applies, create your own object to store the dataset, and
+    assign it to the `data` attribute.
+    """
+    data = None
+
+    """
+    SIMPLE REGRESSION / CLASSIFICATION
+    ----------------------------------
+
+    In this setting, you are aiming to do vector classification or vector regression
+    where your train, valid and test sets fit in memory.
+    The convention is to put your data into numpy ndarray instances.  Put training data in the
+    `train` attribute, validation data in the `valid` attribute and test data in the `test`
+    attribute.
+    Each of those attributes should be an instance that defines at least two attributes: `x` for
+    the input matrix and `y` for the target matrix.  The `x` ndarray should be one example per
+    leading index (row for matrices).
+    The `y` ndarray should be one target per leading index (entry for vectors, row for matrices).
+    If `y` is a classification target, then it should be a vector with numpy dtype 'int32'.
+
+    If there are weights associated with different examples, then create a 'weights' attribute
+    whose value is a vector with one floating-point value (typically double-precision) per
+    example.
+
+    If the task is classification, then the classes should be mapped to the integers
+    0, 1, ..., N-1.
+    The number of classes (here, N) should be stored in the `n_classes` attribute.
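+
+    For example (an illustrative sketch; `_Split` is just a hypothetical container, not part of
+    pylearn), a small classification dataset following this convention could be set up as:
+
+    .. code-block:: python
+
+        import numpy
+
+        class _Split(object):
+            def __init__(self, x, y):
+                self.x = x  # input matrix, one example per row
+                self.y = y  # int32 class labels, one per row of x
+
+        d = Dataset()
+        d.train = _Split(numpy.random.rand(100, 5),
+                         numpy.random.randint(0, 2, 100).astype('int32'))
+        d.valid = _Split(numpy.random.rand(20, 5),
+                         numpy.random.randint(0, 2, 20).astype('int32'))
+        d.test  = _Split(numpy.random.rand(20, 5),
+                         numpy.random.randint(0, 2, 20).astype('int32'))
+        d.n_classes = 2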
+ + """ + train = None #instance with .x, .y + + valid = None #instance with .x, .y + + test = None #instance with .x, .y + + n_classes = None #int + + """ + WHEN INPUTS ARE FIXED-SIZE GREYSCALE IMAGES + ------------------------------------------- + + In this setting we typically encode images as vectors, by enumerating the pixel values in + left-to-right, top-to-bottom order. Pixel values should be in floating-point, and + normalized between 0 and 1. + + The shape of the images should be recorded in the `img_shape` attribute as a tuple (rows, + cols). + + """ + + img_shape = None # (rows, cols) + + + """ + TIMESERIES + ---------- + + When dealing with examples which are themselves timeseries, put each example timeseries in a + tensor and make a list of them. Generally use tensors, and resort to lists or arrays + wherever different + """ + diff -r b267a8000f92 -r 82bafb80ba65 external/wrap_libsvm.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/external/wrap_libsvm.py Fri Nov 14 02:09:23 2008 -0500 @@ -0,0 +1,99 @@ +"""Run an experiment using libsvm. +""" +import numpy +from ..datasets import dataset_from_descr + +# libsvm currently has no python installation instructions/convention. +# +# This module uses a specific convention for libsvm's installation. +# I base this on installing libsvm-2.88. +# To install libsvm's python module, do three things: +# 1. Build libsvm (run make in both the root dir and the python subdir). +# 2. touch a '__init__.py' file in the python subdir +# 3. add a symbolic link to a PYTHONPATH location that looks like this: +# libsvm -> /libsvm-2.88/python/ +# +# That is the sort of thing that this module expects from 'import libsvm' + +import libsvm + +def score_01(x, y, model): + assert len(x) == len(y) + size = len(x) + errors = 0 + for i in range(size): + prediction = model.predict(x[i]) + #probability = model.predict_probability + if (y[i] != prediction): + errors = errors + 1 + return float(errors)/size + +#this is the dbdict experiment interface... if you happen to use dbdict +class State(object): + #TODO: parametrize to get all the kernel types, not hardcode for RBF + dataset = 'MNIST_1k' + C = 10.0 + kernel = 'RBF' + # rel_gamma is related to the procedure Jerome used. He mentioned why in + # quadratic_neurons/neuropaper/draft3.pdf. + rel_gamma = 1.0 + + def __init__(self, **kwargs): + for k, v in kwargs: + setattr(self, k, type(getattr(self, k))(v)) + + +def dbdict_run_svm_experiment(state, channel=lambda *args, **kwargs:None): + """Parameters are described in state, and returned in state. + + :param state: object instance to store parameters and return values + :param channel: not used + + :returns: None + + This is the kind of function that dbdict-run can use. 
+ + """ + ((train_x, train_y), (valid_x, valid_y), (test_x, test_y)) = dataset_from_descr(state.dataset) + + #libsvm needs stuff in int32 on a 32bit machine + #TODO: test this on a 64bit machine + train_y = numpy.asarray(train_y, dtype='int32') + valid_y = numpy.asarray(valid_y, dtype='int32') + test_y = numpy.asarray(test_y, dtype='int32') + problem = svm.svm_problem(train_y, train_x); + + gamma0 = 0.5 / numpy.sum(numpy.var(train_x, axis=0)) + + param = svm.svm_parameter(C=state.C, + kernel_type=getattr(svm, state.kernel), + gamma=state.rel_gamma * gamma0) + + model = svm.svm_model(problem, param) #this is the expensive part + + state.train_01 = score_01(train_x, train_y, model) + state.valid_01 = score_01(valid_x, valid_y, model) + state.test_01 = score_01(test_x, test_y, model) + + state.n_train = len(train_y) + state.n_valid = len(valid_y) + state.n_test = len(test_y) + +def run_svm_experiment(**kwargs): + """Python-friendly interface to dbdict_run_svm_experiment + + Parameters are used to construct a `State` instance, which is returned after running + `dbdict_run_svm_experiment` on it. + + .. code-block:: python + results = run_svm_experiment(dataset='MNIST_1k', C=100.0, rel_gamma=0.01) + print results.n_train + # 1000 + print results.valid_01, results.test_01 + # 0.14, 0.10 #.. or something... + + """ + state = State(**kwargs) + state_run_svm_experiment(state) + return state +