changeset 520:82bafb80ba65

merge
author Joseph Turian <turian@iro.umontreal.ca>
date Fri, 14 Nov 2008 02:09:23 -0500
parents b267a8000f92 (current diff) 4aa7f74ea93f (diff)
children 2bef0768bc27
files
diffstat 3 files changed, 218 insertions(+), 0 deletions(-)
--- a/datasets/__init__.py	Fri Nov 14 02:07:20 2008 -0500
+++ b/datasets/__init__.py	Fri Nov 14 02:09:23 2008 -0500
@@ -0,0 +1,1 @@
+from dataset import dataset, Dataset
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/datasets/dataset.py	Fri Nov 14 02:09:23 2008 -0500
@@ -0,0 +1,118 @@
+"""The dataset-from-descriptor mechanism."""
+
+_factory = {}
+
+def add_dataset_factory(tok0, fn):
+    """Add `fn` as the handler for descriptors whose first token is `tok0`.
+
+    :returns: None
+
+    """
+    if tok0 in _factory:
+        raise Exception('Identifier already in use:', tok0)
+    else:
+        _factory[tok0] = fn
+
+def dataset_factory(tok0):
+    """Register a function as the handler for a given kind of dataset, identified by `tok0`.
+
+    When someone calls dataset('kind_of_dataset option1 option2, etc.', approx=1),
+    then the handler registered for 'kind_of_dataset' will be called with the same
+    arguments as dataset.
+
+    .. code-block:: python
+        
+        @dataset_factory('MNIST')
+        def mnist_related_dataset(descr, **kwargs):
+            ...
+
+    :returns: `decorator`
+    """
+    def decorator(fn):
+        add_dataset_factory(tok0, fn)
+        return fn
+    return decorator
+
+def dataset(descr, **kwargs):
+    """Return the dataset described by `descr`.
+
+    :param descr: a dataset identifier
+    :type descr: str
+    :returns: `Dataset`
+
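+    For example (an illustrative sketch, assuming a handler has been registered
+    for the 'MNIST' token with `dataset_factory` as in the example above):
+
+    .. code-block:: python
+
+        mnist = dataset('MNIST 1k')   # dispatches on the leading token, 'MNIST'
+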
+    """
+    tok0 = descr.split()[0]
+    fn = _factory[tok0]
+    return fn(descr, **kwargs)
+
+
+class Dataset(object):
+    """Dataset is a generic container for pylearn datasets.
+
+    It is not intended to put any restriction whatsoever on its contents.
+
+    It is intended to encourage certain conventions, described below.  Conventions should arise
+    naturally among datasets in PyLearn.  When a few datasets adhere to a new convention, then
+    describe it here and make it more official.
+
+    If no particular convention applies, create your own object to store the dataset and
+    assign it to the `data` attribute.
+    """
+    data = None
+
+    """
+    SIMPLE REGRESSION / CLASSIFICATION
+    ----------------------------------
+
+    In this setting, you are aiming to do vector classification or vector regression
+    where your train, valid and test sets fit in memory.
+    The convention is to put your data into numpy ndarray instances.  Put training data in the
+    `train` attribute, validation data in the `valid` attribute, and test data in the
+    `test` attribute.
+    Each of those attributes should be an instance that defines at least two attributes: `x` for the
+    input matrix and `y` for the target matrix.  The `x` ndarray should be one example per
+    leading index (row for matrices).
+    The `y` ndarray should be one target per leading index (entry for vectors, row for matrices).
+    If `y` is a classification target, then it should be a vector with numpy dtype 'int32'.
+    
+    If there are weights associated with different examples, then create a 'weights' attribute whose
+    value is a vector with one floating-point value (typically double-precision) per example.
+
+    If the task is classification, then the classes should be mapped to the integers
+    0,1,...,N-1.
+    The number of classes (here, N) should be stored in the `n_classes` attribute.
+
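+    For example, an illustrative sketch of the convention (the small `_XY`
+    container class is just a placeholder; any object exposing `x` and `y`
+    attributes will do):
+
+    .. code-block:: python
+
+        import numpy
+
+        class _XY(object):
+            def __init__(self, x, y):
+                self.x, self.y = x, y
+
+        d = Dataset()
+        d.train = _XY(numpy.random.randn(100, 5),
+                      numpy.random.randint(0, 2, 100).astype('int32'))
+        d.valid = _XY(numpy.random.randn(20, 5),
+                      numpy.random.randint(0, 2, 20).astype('int32'))
+        d.test  = _XY(numpy.random.randn(20, 5),
+                      numpy.random.randint(0, 2, 20).astype('int32'))
+        d.n_classes = 2
+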
+    """
+    train = None #instance with .x, .y
+
+    valid = None #instance with .x, .y
+
+    test = None #instance with .x, .y
+
+    n_classes = None  #int
+
+    """
+    WHEN INPUTS ARE FIXED-SIZE GREYSCALE IMAGES
+    -------------------------------------------
+
+    In this setting we typically encode images as vectors, by enumerating the pixel values in
+    left-to-right, top-to-bottom order.  Pixel values should be in floating-point, and
+    normalized between 0 and 1.
+
+    The shape of the images should be recorded in the `img_shape` attribute as a tuple (rows,
+    cols).
+
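+    For example (illustrative; a dataset of 28x28 greyscale images such as MNIST):
+
+    .. code-block:: python
+
+        d = Dataset()
+        d.img_shape = (28, 28)   # rows, cols
+        # each row of d.train.x would then hold 28 * 28 = 784 pixel values,
+        # stored as floats in the range [0, 1]
+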
+    """
+
+    img_shape = None # (rows, cols)
+
+
+    """
+    TIMESERIES
+    ----------
+
+    When dealing with examples which are themselves timeseries, put each example timeseries in a
+    tensor and make a list of them.  Generally use tensors, and resort to lists or arrays
+    only where the examples cannot be stacked into a single tensor (for example, because
+    different examples have different lengths).
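+    For example (an illustrative sketch; three series of different lengths kept in a
+    plain list because they cannot be stacked into one tensor):
+
+    .. code-block:: python
+
+        import numpy
+
+        # one (length, features) tensor per example
+        examples = [numpy.zeros((t, 3)) for t in (50, 80, 30)]
+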
+    """
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/external/wrap_libsvm.py	Fri Nov 14 02:09:23 2008 -0500
@@ -0,0 +1,99 @@
+"""Run an experiment using libsvm.
+"""
+import numpy
+from ..datasets import dataset
+
+# libsvm currently has no python installation instructions/convention.
+#
+# This module uses a specific convention for libsvm's installation.
+# I base this on installing libsvm-2.88.
+# To install libsvm's python module, do three things:
+# 1. Build libsvm (run make in both the root dir and the python subdir).
+# 2. touch a '__init__.py' file in the python subdir
+# 3. add a symbolic link to a PYTHONPATH location that looks like this:
+#    libsvm -> <your root path>/libsvm-2.88/python/
+#
+# That is the sort of layout this module expects when it imports libsvm's
+# `svm` module below.
+
+from libsvm import svm
+
+def score_01(x, y, model):
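+    """Return the 0/1 error rate (fraction of misclassified examples) of `model` on (x, y)."""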
+    assert len(x) == len(y)
+    size = len(x)
+    errors = 0
+    for i in range(size):
+        prediction = model.predict(x[i])
+        #probability = model.predict_probability
+        if (y[i] != prediction):
+            errors = errors + 1
+    return float(errors)/size
+
+#this is the dbdict experiment interface... if you happen to use dbdict
+class State(object):
+    #TODO: parametrize to get all the kernel types, not hardcode for RBF
+    dataset = 'MNIST_1k'
+    C = 10.0
+    kernel = 'RBF'
+    # rel_gamma is related to the procedure Jerome used. He mentioned why in
+    # quadratic_neurons/neuropaper/draft3.pdf.
+    rel_gamma = 1.0   
+
+    def __init__(self, **kwargs):
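+        # cast each keyword value to the type of the corresponding class-level
+        # default, e.g. C='100' becomes the float 100.0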
+        for k, v in kwargs.items():
+            setattr(self, k, type(getattr(self, k))(v))
+
+
+def dbdict_run_svm_experiment(state, channel=lambda *args, **kwargs:None):
+    """Parameters are described in state, and returned in state.
+
+    :param state: object instance to store parameters and return values
+    :param channel: not used
+
+    :returns: None
+
+    This is the kind of function that dbdict-run can use.
+
+    """
+    ((train_x, train_y), (valid_x, valid_y), (test_x, test_y)) = dataset(state.dataset)
+
+    #libsvm needs stuff in int32 on a 32bit machine
+    #TODO: test this on a 64bit machine
+    train_y = numpy.asarray(train_y, dtype='int32')
+    valid_y = numpy.asarray(valid_y, dtype='int32')
+    test_y = numpy.asarray(test_y, dtype='int32')
+    problem = svm.svm_problem(train_y, train_x)
+
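+    # Heuristic kernel width: gamma0 is 0.5 divided by the sum of the
+    # per-feature variances of the training inputs; state.rel_gamma then
+    # scales the RBF gamma relative to that baseline.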
+    gamma0 = 0.5 / numpy.sum(numpy.var(train_x, axis=0))
+
+    param = svm.svm_parameter(C=state.C,
+            kernel_type=getattr(svm, state.kernel),
+            gamma=state.rel_gamma * gamma0)
+
+    model = svm.svm_model(problem, param) #this is the expensive part
+
+    state.train_01 = score_01(train_x, train_y, model)
+    state.valid_01 = score_01(valid_x, valid_y, model)
+    state.test_01 = score_01(test_x, test_y, model)
+
+    state.n_train = len(train_y)
+    state.n_valid = len(valid_y)
+    state.n_test = len(test_y)
+
+def run_svm_experiment(**kwargs):
+    """Python-friendly interface to dbdict_run_svm_experiment
+
+    Parameters are used to construct a `State` instance, which is returned after running
+    `dbdict_run_svm_experiment` on it.
+
+    .. code-block:: python
+
+        results = run_svm_experiment(dataset='MNIST_1k', C=100.0, rel_gamma=0.01)
+        print results.n_train
+        # 1000
+        print results.valid_01, results.test_01
+        # 0.14, 0.10  #.. or something...
+
+    """
+    state = State(**kwargs)
+    dbdict_run_svm_experiment(state)
+    return state
+