"""The dataset-from-descriptor mechanism."""

_factory = {}

def add_dataset_factory(tok0, fn):
    """Add `fn` as the handler for descriptors whose first token is `tok0`.

    :returns: None
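
    For example (illustrative; `my_handler` stands for any function with the
    same signature as `dataset`):

    .. code-block:: python

        add_dataset_factory('mydata', my_handler)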

    """
    if tok0 in _factory:
        raise ValueError('Identifier already in use: %r' % tok0)
    _factory[tok0] = fn

def dataset_factory(tok0):
    """Register a function as the handler for a given kind of dataset, identified by `tok0`.

    When someone calls `dataset('kind_of_dataset option1 option2', approx=1)`,
    the handler registered for 'kind_of_dataset' will be called with the same
    arguments as `dataset`.

    .. code-block:: python
        
        @dataset_factory('MNIST')
        def mnist_related_dataset(descr, **kwargs):
            ...

    :returns: `decorator`
    """
    def decorator(fn):
        add_dataset_factory(tok0, fn)
        return fn
    return decorator

def dataset(descr, **kwargs):
    """Return the dataset described by `descr`.

    :param descr: a dataset descriptor; the first whitespace-separated token
        selects the registered handler
    :type descr: str
    :returns: `Dataset`
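
    For example, assuming a handler has been registered for the 'MNIST' token
    (via `dataset_factory`; the descriptor string below is illustrative):

    .. code-block:: python

        mnist = dataset('MNIST 1k')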

    """
    tok0 = descr.split()[0]
    try:
        fn = _factory[tok0]
    except KeyError:
        raise ValueError('No handler registered for descriptor token: %r' % tok0)
    return fn(descr, **kwargs)


class Dataset(object):
    """Dataset is a generic container for pylearn datasets.

    It is not intended to put any restriction whatsoever on its contents.

    It is intended to encourage certain conventions, described below.  Conventions should arise
    naturally among datasets in PyLearn.  Once a few datasets adhere to a new convention,
    describe it here to make it more official.

    If no particular convention applies, create your own object to store the dataset and
    assign it to the `data` attribute.
    """
    data = None

    """
    SIMPLE REGRESSION / CLASSIFICATION
    ----------------------------------

    In this setting, you are aiming to do vector classification or vector regression
    where your train, valid and test sets fit in memory.
    The convention is to put your data into numpy ndarray instances.  Put training data in the
    `train` attribute, validation data in the `valid` attribute, and test data in the `test`
    attribute.
    Each of those attributes should be an instance that defines at least two attributes: `x` for
    the input matrix and `y` for the target matrix.  The `x` ndarray should be one example per
    leading index (row for matrices).
    The `y` ndarray should be one target per leading index (entry for vectors, row for matrices).
    If `y` is a classification target, then it should be a vector with numpy dtype 'int32'.

    If there are weights associated with different examples, then create a `weights` attribute
    whose value is a vector with one floating-point value (typically double-precision) per
    example.

    If the task is classification, then the classes should be mapped to the integers
    0,1,...,N-1.
    The number of classes (here, N) should be stored in the `n_classes` attribute.
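
    For example (a minimal sketch; the `_Split` container and the random data
    below are illustrative, not part of pylearn):

    .. code-block:: python

        import numpy

        class _Split(object):
            # container with the conventional .x and .y attributes
            def __init__(self, x, y):
                self.x = x
                self.y = y

        d = Dataset()
        d.train = _Split(
                numpy.random.rand(100, 5),                        # 100 examples, 5 inputs
                numpy.random.randint(0, 3, 100).astype('int32'))  # class targets in {0,1,2}
        d.n_classes = 3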

    """
    train = None  # instance with .x, .y

    valid = None  # instance with .x, .y

    test = None  # instance with .x, .y

    n_classes = None  # int

    """
    WHEN INPUTS ARE FIXED-SIZE GREYSCALE IMAGES
    -------------------------------------------

    In this setting we typically encode images as vectors, by enumerating the pixel values in
    left-to-right, top-to-bottom order.  Pixel values should be floating-point, normalized
    to the range [0, 1].

    The shape of the images should be recorded in the `img_shape` attribute as a tuple (rows,
    cols).
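
    For example (illustrative; assumes a `Dataset` instance `d` that follows the
    conventions above):

    .. code-block:: python

        x = d.train.x                     # one image per row
        img = x[0].reshape(d.img_shape)   # recover the first example as a 2-D image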

    """

    img_shape = None # (rows, cols)


    """
    TIMESERIES
    ----------

    When dealing with examples which are themselves timeseries, put each example timeseries in a
    tensor and make a list of them.  Generally prefer tensors, and resort to lists or arrays
    wherever different examples have different lengths.
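
    For example (illustrative only; numpy ndarrays standing in for tensors):

    .. code-block:: python

        import numpy

        d = Dataset()
        # three univariate series of different lengths, one 1-D ndarray each
        d.data = [numpy.zeros(10), numpy.zeros(17), numpy.zeros(4)]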
    """