diff datasets/dataset.py @ 518:4aa7f74ea93f

init dataset
author James Bergstra <bergstrj@iro.umontreal.ca>
date Wed, 12 Nov 2008 12:36:09 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/datasets/dataset.py	Wed Nov 12 12:36:09 2008 -0500
@@ -0,0 +1,118 @@
+"""The dataset-from-descriptor mechanism."""
+
+_factory = {}
+
+def add_dataset_factory(tok0, fn):
+    """Add `fn` as the handler for descriptors whose first token is `tok0`.
+
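+    For example, registering a handler directly (the 'my_data' token and `my_handler`
+    function here are illustrative only):
+
+    .. code-block:: python
+
+        def my_handler(descr, **kwargs):
+            # build and return a Dataset for descriptors starting with 'my_data'
+            return Dataset()
+
+        add_dataset_factory('my_data', my_handler)
+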
+    :returns: None
+
+    """
+    if tok0 in _factory:
+        raise Exception('Identifier already in use:', tok0)
+    else:
+        _factory[tok0] = fn
+
+def dataset_factory(tok0):
+    """Register a function as the handler for a given kind of dataset, identified by `tok0`.
+
+    When someone calls dataset('kind_of_dataset option1 option2 ...', approx=1), the handler
+    registered for 'kind_of_dataset' will be called with the same arguments that were passed
+    to dataset().
+
+    .. code-block:: python
+        
+        @dataset_factory('MNIST')
+        def mnist_related_dataset(descr, **kwargs):
+            ...
+
+    :returns: `decorator`
+    """
+    def decorator(fn):
+        add_dataset_factory(tok0, fn)
+        return fn
+    return decorator
+
+def dataset(descr, **kwargs):
+    """Return the dataset described by `descr`.
+
+    :param descr: a dataset descriptor string; the first whitespace-delimited token selects
+        the registered handler
+    :type descr: str
+    :returns: `Dataset`
+
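+    For example, assuming a handler has been registered for the token 'MNIST' (none is
+    registered by this module itself; the descriptor below is illustrative):
+
+    .. code-block:: python
+
+        @dataset_factory('MNIST')
+        def mnist_related_dataset(descr, **kwargs):
+            return Dataset()   # build the dataset described by `descr` here
+
+        d = dataset('MNIST 1k', approx=1)   # dispatches on the first token, 'MNIST'
+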
+    """
+    tok0 = descr.split()[0]
+    fn = _factory[tok0]
+    return fn(descr, **kwargs)
+
+
+class Dataset(object):
+    """Dataset is a generic container for pylearn datasets.
+
+    It is not intended to put any restriction whatsoever on its contents.
+
+    It is intended to encourage certain conventions, described below.  Conventions should arise
+    naturally among datasets in PyLearn.  When a few datasets adhere to a new convention,
+    describe it here to make it more official.
+
+    If no particular convention applies, create your own object to store the dataset and
+    assign it to the `data` attribute.
+    """
+    data = None
+
+    """
+    SIMPLE REGRESSION / CLASSIFICATION
+    ----------------------------------
+
+    In this setting, you are aiming to do vector classification or vector regression
+    where your train, valid and test sets fit in memory.
+    The convention is to put your data into numpy ndarray instances.  Put training data in the
+    `train` attribute, validation data in the `valid` attribute, and test data in the `test`
+    attribute.
+    Each of those attributes should be an instance that defines at least two attributes: `x` for the
+    input matrix and `y` for the target matrix.  The `x` ndarray should be one example per
+    leading index (row for matrices).
+    The `y` ndarray should be one target per leading index (entry for vectors, row for matrices).
+    If `y` is a classification target, then it should be a vector with numpy dtype 'int32'.
+    
+    If there are weights associated with different examples, then create a 'weights' attribute whose
+    value is a vector with one floating-point value (typically double-precision) per example.
+
+    If the task is classification, then the classes should be mapped to the integers
+    0,1,...,N-1.
+    The number of classes (here, N) should be stored in the `n_classes` attribute.
+
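+    A minimal sketch of this convention (the `_Split` container and the shapes below are
+    illustrative, not part of any pylearn API):
+
+    .. code-block:: python
+
+        import numpy
+
+        class _Split(object):
+            def __init__(self, x, y):
+                self.x = x   # one example per row
+                self.y = y   # one target per entry
+
+        rng = numpy.random.RandomState(23)
+        d = Dataset()
+        d.train = _Split(rng.rand(100, 5), rng.randint(0, 3, size=100).astype('int32'))
+        d.valid = _Split(rng.rand(20, 5),  rng.randint(0, 3, size=20).astype('int32'))
+        d.test  = _Split(rng.rand(20, 5),  rng.randint(0, 3, size=20).astype('int32'))
+        d.n_classes = 3
+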
+    """
+    train = None #instance with .x, .y
+
+    valid = None #instance with .x, .y
+
+    test = None #instance with .x, .y
+
+    n_classes = None  #int
+
+    """
+    WHEN INPUTS ARE FIXED-SIZE GREYSCALE IMAGES
+    -------------------------------------------
+
+    In this setting we typically encode images as vectors by enumerating the pixel values in
+    left-to-right, top-to-bottom order.  Pixel values should be floating-point, normalized to
+    the range [0, 1].
+
+    The shape of the images should be recorded in the `img_shape` attribute as a tuple (rows,
+    cols).
+
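+    For example (the image count, size, and values here are illustrative):
+
+    .. code-block:: python
+
+        import numpy
+
+        rows, cols = 28, 28
+        imgs = numpy.random.rand(50, rows, cols)   # pixel values already in [0, 1]
+
+        # one flattened image per row; an array like this would become e.g. `train.x`
+        x = imgs.reshape(50, rows * cols)
+
+        d = Dataset()
+        d.img_shape = (rows, cols)
+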
+    """
+
+    img_shape = None # (rows, cols)
+
+
+    """
+    TIMESERIES
+    ----------
+
+    When examples are themselves timeseries, put each example timeseries in a tensor and make
+    a list of them.  Generally use tensors, and resort to lists or arrays only where different
+    examples have different shapes (e.g. timeseries of different lengths) and so cannot be
+    stacked into a single tensor.
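+
+    For example (stored here on the generic `data` attribute; the lengths and feature count
+    are illustrative):
+
+    .. code-block:: python
+
+        import numpy
+
+        d = Dataset()
+        # three example timeseries of different lengths, each with 4 features per timestep
+        d.data = [numpy.random.rand(n_steps, 4) for n_steps in (10, 25, 17)]
+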
+    """
+