comparison datasets/dataset.py @ 518:4aa7f74ea93f

init dataset
author James Bergstra <bergstrj@iro.umontreal.ca>
date Wed, 12 Nov 2008 12:36:09 -0500
parents
children
comparison
equal deleted inserted replaced
517:716c04512dbe 518:4aa7f74ea93f
1 """The dataset-from-descriptor mechanism."""
2
3 _factory = {}
4
5 def add_dataset_factory(tok0, fn):
6 """Add `fn` as the handler for descriptors whose first token is `tok0`.
7
8 :returns: None
9
10 """
11 if tok0 in _factory:
12 raise Exception('Identifier already in use:', tok0)
13 else:
14 _factory[tok0] = fn
15
def dataset_factory(tok0):
    """Register a function as the handler for a given kind of dataset, identified by `tok0`.

    When someone calls dataset('kind_of_dataset option1 option2, etc.', approx=1),
    then the handler registered for 'kind_of_dataset' will be called with the same arguments as
    dataset.

    .. code-block:: python

        @dataset_factory('MNIST')
        def mnist_related_dataset(descr, **kwargs):
            ...

    :param tok0: identifier token the handler is registered under
    :type tok0: str
    :returns: `decorator`
    """
    def decorator(fn):
        # Register fn, then hand it back unchanged so the decorated function
        # can still be called directly.
        add_dataset_factory(tok0, fn)
        return fn
    return decorator
35
def dataset(descr, **kwargs):
    """Return the dataset described by `descr`.

    The first whitespace-delimited token of `descr` selects the handler
    registered via `add_dataset_factory`; that handler is called with the
    full descriptor and all keyword arguments.

    :param descr: a dataset identifier
    :type descr: str
    :returns: `Dataset`
    :raises ValueError: if `descr` contains no tokens
    :raises KeyError: if no handler is registered for the first token

    """
    tokens = descr.split()
    if not tokens:
        # An empty descriptor would otherwise surface as a cryptic IndexError.
        raise ValueError('empty dataset descriptor: %r' % (descr,))
    tok0 = tokens[0]
    try:
        fn = _factory[tok0]
    except KeyError:
        raise KeyError('no dataset factory registered for: %s' % (tok0,))
    return fn(descr, **kwargs)
47
48
class Dataset(object):
    """Dataset is a generic container for pylearn datasets.

    It is not intended to put any restriction whatsoever on its contents.

    It is intended to encourage certain conventions, described below. Conventions should arise
    naturally among datasets in PyLearn. When a few datasets adhere to a new convention, then
    describe it here and make it more official.

    If no particular convention applies. Create your own object to store the dataset, and
    assign it to the `data` attribute.
    """
    # Catch-all slot for datasets that fit none of the conventions below.
    data = None

    """
    SIMPLE REGRESSION / CLASSIFICATION
    ----------------------------------

    In this setting, you are aiming to do vector classification or vector regression
    where your train, valid and test sets fit in memory.
    The convention is to put your data into numpy ndarray instances. Put training data in the
    `train` attribute, validation data in the `valid` attribute and test data in the `test
    attribute`.
    Each of those attributes should be an instance that defines at least two attributes: `x` for the
    input matrix and `y` for the target matrix. The `x` ndarray should be one example per
    leading index (row for matrices).
    The `y` ndarray should be one target per leading index (entry for vectors, row for matrices).
    If `y` is a classification target, than it should be a vector with numpy dtype 'int32'.

    If there are weights associated with different examples, then create a 'weights' attribute whose
    value is a vector with one floating-point value (typically double-precision) per example.

    If the task is classification, then the classes should be mapped to the integers
    0,1,...,N-1.
    The number of classes (here, N) should be stored in the `n_classes` attribute.

    """
    # Conventional data splits for the in-memory regression/classification
    # setting described above.
    train = None #instance with .x, .y

    valid = None #instance with .x, .y

    test = None #instance with .x, .y

    n_classes = None #int

    """
    WHEN INPUTS ARE FIXED-SIZE GREYSCALE IMAGES
    -------------------------------------------

    In this setting we typically encode images as vectors, by enumerating the pixel values in
    left-to-right, top-to-bottom order. Pixel values should be in floating-point, and
    normalized between 0 and 1.

    The shape of the images should be recorded in the `img_shape` attribute as a tuple (rows,
    cols).

    """

    img_shape = None # (rows, cols)


    """
    TIMESERIES
    ----------

    When dealing with examples which are themselves timeseries, put each example timeseries in a
    tensor and make a list of them. Generally use tensors, and resort to lists or arrays
    wherever different
    """
    # NOTE(review): the TIMESERIES note above appears truncated mid-sentence
    # in the original source ("wherever different") — confirm intended text
    # against upstream history.
118