# HG changeset patch
# User James Bergstra
# Date 1226511369 18000
# Node ID 4aa7f74ea93fb404122c8889b11045244939b80e
# Parent 716c04512dbe9787272a437358507a928dcbb210
# init dataset
#
# diff -r 716c04512dbe -r 4aa7f74ea93f datasets/__init__.py
# --- a/datasets/__init__.py Wed Nov 12 10:54:38 2008 -0500
# +++ b/datasets/__init__.py Wed Nov 12 12:36:09 2008 -0500
# @@ -0,0 +1,1 @@
# +from dataset import dataset, Dataset
#
# diff -r 716c04512dbe -r 4aa7f74ea93f datasets/dataset.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/datasets/dataset.py Wed Nov 12 12:36:09 2008 -0500
# @@ -0,0 +1,118 @@
#
# NOTE(review): the text above is the changeset metadata this module was
# recovered from.  The body below is the content of datasets/dataset.py
# with its line structure restored; the package __init__ hunk is kept in
# the comment above because it belongs to a different file.
"""The dataset-from-descriptor mechanism."""

# Registry mapping the first token of a descriptor to its handler function.
_factory = {}


def add_dataset_factory(tok0, fn):
    """Add `fn` as the handler for descriptors whose first token is `tok0`.

    :param tok0: identifier that selects this handler
    :param fn: callable invoked by `dataset` as ``fn(descr, **kwargs)``
    :returns: None
    :raises Exception: if `tok0` already has a registered handler

    """
    if tok0 in _factory:
        # Refuse to overwrite silently: two handlers for one identifier
        # is almost certainly a naming clash between modules.
        raise Exception('Identifier already in use:', tok0)
    _factory[tok0] = fn


def dataset_factory(tok0):
    """Register a function as the handler for a given kind of dataset, identified by `tok0`.

    When someone calls dataset('kind_of_dataset option1 option2, etc.', approx=1),
    then the handler registered for 'kind_of_dataset' will be called with the same
    arguments as `dataset`.

    .. code-block:: python

        @dataset_factory('MNIST')
        def mnist_related_dataset(descr, **kwargs):
            ...

    :param tok0: identifier that selects the decorated handler
    :returns: `decorator`, which registers its argument and returns it unchanged
    :raises Exception: (when the decorator is applied) if `tok0` is already in use
    """
    def decorator(fn):
        add_dataset_factory(tok0, fn)
        return fn
    return decorator


def dataset(descr, **kwargs):
    """Return the dataset described by `descr`.

    The first whitespace-delimited token of `descr` selects the handler; the
    handler then receives the complete, unmodified `descr` and all keyword
    arguments.

    :param descr: a dataset identifier
    :type descr: str
    :returns: `Dataset` (whatever the registered handler returns)
    :raises IndexError: if `descr` contains no tokens (empty or only whitespace)
    :raises KeyError: if no handler is registered for the first token

    """
    tokens = descr.split()
    if not tokens:
        # Same exception type that indexing an empty token list would raise,
        # but with a message that says what is actually wrong.
        raise IndexError('empty dataset descriptor: %r' % (descr,))
    tok0 = tokens[0]
    try:
        fn = _factory[tok0]
    except KeyError:
        known = ', '.join(sorted(repr(key) for key in _factory)) or 'none'
        raise KeyError('no dataset factory registered for %r (registered: %s)'
                       % (tok0, known))
    return fn(descr, **kwargs)


class Dataset(object):
    """Dataset is a generic container for pylearn datasets.

    It is not intended to put any restriction whatsoever on its contents.

    It is intended to encourage certain conventions, described below.  Conventions should arise
    naturally among datasets in PyLearn.  When a few datasets adhere to a new convention, then
    describe it here and make it more official.

    If no particular convention applies, create your own object to store the dataset, and
    assign it to the `data` attribute.

    SIMPLE REGRESSION / CLASSIFICATION
    ----------------------------------

    In this setting, you are aiming to do vector classification or vector regression
    where your train, valid and test sets fit in memory.
    The convention is to put your data into numpy ndarray instances.  Put training data in the
    `train` attribute, validation data in the `valid` attribute and test data in the `test`
    attribute.
    Each of those attributes should be an instance that defines at least two attributes: `x` for the
    input matrix and `y` for the target matrix.  The `x` ndarray should be one example per
    leading index (row for matrices).
    The `y` ndarray should be one target per leading index (entry for vectors, row for matrices).
    If `y` is a classification target, then it should be a vector with numpy dtype 'int32'.

    If there are weights associated with different examples, then create a 'weights' attribute whose
    value is a vector with one floating-point value (typically double-precision) per example.

    If the task is classification, then the classes should be mapped to the integers
    0,1,...,N-1.
    The number of classes (here, N) should be stored in the `n_classes` attribute.

    WHEN INPUTS ARE FIXED-SIZE GREYSCALE IMAGES
    -------------------------------------------

    In this setting we typically encode images as vectors, by enumerating the pixel values in
    left-to-right, top-to-bottom order.  Pixel values should be in floating-point, and
    normalized between 0 and 1.

    The shape of the images should be recorded in the `img_shape` attribute as a tuple (rows,
    cols).

    TIMESERIES
    ----------

    When dealing with examples which are themselves timeseries, put each example timeseries in a
    tensor and make a list of them.  Generally use tensors, and resort to lists or arrays
    wherever different examples have different shapes.
    """
    # NOTE(review): the last TIMESERIES sentence was cut off in the original
    # ("... wherever different"); the ending used above is a best guess --
    # confirm with the author.

    # Free-form storage for datasets that follow none of the conventions.
    data = None

    # Simple regression / classification convention.
    train = None  # instance with .x, .y

    valid = None  # instance with .x, .y

    test = None  # instance with .x, .y

    n_classes = None  # int

    # Fixed-size greyscale image convention.
    img_shape = None  # (rows, cols)