comparison datasets/dataset.py @ 518:4aa7f74ea93f

init dataset
author James Bergstra <bergstrj@iro.umontreal.ca>
date Wed, 12 Nov 2008 12:36:09 -0500
parents
children
comparison
equal deleted inserted replaced
517:716c04512dbe 518:4aa7f74ea93f
1 """The dataset-from-descriptor mechanism."""
2
3 _factory = {}
4
5 def add_dataset_factory(tok0, fn):
6 """Add `fn` as the handler for descriptors whose first token is `tok0`.
7
8 :returns: None
9
10 """
11 if tok0 in _factory:
12 raise Exception('Identifier already in use:', tok0)
13 else:
14 _factory[tok0] = fn
15
def dataset_factory(tok0):
    """Register a function as the handler for a given kind of dataset, identified by `tok0`.

    When someone calls dataset('kind_of_dataset option1 option2, etc.', approx=1),
    then the handler registered for 'kind_of_dataset' will be called with the same arguments as
    dataset.

    .. code-block:: python

        @dataset_factory('MNIST')
        def mnist_related_dataset(descr, **kwargs):
            ...

    :param tok0: identifier token the handler is registered under
    :type tok0: str
    :returns: `decorator`
    """
    def decorator(fn):
        # Register fn, then hand it back unchanged so the decorated function
        # can still be called directly.
        add_dataset_factory(tok0, fn)
        return fn
    return decorator
35
def dataset(descr, **kwargs):
    """Return the dataset described by `descr`.

    The first whitespace-delimited token of `descr` selects the handler
    registered via `add_dataset_factory`; that handler is called with the
    full descriptor and all keyword arguments.

    :param descr: a dataset identifier
    :type descr: str
    :returns: `Dataset`
    :raises ValueError: if `descr` contains no tokens
    :raises KeyError: if no handler is registered for the first token

    """
    tokens = descr.split()
    if not tokens:
        # An empty descriptor would otherwise surface as a cryptic IndexError.
        raise ValueError('empty dataset descriptor: %r' % (descr,))
    tok0 = tokens[0]
    try:
        fn = _factory[tok0]
    except KeyError:
        raise KeyError('no dataset factory registered for: %s' % (tok0,))
    return fn(descr, **kwargs)
47
48
class Dataset(object):
    """Dataset is a generic container for pylearn datasets.

    It is not intended to put any restriction whatsoever on its contents.

    It is intended to encourage certain conventions, described below. Conventions should arise
    naturally among datasets in PyLearn. When a few datasets adhere to a new convention, then
    describe it here and make it more official.

    If no particular convention applies. Create your own object to store the dataset, and
    assign it to the `data` attribute.
    """
    # Catch-all slot for datasets that fit none of the conventions below.
    data = None

    """
    SIMPLE REGRESSION / CLASSIFICATION
    ----------------------------------

    In this setting, you are aiming to do vector classification or vector regression
    where your train, valid and test sets fit in memory.
    The convention is to put your data into numpy ndarray instances. Put training data in the
    `train` attribute, validation data in the `valid` attribute and test data in the `test
    attribute`.
    Each of those attributes should be an instance that defines at least two attributes: `x` for the
    input matrix and `y` for the target matrix. The `x` ndarray should be one example per
    leading index (row for matrices).
    The `y` ndarray should be one target per leading index (entry for vectors, row for matrices).
    If `y` is a classification target, than it should be a vector with numpy dtype 'int32'.

    If there are weights associated with different examples, then create a 'weights' attribute whose
    value is a vector with one floating-point value (typically double-precision) per example.

    If the task is classification, then the classes should be mapped to the integers
    0,1,...,N-1.
    The number of classes (here, N) should be stored in the `n_classes` attribute.

    """
    # Conventional data splits for the in-memory regression/classification
    # setting described above.
    train = None #instance with .x, .y

    valid = None #instance with .x, .y

    test = None #instance with .x, .y

    n_classes = None #int

    """
    WHEN INPUTS ARE FIXED-SIZE GREYSCALE IMAGES
    -------------------------------------------

    In this setting we typically encode images as vectors, by enumerating the pixel values in
    left-to-right, top-to-bottom order. Pixel values should be in floating-point, and
    normalized between 0 and 1.

    The shape of the images should be recorded in the `img_shape` attribute as a tuple (rows,
    cols).

    """

    img_shape = None # (rows, cols)


    """
    TIMESERIES
    ----------

    When dealing with examples which are themselves timeseries, put each example timeseries in a
    tensor and make a list of them. Generally use tensors, and resort to lists or arrays
    wherever different
    """
    # NOTE(review): the TIMESERIES note above appears truncated mid-sentence
    # in the original source ("wherever different") — confirm intended text
    # against upstream history.
118