comparison pylearn/datasets/dataset.py @ 537:b054271b2504

new file structure layout, factories, etc.
author James Bergstra <bergstrj@iro.umontreal.ca>
date Wed, 12 Nov 2008 21:57:54 -0500
parents datasets/dataset.py@4aa7f74ea93f
children 4e3a3d9fef43
comparison
equal deleted inserted replaced
518:4aa7f74ea93f 537:b054271b2504
1 """The dataset-from-descriptor mechanism."""
2
3 _datasets = {}
4
def add_dataset_factory(family, fn):
    """Add `fn` as the handler for descriptors whose first token is `family`.

    :param family: identifier under which `fn` is stored in the module-level
        `_datasets` registry
    :param fn: the handler to register for `family`
    :returns: None
    :raises ValueError: if a handler is already registered for `family`; the
        existing registration is left untouched

    """
    if family in _datasets:
        # Format the identifier into the message: passing it as a second
        # exception argument makes the error print as a tuple.
        raise ValueError('dataset identifier already in use: %r' % (family,))
    _datasets[family] = fn
15
def dataset_factory(family):
    """Register a function as the handler for a given kind of dataset, identified by `family`.

    When someone calls make_dataset('kind_of_dataset', option1=..., option2=...), the handler
    registered for 'kind_of_dataset' is called with those same keyword arguments.

    .. code-block:: python

        @dataset_factory('MNIST')
        def mnist_related_dataset(**kwargs):
            ...

    Any error raised by `add_dataset_factory` (for instance when `family` is already
    registered) propagates at decoration time.

    :param family: identifier to register the decorated function under
    :returns: `decorator`, a function that registers its argument through
        `add_dataset_factory` and returns it unchanged
    """
    def decorator(fn):
        add_dataset_factory(family, fn)
        # Hand the function back untouched so the decorated name stays usable.
        return fn
    return decorator
35
def make_dataset(family, **kwargs):
    """Return the dataset built by the factory registered for `family`.

    :param family: a dataset identifier, as previously registered through
        `add_dataset_factory` or the `dataset_factory` decorator
    :type family: str
    :param kwargs: keyword arguments forwarded unchanged to the factory
    :returns: whatever the factory returns (by convention a `Dataset`)
    :raises KeyError: if no factory is registered for `family`

    """
    # Keep the lookup alone inside the try block so that a KeyError raised by
    # the factory itself is not mistaken for an unknown family.
    try:
        factory = _datasets[family]
    except KeyError:
        raise KeyError('no dataset factory registered for: %r' % (family,))
    return factory(**kwargs)
45
46
class Dataset(object):
    """Dataset is a generic container for pylearn datasets.

    It is not intended to put any restriction whatsoever on its contents.

    It is intended to encourage certain conventions, described below.  Conventions should arise
    naturally among datasets in PyLearn.  When a few datasets adhere to a new convention, then
    describe it here and make it more official.

    If no particular convention applies, create your own object to store the dataset, and
    assign it to the `data` attribute.

    SIMPLE REGRESSION / CLASSIFICATION
    ----------------------------------

    In this setting, you are aiming to do vector classification or vector regression
    where your train, valid and test sets fit in memory.
    The convention is to put your data into numpy ndarray instances.  Put training data in the
    `train` attribute, validation data in the `valid` attribute and test data in the `test`
    attribute.
    Each of those attributes should be an instance that defines at least two attributes: `x` for
    the input matrix and `y` for the target matrix.  The `x` ndarray should be one example per
    leading index (row for matrices).
    The `y` ndarray should be one target per leading index (entry for vectors, row for matrices).
    If `y` is a classification target, then it should be a vector with numpy dtype 'int32'.

    If there are weights associated with different examples, then create a 'weights' attribute
    whose value is a vector with one floating-point value (typically double-precision) per
    example.

    If the task is classification, then the classes should be mapped to the integers
    0,1,...,N-1.
    The number of classes (here, N) should be stored in the `n_classes` attribute.

    WHEN INPUTS ARE FIXED-SIZE GREYSCALE IMAGES
    -------------------------------------------

    In this setting we typically encode images as vectors, by enumerating the pixel values in
    left-to-right, top-to-bottom order.  Pixel values should be in floating-point, and
    normalized between 0 and 1.

    The shape of the images should be recorded in the `img_shape` attribute as a tuple (rows,
    cols).

    TIMESERIES
    ----------

    When dealing with examples which are themselves timeseries, put each example timeseries in a
    tensor and make a list of them.  Generally use tensors, and resort to lists or arrays
    wherever different [sentence left unfinished in the original notes; presumably "wherever
    different examples have different lengths" -- TODO confirm].
    """

    class Obj(object):
        """Plain attribute holder: every keyword argument becomes an instance attribute."""

        def __init__(self, **kwargs):
            self.__dict__.update(kwargs)

    # Free-form storage for datasets that follow none of the conventions.
    data = None

    # Simple regression / classification convention.
    train = None  # instance with .x, .y

    valid = None  # instance with .x, .y

    test = None  # instance with .x, .y

    n_classes = None  # int

    # Fixed-size greyscale image convention.
    img_shape = None  # (rows, cols)
120