annotate datasets/dataset.py @ 672:27b1344a57b1

Added preprocessing back in
author Joseph Turian <turian@gmail.com>
date Thu, 20 Nov 2008 06:38:06 -0500
parents 4aa7f74ea93f
children
rev   line source
518
4aa7f74ea93f init dataset
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff changeset
1 """The dataset-from-descriptor mechanism."""
4aa7f74ea93f init dataset
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff changeset
2
4aa7f74ea93f init dataset
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff changeset
# Registry mapping the first token of a dataset descriptor to its handler.
_factory = {}


def add_dataset_factory(tok0, fn):
    """Add `fn` as the handler for descriptors whose first token is `tok0`.

    :param tok0: the identifier under which `fn` is stored in the registry
    :param fn: the handler to register
    :returns: None
    :raises ValueError: if a handler is already registered under `tok0`

    """
    if tok0 in _factory:
        # ValueError is a subclass of Exception, so callers that catch the
        # generic Exception raised historically keep working; the args tuple
        # is unchanged as well.
        raise ValueError('Identifier already in use:', tok0)
    _factory[tok0] = fn
4aa7f74ea93f init dataset
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff changeset
15
4aa7f74ea93f init dataset
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff changeset
def dataset_factory(tok0):
    """Return a decorator that registers a function as the handler for `tok0`.

    When someone calls dataset('kind_of_dataset option1 option2, etc.', approx=1),
    then the handler registered for 'kind_of_dataset' will be called with the
    same arguments as dataset.

    .. code-block:: python

        @dataset_factory('MNIST')
        def mnist_related_dataset(descr, **kwargs):
            ...

    :param tok0: the identifier under which the decorated function is registered
    :returns: `decorator`
    """
    def register(handler):
        # Registration happens at decoration time; the function itself is
        # handed back unwrapped so it stays directly callable.
        add_dataset_factory(tok0, handler)
        return handler

    return register
4aa7f74ea93f init dataset
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff changeset
35
4aa7f74ea93f init dataset
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff changeset
def dataset(descr, **kwargs):
    """Return the dataset described by `descr`.

    The first whitespace-separated token of `descr` selects the registered
    handler, which is then called as ``fn(descr, **kwargs)``.

    :param descr: a dataset identifier
    :type descr: str
    :param kwargs: forwarded unchanged to the selected handler
    :returns: `Dataset` (whatever the selected handler returns)
    :raises IndexError: if `descr` contains no tokens
    :raises KeyError: if no handler is registered for the first token

    """
    tokens = descr.split()
    if not tokens:
        # Same exception type the bare `split()[0]` used to raise, but with a
        # message that names the actual problem.
        raise IndexError('empty dataset descriptor: %r' % (descr,))
    tok0 = tokens[0]
    if tok0 not in _factory:
        raise KeyError('no dataset factory registered for %r' % (tok0,))
    return _factory[tok0](descr, **kwargs)
4aa7f74ea93f init dataset
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff changeset
47
4aa7f74ea93f init dataset
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff changeset
48
4aa7f74ea93f init dataset
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff changeset
class Dataset(object):
    """Dataset is a generic container for pylearn datasets.

    It is not intended to put any restriction whatsoever on its contents.

    It is intended to encourage certain conventions, described below.  Conventions should arise
    naturally among datasets in PyLearn.  When a few datasets adhere to a new convention, then
    describe it here and make it more official.

    If no particular convention applies, create your own object to store the dataset, and
    assign it to the `data` attribute.
    """

    # The convention notes below are comments rather than bare string
    # literals: a string placed after an assignment is read by documentation
    # tools as the docstring of the attribute assigned just before it.

    data = None

    # SIMPLE REGRESSION / CLASSIFICATION
    # ----------------------------------
    #
    # In this setting, you are aiming to do vector classification or vector regression
    # where your train, valid and test sets fit in memory.
    # The convention is to put your data into numpy ndarray instances.  Put training data in the
    # `train` attribute, validation data in the `valid` attribute and test data in the `test`
    # attribute.
    # Each of those attributes should be an instance that defines at least two attributes: `x` for
    # the input matrix and `y` for the target matrix.  The `x` ndarray should be one example per
    # leading index (row for matrices).
    # The `y` ndarray should be one target per leading index (entry for vectors, row for matrices).
    # If `y` is a classification target, then it should be a vector with numpy dtype 'int32'.
    #
    # If there are weights associated with different examples, then create a 'weights' attribute
    # whose value is a vector with one floating-point value (typically double-precision) per
    # example.
    #
    # If the task is classification, then the classes should be mapped to the integers
    # 0,1,...,N-1.
    # The number of classes (here, N) should be stored in the `n_classes` attribute.

    train = None  # instance with .x, .y

    valid = None  # instance with .x, .y

    test = None  # instance with .x, .y

    n_classes = None  # int

    # WHEN INPUTS ARE FIXED-SIZE GREYSCALE IMAGES
    # -------------------------------------------
    #
    # In this setting we typically encode images as vectors, by enumerating the pixel values in
    # left-to-right, top-to-bottom order.  Pixel values should be in floating-point, and
    # normalized between 0 and 1.
    #
    # The shape of the images should be recorded in the `img_shape` attribute as a tuple (rows,
    # cols).

    img_shape = None  # (rows, cols)

    # TIMESERIES
    # ----------
    #
    # When dealing with examples which are themselves timeseries, put each example timeseries in a
    # tensor and make a list of them.  Generally use tensors, and resort to lists or arrays
    # wherever different examples do not share a common shape.
    # NOTE(review): the original sentence stopped at "wherever different"; the ending above is
    # an assumed completion -- confirm with the author.
4aa7f74ea93f init dataset
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff changeset
118