# HG changeset patch
# User Olivier Delalleau
# Date 1287680449 14400
# Node ID 7dfc3d3052eabe2974ac39313d75dc3765d33048
# Parent 09ad2a4f663cfc019aa5a9dbf9fa1d165d236280
Added proposal for dataset API as discussed on pylearn-dev

diff -r 09ad2a4f663c -r 7dfc3d3052ea doc/v2_planning/dataset.txt
--- a/doc/v2_planning/dataset.txt Mon Oct 18 19:31:17 2010 -0400
+++ b/doc/v2_planning/dataset.txt Thu Oct 21 13:00:49 2010 -0400
@@ -406,3 +406,165 @@
 OD: I like AB's approach.
+
+Data API proposal by Olivier D
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+A single sample containing multiple fields (e.g. an input and a target part)
+is an object `s` that you can manipulate as follows:
+
+.. code-block:: python
+
+    # Obtain actual data stored within `s` (e.g. a numpy vector). There is no
+    # guarantee that modifying the resulting data object will actually update
+    # the data stored in `s`.
+    data = s()
+    # Create a sample that sees a field of `s`.
+    input_part = s.input
+    # Obtain actual input data (e.g. as a numpy vector).
+    input_data = input_part()
+    # Create a sample that sees the i-th element of the data stored in `s`.
+    ith = s[i]
+    # This should not fail.
+    assert ith() == s()[i]
+    # You could also select a range.
+    i_to_j = s[i:j]
+    assert i_to_j() == s()[i:j]
+    # And actually do pretty much anything you want with __getitem__, as long
+    # as the underlying data stored in the sample supports it (for instance,
+    # here it should be at least a 3D tensor).
+    fancy_selection = s[i, :, j:k]
+    assert fancy_selection() == s()[i, :, j:k]
+    # Write some value (e.g. a numpy vector) into the sample. May raise an
+    # exception if the sample is in read-only mode.
+    s._write(val)
+    # Shortcut to write data into a field (same as `s.input._write(val)`).
+    s.input = val
+    # Basic mathematical operators.
+    s *= val
+    s += val
+    s -= val
+    s /= val
+    # Replace a field. Note that this is different from `s.input = val`
+    # because here `new_input` is a sample, not a numeric value: the current
+    # `s.input` will not be written to, instead it makes `s.input` point
+    # towards a different sample. This may lead to confusion, so a different
+    # syntax may be better (e.g. s._set_field('input', new_input)).
+    s.input = new_input
+    # The equality of two samples is defined by the equality of their
+    # underlying data.
+    def __eq__(self, other):
+        return self() == other()
+    # Iterate on fields (open question: should they be ordered?).
+    fields = dict([(name, sample) for name, sample in s._iter_fields()])
+    assert fields['input'] == s.input
+    # Iterating on a sample yields samples that see consecutive elements.
+    for sample, value in izip(s, s()):
+        assert sample() == value
+    # The length of a sample is the same as that of its underlying data.
+    assert len(s) == len(s())
+    # The shape of a sample is the same as that of its underlying data.
+    # Note that it only makes sense for tensor-like data.
+    assert s._shape() == s().shape
+    # The size of a sample is the product of its shape elements.
+    assert s._size() == reduce(operator.__mul__, s._shape())
+
+All sample methods should start with '_', to differentiate them from the
+sample's fields. This is a bit awkward, but I like the `sample.field` syntax
+compared to something like "sample.get_field('field')", which makes code less
+readable, especially when combined with sub-fields, e.g. `sample.input.x1`
+vs. `sample.get_field('input').get_field('x1')`.
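+
+To make this more concrete, here is a minimal, purely illustrative sketch of
+what such a sample wrapper could look like on top of a numpy array. The class
+name `Sample` and everything in its body are made up for illustration; it only
+covers a subset of the API above (no write shortcut, no read-only mode):
+
+.. code-block:: python
+
+    import numpy
+
+    class Sample(object):
+        """Illustrative wrapper around a numpy array with named fields."""
+
+        def __init__(self, data, fields=None):
+            # `fields` maps a field name to a sub-sample viewing part of `data`.
+            self.__dict__['_data'] = numpy.asarray(data)
+            self.__dict__['_fields'] = fields if fields is not None else {}
+
+        def __call__(self):
+            # Return the underlying data object.
+            return self._data
+
+        def __getattr__(self, name):
+            # Unknown attributes are resolved as fields (e.g. `s.input`).
+            try:
+                return self.__dict__['_fields'][name]
+            except KeyError:
+                raise AttributeError(name)
+
+        def __getitem__(self, item):
+            # Return a new sample viewing a subset of the underlying data.
+            return Sample(self._data[item])
+
+        def __len__(self):
+            return len(self._data)
+
+        def _write(self, val):
+            # Overwrite the underlying data in place.
+            self._data[...] = val
+
+        def _iter_fields(self):
+            return iter(self._fields.items())
+
+        def _shape(self):
+            return self._data.shape
+
+    # Example: `input` and `target` fields viewing parts of the sample's data.
+    data = numpy.arange(6.)
+    s = Sample(data, fields={'input': Sample(data[:4]),
+                             'target': Sample(data[4:])})
+    assert (s[1:3]() == s()[1:3]).all()
+    assert len(s.input) == 4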
+
+The extension from sample to dataset is actually to use the same class, with
+the convention that the first "dimension" in the data seen by the dataset
+corresponds to the samples' indices in the dataset.
+
+.. code-block:: python
+
+    # Return data stored in dataset `d` (e.g. a numpy matrix).
+    data = d()
+    # Return the i-th sample in the dataset.
+    s = d[i]
+    # Data should match!
+    assert data[i] == s()
+    # Return a subset of the dataset.
+    sub_data = d[i:j]
+    # Advanced indexing.
+    sub_data = d[some_list_of_indices]
+    # Dataset that sees the input part only.
+    input_part = d.input
+    # Dataset such that its i-th element is data[i][something] (see the sample
+    # examples for what `something` may be).
+    some_sub_data = d[:, something]
+    # The following should not fail.
+    assert d[i, something] == d[i][something]  # == some_sub_data[i]
+    # You can also write into a dataset.
+    d._write(val)
+    d.input = val
+    # Center dataset in-place (requires `d` not to be read-only).
+    d -= numpy.mean(d())
+    # The length of a dataset is its number of samples.
+    n_samples = len(d)
+    # The width of a dataset (if it exists) is the length of its samples.
+    assert d._shape()[1] == len(d[0])  # == d._width() (shortcut)
+    # Iterating on a dataset yields individual samples.
+    for i, sample in enumerate(d):
+        assert d[i] == sample
+    # It is allowed for a dataset to hold heterogeneous data. For instance
+    # you could have
+    len(d.data1) != len(d.data2)
+    # A sample in the dataset is not required to inherit all the dataset's
+    # fields, for instance in the case above you could decide that the dataset
+    # sees the same data as its first sub-dataset, i.e.
+    d[i] == d.data1[i]
+
+There remain some fuzzy points. For instance, are fields allowed to overlap?
+(e.g. so that one could write both `s.pos_3d` to get the 3d vector coordinate
+of sample `s`, and `s.x` to get the x coordinate without being forced to go
+through `s.pos_3d.x`). What are the fields of s[i:j] if the (i, j) range does
+not exactly match a subset of fields? How do we handle metadata? (e.g. we may
+want to describe the dataset as containing 28x28 image data, so that an
+algorithm for filter visualization can automatically deal with it)
+
+Now, on to some use cases.
+
+.. code-block:: python
+
+    # Mini-batches.
+    mb_dataset = d._minibatches(batch_size=5)
+    # The mini-batch dataset views samples that are mini-batches.
+    assert mb_dataset[0]() == d[0:5]()  # As long as len(d) >= 5.
+
+    # Shuffling samples. Note that numpy.random.shuffle works in-place and
+    # returns None, so the shuffled indices must not be re-assigned.
+    random_indices = range(len(d))
+    numpy.random.shuffle(random_indices)
+    shuffled_dataset = d[random_indices]
+
+    # Typical linear regression with stochastic gradient descent.
+    n_inputs = d.input._width()
+    n_targets = d.target._width()
+    weights = numpy.zeros((n_inputs, n_targets))
+    bias = numpy.zeros(n_targets)
+    mb_dataset = d._minibatches(batch_size=10)
+    # Note: it is important to get the number of inputs / targets
+    # before converting to mini-batches, because
+    #   mb_dataset.input._width() == 10
+    # since that is the length of a mini-batch matrix. However you
+    # could still do the following, which is less readable:
+    #   n_inputs = mb_dataset.input._shape()[2]
+    # You could also wait until you see the first sample to create
+    # the parameters (this would actually be a better way to do it, since
+    # it avoids calling the _width method).
+    for input, target in izip(mb_dataset.input, mb_dataset.target):
+        cost = (numpy.dot(input(), weights) + bias - target())**2
+        # Update weights and bias depending on cost....
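+
+Since the classes above do not exist yet, here is what the mini-batch SGD use
+case boils down to with plain numpy arrays. The toy data, learning rate and
+gradient update are made up for the example and are not part of the proposal:
+
+.. code-block:: python
+
+    import numpy
+
+    # Toy data standing in for `d.input` and `d.target`.
+    rng = numpy.random.RandomState(0)
+    inputs = rng.randn(100, 5)    # 100 samples, 5 input features
+    targets = rng.randn(100, 2)   # 100 samples, 2 target values
+
+    n_inputs = inputs.shape[1]
+    n_targets = targets.shape[1]
+    weights = numpy.zeros((n_inputs, n_targets))
+    bias = numpy.zeros(n_targets)
+    learning_rate = 0.01
+    batch_size = 10
+
+    # One pass over the data, one gradient step per mini-batch.
+    for start in range(0, len(inputs), batch_size):
+        x = inputs[start:start + batch_size]
+        t = targets[start:start + batch_size]
+        err = numpy.dot(x, weights) + bias - t
+        # Gradient (up to a constant factor) of the mean squared error.
+        weights -= learning_rate * numpy.dot(x.T, err) / len(x)
+        bias -= learning_rate * err.mean(axis=0)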
+
+A few more points:
+ - Infinite datasets could be used (we would just need to define a convention
+   on what __len__ should do).
+ - It is also ok to have datasets that do not support random access (so the
+   only way to access samples is through iteration).
+ - Ideally, data should be deterministic (i.e. __call__() should always
+   return the same thing). It would probably be up to the user to be extra
+   careful if they decide to use a non-deterministic dataset.
+
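+For the first two points, an iteration-only (possibly infinite) dataset could
+be as simple as an object wrapping a generator. The `StreamDataset` name and
+interface below are only an illustration, not part of the proposal:
+
+.. code-block:: python
+
+    import itertools
+    import numpy
+
+    class StreamDataset(object):
+        """Illustrative iteration-only dataset: no __len__, no random access."""
+
+        def __init__(self, make_iterator):
+            # `make_iterator` is a zero-argument callable returning a fresh
+            # iterator over samples, so the dataset can be iterated over more
+            # than once.
+            self._make_iterator = make_iterator
+
+        def __iter__(self):
+            return self._make_iterator()
+
+    def make_iter():
+        # An infinite, deterministic stream of 2-d samples.
+        return (numpy.array([i, i ** 2]) for i in itertools.count())
+
+    stream = StreamDataset(make_iter)
+    for i, sample in enumerate(stream):
+        if i >= 3:
+            break
+        print(sample)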