# HG changeset patch # User Dumitru Erhan # Date 1283969642 14400 # Node ID 0464f891129b9aee5402fbca9ee36a49461a7d25 # Parent 1b61cbe0810ba5aef0f894406cdef72269d7e207# Parent f1732269bce8ef7f07670b74e4cfe8bd2b546204 merge diff -r f1732269bce8 -r 0464f891129b doc/v2_planning/dataset.txt --- a/doc/v2_planning/dataset.txt Wed Sep 08 13:50:13 2010 -0400 +++ b/doc/v2_planning/dataset.txt Wed Sep 08 14:14:02 2010 -0400 @@ -18,3 +18,138 @@ Commiteee: DE, OB, OD, AB, PV Leader: DE + +Some ideas from existing ML libraries: + +- PyML: notion of dataset containers: VectorDataSet, SparseDataSet, KernelData, + PairDataSet, Aggregate. Ultimately, the learner decides +- mlpy: very primitive notions of data +- (still going through the other ones) + +A few things that our dataset containers should support at a minimum: + + - streams, possibly infinite + - task/views of the data for different problems + - indexing & slicing + - pairs, triples, etc. of examples + - a 'distance/gram matrix' container (imagine that the data is given to you + as a distance matrix) + - multi-dimensional time-series (again, maybe with pairs/triples, maybe + given to you as a distance matrix over time) + +Another question to consider is the following: how tightly should it integrate +with Theano? Do we want to be able to store data as shared variables or just +have an option for that? Theano + GPU constrains things that we can do (in terms +of sizes, buffering, etc): these are things we need to think about, but it's not +clear whether we should aim for building them into the interface. + +Task views of the data for different problems: How can we achieve this? Should +we simply have a set of standard dataset descriptors ('classification', +'regression', 'multi-label', 'density_estimation') and have a set_view method +that changes the current dataset view type? + +There is then the question of how to approach the design of a Dataset class from +an OOP perspective. 
So far, my (Dumi's) idea is to have an almost 'abstract class' +Dataset that doesn't implement any methods except a few setters/getters. The reason +to have the methods listed that way is to have a common 'specification', but classes +that inherit from Dataset need not implement every single method (only the ones +that are relevant) and can obviously implement other methods as appropriate. The +reason to have a common specification (as abstract as it might be) is to, well, +have a common specification that would make our code clearer and cleaner. + +An example of what I (Dumi) am thinking in terms of concrete API: + +class Dataset: + def __init__(self): + self.type = None + self.in_memory = None + self.inputs = None # list of filepaths, or objects in memory, or... + self.outputs = None + + def get_example(self,example_index): + raise NotImplementedError() + + def get_next_example(self): + raise NotImplementedError() + + def get_batch(self,batch_index): + raise NotImplementedError() + + def get_next_batch(self): + raise NotImplementedError() + + def get_slice(self,slice_object): + raise NotImplementedError() + + def set_view(self,view_type): + self.view_type = view_type + self.n_classes = None + + def set_n_classes(self,n_classes): + self.n_classes = n_classes + + def set_batch_size(self,batch_size): + self.batch_size = batch_size + +You will note that there is no notion of train/valid/test in this class: I think we should +just have a train dataset, a valid one and a test one instead, or (if it's in one +big file or infinite stream) just handle the split ourselves (via slicing, for +instance). I (Dumi) am of the opinion that it keeps things cleaner, but the +specification does not preclude more fine-grained 'splitting' of the data. 
+ +A concrete implementation would look like this (we would have one class per +dataset that we use, and the class declaration contains essentially everything +there is to know about the dataset): + +class MNIST(Dataset): + def __init__(self,inputs=['train_x.npy'],outputs=['train_y.npy']): + self.type='standard_xy' + self.in_memory = True + self.inputs = inputs # load them or create + self.outputs = outputs + self.set_view('classification') + self.set_n_classes(10) + self.set_batch_size(20) + self.n_batches = self._compute_n_batches() + + def get_batch(self,batch_index): + x,y = self._fetch_batch(batch_index) + if self.view_type == 'classification': + return x,numpy.int32(y) + elif self.view_type == 'density_estimation': + return x + else: + raise NotImplementedError() + + def shared_data(self): + shared_x = theano.shared(numpy.asarray(self.inputs, dtype=theano.config.floatX)) + shared_y = theano.shared(numpy.asarray(self.outputs, dtype=theano.config.floatX)) + return shared_x, T.cast(shared_y, 'int32') + + def _compute_n_batches(self): + pass + + def _fetch_batch(self,batch_index): + pass + +But nothing stops you from defining get_train_batch, get_valid_batch and stuff +like that! + +So we'd use it as: + +train_mnist = MNIST(inputs = ['train_x.npy'], outputs = ['train_y.npy']) +valid_mnist = MNIST(inputs = ['valid_x.npy'], outputs = ['valid_y.npy']) + +x,y = train_mnist.get_batch(0) +train_mnist.set_view('density_estimation') +x = train_mnist.get_batch(0) + +or + +mnist_data = MNIST(inputs = ['x.npy'], outputs = ['y.npy']) +batches_train = range(int(mnist_data.n_batches*0.8)) +batches_valid = range(int(mnist_data.n_batches*0.8),mnist_data.n_batches) + +xt,yt = mnist_data.get_batch(batches_train[0]) +xv,yv = mnist_data.get_batch(batches_valid[0]) +