# HG changeset patch
# User Arnaud Bergeron
# Date 1284492168 14400
# Node ID c1943feada100d9e14a1b172c2542f762f2edff1
# Parent 18a092001752cd5f63b89157eed4dbac57541f97
Proposal for a theano dataset wrapper. The details still have to be worked out.

diff -r 18a092001752 -r c1943feada10 doc/v2_planning/dataset.txt
--- a/doc/v2_planning/dataset.txt	Tue Sep 14 14:20:31 2010 -0400
+++ b/doc/v2_planning/dataset.txt	Tue Sep 14 15:22:48 2010 -0400
@@ -368,7 +368,8 @@
 AB: I have an idea about this which kind of fits in the "building a
 theano op" thing that we talked about at the last meeting.
 
-We could have a specialezed theano op that takes a dataset and returns
-chunks of it with a index using the standard Dataset interface. The
-code to transfer to the GPU or whatever goes in that Op and we don't
-need to change to dataset interface.
+We can just build a theano Op that wraps dataset objects and takes
+care of the details of transferring data to the GPU or elsewhere.
+
+I have a prototype interface/implementation in the shared_dataset.py
+file in this directory.
diff -r 18a092001752 -r c1943feada10 doc/v2_planning/shared_dataset.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/doc/v2_planning/shared_dataset.py	Tue Sep 14 15:22:48 2010 -0400
@@ -0,0 +1,80 @@
+import theano
+
+# This is not final and may not even run for now.  It is just to give
+# a feeling of what the interface could look like.
+
+def shared_dataset(dataset, mem_size):
+    # Pick the wrapper Op: keep the whole dataset in memory (possibly
+    # on the GPU) if it fits in mem_size bytes, otherwise fetch
+    # batches on demand.
+    if dataset.total_size > mem_size:
+        return OnlineDataset(dataset)
+    else:
+        return MemoryDataset(dataset)
+
+class MemoryDataset(theano.Op):
+    # Holds the entire dataset in shared variables (so it can live in
+    # GPU memory) and returns minibatch slices of it.
+    def __init__(self, dataset):
+        self.input = theano.shared(dataset.input)
+        self.output = theano.shared(dataset.output)
+        self.batch_size = dataset.batch_size
+
+    def make_node(self, idx):
+        idx_ = theano.tensor.as_tensor_variable(idx)
+        return theano.Apply(self,
+                            inputs=[idx_],
+                            outputs=[self.input.type(),
+                                     self.output.type()])
+
+    def perform(self, node, inputs, output_storage):
+        idx, = inputs
+        # get_value(borrow=True) returns a view of the underlying
+        # numpy arrays; good enough for a read-only prototype.
+        inp = self.input.get_value(borrow=True)
+        outp = self.output.get_value(borrow=True)
+        output_storage[0][0] = inp[idx*self.batch_size:(idx+1)*self.batch_size]
+        output_storage[1][0] = outp[idx*self.batch_size:(idx+1)*self.batch_size]
+
+class OnlineDataset(theano.Op):
+    # Fetches minibatches from the dataset object on demand, for
+    # datasets too big to keep in memory.
+    def __init__(self, dataset):
+        self.dataset = dataset
+
+    def make_node(self, idx):
+        idx_ = theano.tensor.as_tensor_variable(idx)
+        # FIXME: the output types should not be hardcoded to
+        # fmatrix(), but should match whatever the dataset produces.
+        return theano.Apply(self,
+                            inputs=[idx_],
+                            outputs=[theano.tensor.fmatrix(),
+                                     theano.tensor.fmatrix()])
+
+    def perform(self, node, inputs, output_storage):
+        idx, = inputs  # idx arrives here as a numpy scalar
+        b = self.dataset.get_batch(int(idx))
+        output_storage[0][0] = b.input
+        output_storage[1][0] = b.output
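+
+# A hypothetical usage sketch, showing how one of these Ops might be
+# consumed.  FakeDataset below is only a stand-in for the real Dataset
+# interface, which is still being worked out.
+if __name__ == '__main__':
+    import numpy
+    import theano.tensor as T
+
+    class FakeDataset(object):
+        def __init__(self):
+            self.input = numpy.random.randn(100, 5).astype('float32')
+            self.output = numpy.random.randn(100, 2).astype('float32')
+            self.batch_size = 10
+            self.total_size = self.input.nbytes + self.output.nbytes
+
+    # Small enough to fit in memory, so shared_dataset() picks
+    # MemoryDataset.
+    data = shared_dataset(FakeDataset(), mem_size=512 * 1024 * 1024)
+    idx = T.iscalar('idx')
+    x, y = data(idx)  # calling the Op builds the graph via make_node
+    f = theano.function([idx], [x.sum(), y.sum()])
+    print f(3)  # sums over the fourth minibatch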