view doc/v2_planning/dataset.py @ 1207:53937045f6c7

Pasted content of email sent by Ian about existing python ML libraries
author Olivier Delalleau <delallea@iro>
date Tue, 21 Sep 2010 10:58:14 -0400
parents 9686c0d9689d
children
line wrap: on
line source

class DataColumn(object):
    """One column of a dataset, backed by a loader object.

    The loader is expected to provide ``getitem(start, stop)`` and
    ``length()``; every data access is delegated to it.  Any extra keyword
    arguments are stored untouched in ``self.metadata``.
    """

    def __init__(self, loader, **metadata):
        self.loader = loader
        # Batching parameters consumed by __iter__; configure with group().
        self.batch_size = 0
        self.modulo = None
        self.metadata = metadata

    def __getitem__(self, idx):
        """Return the items selected by ``idx``.

        For a slice, ``idx.start`` and ``idx.stop`` are forwarded to the
        loader as-is (the step is ignored).  For any other index ``i`` the
        half-open range ``[i, i + 1)`` is requested, so the result has the
        same form as a one-element slice.
        """
        if isinstance(idx, slice):
            return self.loader.getitem(idx.start, idx.stop)
        return self.loader.getitem(idx, idx + 1)

    def group(self, size, modulo):
        """Configure how iteration batches this column.

        :param size: number of items per batch.
        :param modulo: flag forwarded to ``DsetIter``, which uses it to
            decide what to do with a final batch shorter than ``size``.
        """
        self.batch_size = size
        self.modulo = modulo

    def __iter__(self):
        """Return a batch iterator using the settings made by group()."""
        return DsetIter(self, self.batch_size, self.modulo)

    def get_sub_datacolumn(self, i, j):
        """Return a new column restricted to items ``[i, j)`` of this one.

        The new column wraps this one through a ``DsetLoader`` and receives
        the same metadata; batching settings are not carried over.
        """
        return DataColumn(DsetLoader(self, i, j), **self.metadata)

    def length(self):
        """Return whatever the loader reports as its length."""
        return self.loader.length()

class DsetIter(object):
    """Iterator that walks ``dset`` in consecutive batches of ``size`` items.

    ``dset`` only needs to support slicing (``dset[a:b]``) with results that
    support ``len()``.

    :param dset: the sliceable object to read batches from.
    :param size: number of items per batch.
    :param modulo: if true, a final batch shorter than ``size`` is returned;
        otherwise (including ``None``) the short batch is dropped and
        iteration ends.
    """

    def __init__(self, dset, size, modulo):
        self.dset = dset
        self.size = size
        self.modulo = modulo
        # Index of the next batch to hand out (in units of ``size`` items).
        self.pos = 0

    def __iter__(self):
        return self

    def next(self):
        """Return the next batch, or raise StopIteration when exhausted."""
        begin = self.pos * self.size
        res = self.dset[begin:begin + self.size]
        # Use len() rather than truthiness: the slice may be an array-like
        # object whose boolean value is not defined.
        if len(res) == 0:
            raise StopIteration
        if len(res) != self.size and not self.modulo:
            # Incomplete trailing batch and the caller did not ask for it.
            raise StopIteration
        # Advance so the following call reads the next batch (or runs off
        # the end and stops after a short final batch).
        self.pos += 1
        return res

    # Python 3 spells the iterator method ``__next__``; keep both names.
    __next__ = next

class DsetLoader(object):
    """Loader exposing the window ``[start, stop)`` of another dataset.

    ``dset`` must support slicing and provide ``length()``.  Indices passed
    to ``getitem`` are relative to ``start``.

    :param dset: the underlying sliceable object.
    :param start: first index of the window in ``dset``; ``None`` means 0.
    :param stop: end index (exclusive) of the window in ``dset``; ``None``
        means ``dset.length()``.  If that also returns ``None`` the window
        is treated as unbounded on the right.
    """

    def __init__(self, dset, start, stop):
        self.dset = dset
        self.start = start
        self.stop = stop
        if self.start is None:
            self.start = 0
        if self.stop is None:
            self.stop = self.dset.length()

    def getitem(self, i, j):
        """Return items ``[i, j)`` of the window, clipped to its end.

        ``None`` for ``i`` or ``j`` means "from the window start" or "to the
        window end" respectively, so bounds taken from an open slice such as
        ``[:3]`` or ``[2:]`` can be passed straight through.
        """
        lo = self.start if i is None else self.start + i
        if self.stop is None:
            # Unbounded window: nothing to clip against.
            hi = None if j is None else self.start + j
            return self.dset[lo:hi]
        hi = self.stop if j is None else min(self.start + j, self.stop)
        return self.dset[min(lo, self.stop):hi]

    def length(self):
        """Return the window size, or ``None`` when the end is unknown."""
        if self.stop is None:
            return None
        return self.stop - self.start

class Dataset(object):
    """A dataset represented as a collection of column objects."""

    def __init__(self, cols):
        # Columns are stored as given; each one is expected to provide
        # get_sub_datacolumn(start, stop).
        self.cols = cols

    def get_sub_dataset(self, start, stop):
        """Build a new Dataset from the ``[start, stop)`` part of each column."""
        sub_columns = []
        for column in self.cols:
            sub_columns.append(column.get_sub_datacolumn(start, stop))
        return Dataset(sub_columns)

    # we could have the __getitem__ interface and the iterator interface also