view dataset.py @ 11:be128b9127c8

Debugged (to the extent of my tests) the new version of dataset
author bengioy@esprit.iro.umontreal.ca
date Wed, 26 Mar 2008 15:01:30 -0400
parents de616c423dbd
children ff4e551490f1 813723310d75
line wrap: on
line source


class Example(object):
    """
    An example is something that is like a tuple but whose elements can be named, so that
    the following syntactic constructions work as one would expect:
       example.x = [1, 2, 3] # set a field
       x, y, z = example
       x = example[0]
       x = example["x"]

    The constructor keeps a reference to ``values`` (no copy is made), so that
    sequence must support item assignment (e.g. a list) if fields are going to
    be modified. ``fields`` is a dictionary mapping each field name to the
    position of its value in ``values``.
    """
    def __init__(self,names,values):
        """
        names: the field names, one per value and in the same order.
        values: the field values (kept by reference).
        """
        assert len(values)==len(names)
        # Go through __dict__ directly: __setattr__ is reserved for field
        # assignment and would try to interpret 'values'/'fields' as field names.
        self.__dict__['values']=values
        self.__dict__['fields']=dict((name,position) for position,name in enumerate(names))

    def __getitem__(self,i):
        """
        Return the value at integer position i (or the values in slice i),
        or the value of the field named i for any other kind of key.
        Raises IndexError for a bad position and KeyError for an unknown name.
        """
        if isinstance(i,(int,slice)):
            return self.values[i]
        return self.values[self.fields[i]]

    def __setitem__(self,i,value):
        """Replace the value at integer position i, or of the field named i."""
        if isinstance(i,int):
            self.values[i]=value
        else:
            self.values[self.fields[i]]=value

    def __getattr__(self,name):
        """
        Return the value of the field called name.

        Only called when normal attribute lookup fails. Raises AttributeError
        (not KeyError) for unknown names so that hasattr() and
        getattr(example, name, default) behave as usual.
        """
        # Read the field table straight from __dict__: using self.fields here
        # would re-enter __getattr__ endlessly on an instance whose __init__
        # has not run (for instance one created with __new__).
        fields=self.__dict__.get('fields')
        if fields is None or name not in fields:
            raise AttributeError(name)
        return self.__dict__['values'][fields[name]]

    def __setattr__(self,name,value):
        """
        Assign value to the existing field called name.
        New fields cannot be created this way: an unknown name raises KeyError.
        """
        self.values[self.fields[name]]=value

    def __len__(self):
        """Return the number of values held by this example."""
        return len(self.values)

    
class DataSet(object):
    """
    This is a virtual base class or interface for datasets.
    A dataset is basically an iterator over Examples (or anything that
    behaves like an Example). It does not necessarily
    have a fixed length (this is useful for 'streams' which feed on-line learning).
    Datasets with fixed and known length are instances of FiniteDataSet, a subclass
    which supports indexing (dataset[i]) and slicing (dataset[1000:2000]).
    To iterate over a subset of the fields, one should use the dataset.zip(field1, field2,field3, ...)
    method which returns an iterator over only the desired fields.
    Fields are not mutually exclusive, i.e. two fields can overlap in their actual content.
    The content of a field can be of any type, but often will be a numpy array.
    If one iterates through minibatches of examples (with the minibatches() method),
    then the fields returned by the iterator's next method should be iterators over the
    individual values within the minibatch (typically these will be arrays
    with minibatch_size rows).

    Every method below except __init__ raises NotImplementedError: subclasses
    are expected to provide the actual behaviour.
    """

    def __init__(self):
        pass

    def __iter__(self):
        """
        Return an iterator, whose next() method returns the next example or the next
        minibatch in the dataset. A minibatch (of length > 1) is also an example, but
        whose fields should be something one can iterate on again in order to obtain
        the individual examples.
        """
        raise NotImplementedError

    def zip(self,*fieldnames):
        """
        Return an iterator which sees only the specified fields (each fieldname is a
        field key, typically a string). The value returned at each iteration
        is a tuple with one element per field. Hence it can be used like this:
           for f1, f2, f3 in dataset.zip('field1','field2','field3'):
              ... use f1, f2, and f3
        """
        raise NotImplementedError

    def minibatches(self,minibatch_size,*fieldnames):
        """
        Similar to zip but iterate over minibatches.
        Return a minibatch iterator, whose next() method returns an 'example'
        whose fields are iterable objects (which can iterate over the individual
        values of that field in the minibatch).
        """
        raise NotImplementedError

    def fieldNames(self):
        """Return the list of field names in the examples of this dataset."""
        raise NotImplementedError

    # 'self' must be the first parameter: without it the instance itself ends
    # up as the first element of new_field_specifications.
    def rename(self,*new_field_specifications):
        """
        Return a new dataset that maps old fields (of self) to new fields (of the returned
        dataset). The minimal syntax that should be supported is the following:
           new_field_specifications = [new_field_spec1, new_field_spec2, ...]
           new_field_spec = ([old_field1, old_field2, ...], new_field)
        In general both old_field and new_field should be strings, but some datasets may also
        support additional indexing schemes within each field (e.g. column slice
        of a matrix-like field).
        """
        raise NotImplementedError

class FiniteDataSet(DataSet):
    """
    Virtual interface, a subclass of DataSet for datasets which have a finite, known length.
    Examples are indexed by an integer between 0 and len(dataset)-1,
    and a subdataset can be obtained by slicing. This may not be appropriate in general
    but only for datasets which can be thought of like ones that access rows AND fields
    in an efficient random access way. Users are encouraged to expect only the generic dataset
    interface in general. A FiniteDataSet is mainly useful when one has to obtain
    a subset of examples (e.g. for splitting a dataset into training and test sets).

    Iteration (plain, zip and minibatches) is provided here through
    FiniteDataSetIterator; __call__, __len__, __getitem__ and __getslice__
    raise NotImplementedError and are left to subclasses.
    """

    def __init__(self):
        """Nothing to initialise: this class keeps no state of its own."""

    def __iter__(self):
        """Return an iterator over single examples, with all their fields."""
        return FiniteDataSetIterator(dataset=self)

    def zip(self,*fieldnames):
        """Return an iterator over single examples for the given field names."""
        return FiniteDataSetIterator(dataset=self,minibatch_size=1,fieldnames=fieldnames)

    def minibatches(self,minibatch_size,*fieldnames):
        """Return an iterator over minibatches of minibatch_size examples for the given field names."""
        return FiniteDataSetIterator(dataset=self,
                                     minibatch_size=minibatch_size,
                                     fieldnames=fieldnames)

    def __getattr__(self,fieldname):
        """
        Return the sub-dataset made of the single field fieldname, i.e. self(fieldname).
        This is invoked for every attribute that normal lookup does not find, so any
        unknown attribute name is treated as a field name.
        """
        single_field_view=self(fieldname)
        return single_field_view

    def __call__(self,*fieldnames):
        """Return a sub-dataset containing only the given fieldnames as fields."""
        raise NotImplementedError()

    def __len__(self):
        """len(dataset) returns the number of examples in the dataset."""
        raise NotImplementedError()

    def __getitem__(self,i):
        """dataset[i] returns the (i+1)-th example of the dataset."""
        raise NotImplementedError()

    def __getslice__(self,*slice_args):
        """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1."""
        raise NotImplementedError()

class FiniteDataSetIterator(object):
    """
    Iterator over the examples, or over minibatches of examples, of a dataset
    that supports len(), integer indexing and slicing.

    If the fieldnames list is empty, it means that we want to see ALL the fields.

    Once exhausted the iterator rewinds itself, so the same object can be
    iterated over again from the first example.
    """
    def __init__(self,dataset,minibatch_size=1,fieldnames=None):
        """
        dataset: the object to iterate over.
        minibatch_size: number of examples returned at each step; must lie
            between 1 and len(dataset) (checked with an assertion).
        fieldnames: names given to the values returned at each step; None or
            an empty sequence means that the examples are returned untouched.
        """
        assert minibatch_size>=1 and minibatch_size<=len(dataset)
        self.dataset=dataset
        self.minibatch_size=minibatch_size
        # Start one step before the first example: next() advances first.
        self.current = -self.minibatch_size
        # None rather than a list literal as default, so that no list object
        # is shared between all the iterators created without fieldnames.
        if fieldnames is None:
            fieldnames = []
        self.fieldnames = fieldnames

    def __iter__(self):
        return self

    def next(self):
        """
        Return the next example (minibatch_size==1) or the next slice of
        minibatch_size examples. The last minibatch is shorter when
        minibatch_size does not divide the dataset length.
        Raises StopIteration at the end of the dataset.
        """
        self.current+=self.minibatch_size
        if self.current>=len(self.dataset):
            # Rewind before stopping so that the iterator can be reused.
            self.current=-self.minibatch_size
            raise StopIteration
        if self.minibatch_size==1:
            complete_example=self.dataset[self.current]
        else:
            complete_example=self.dataset[self.current:self.current+self.minibatch_size]
        if not self.fieldnames:
            return complete_example
        # NOTE(review): the values are relabelled by POSITION, nothing is
        # selected by name; Example asserts that there are as many values in
        # complete_example as there are fieldnames. Confirm whether a
        # selection by field name is what callers of zip()/minibatches() expect.
        return Example(self.fieldnames,list(complete_example))

    # Python 3 spelling of the iterator protocol (for loops, list(), next()).
    __next__ = next


# we may want ArrayDataSet defined in another python file

import numpy

class ArrayDataSet(FiniteDataSet):
    """
    An ArrayDataSet behaves like a numpy array but adds the notion of named fields
    from DataSet (and the ability to view multiple field values as an 'Example').
    It is a fixed-length and fixed-width dataset
    in which each element is a numpy array or a number, hence the whole
    dataset corresponds to a numpy array. Fields
    must correspond to a slice of array columns. If the dataset has fields,
    each 'example' is an Example holding one row slice per field, otherwise
    it is a numpy array (the row itself).
    Any dataset can also be converted to a numpy array (losing the notion of fields)
    by the numpy.array(dataset) call.
    """

    def __init__(self,dataset=None,data=None,fields=None):
        """
        There are two ways to construct an ArrayDataSet: (1) from an
        existing dataset (which may result in a copy of the data in a numpy array),
        or (2) from a numpy.array (the data argument), along with an optional description
        of the fields (dictionary of column slices indexed by field names).

        Construction (1) is not implemented yet and raises NotImplementedError.
        In construction (2) the data must have at least two dimensions, and each
        field slice is stored with explicit start, stop and step; the dictionary
        passed by the caller is left untouched.
        """
        # None instead of a dict literal as default: the field table is
        # rewritten below and must never be shared between instances.
        if fields is None:
            fields={}
        # 'is None' tests: comparing a numpy array to None with ==/!= is an
        # element-wise operation whose result cannot be used as a condition.
        if dataset is not None:
            assert data is None and not fields
            # The intended conversion makes ONE big minibatch with all the
            # examples and concatenates its fields column-wise.
            # many complicated things remain to be done:
            #  - find common dtype
            #  - decide what to do with extra dimensions if not the same in all fields
            #  - try to see if we can avoid the copy?
            raise NotImplementedError("building an ArrayDataSet from another dataset")
        if data is not None:
            self.data=data
            self.width=data.shape[1]
            normalised={}
            for fieldname,fieldslice in fields.items():
                # make sure start, stop and step are defined
                start=fieldslice.start or 0
                step=fieldslice.step or 1
                stop=fieldslice.stop
                if stop is None:
                    stop=self.width
                # and coherent with the data array
                assert start>=0 and stop<=self.width
                normalised[fieldname]=slice(start,stop,step)
            self.fields=normalised

    def __getattr__(self,fieldname):
        """
        Return a numpy array with the content associated with the given field name.
        If this is a one-example dataset, then a row, i.e., numpy array (of one less dimension
        than the dataset itself) is returned.

        Raises AttributeError when fieldname is not a field, which keeps
        hasattr() and getattr(dataset, name, default) working.
        """
        # Look 'data' and 'fields' up in __dict__: self.data / self.fields
        # would call __getattr__ again, forever, if they are not set yet.
        state=self.__dict__
        fields=state.get('fields')
        if fields is None or 'data' not in state or fieldname not in fields:
            raise AttributeError(fieldname)
        data=state['data']
        if len(data)==1:
            return data[0,fields[fieldname]]
        return data[:,fields[fieldname]]

    def __call__(self,*fieldnames):
        """
        Return a sub-dataset containing only the given fieldnames as fields.

        The sub-dataset shares the columns between the first and the last
        column of the requested fields (columns lying in between but belonging
        to no requested field are kept in the data, just not named).
        Without arguments all the fields are kept. An unknown field name
        raises KeyError.
        """
        if not fieldnames:
            fieldnames=list(self.fields)
        selected=[(name,self.fields[name]) for name in fieldnames]
        min_col=self.data.shape[1]
        max_col=0
        for name,field_slice in selected:
            min_col=min(min_col,field_slice.start)
            max_col=max(max_col,field_slice.stop)
        # Field slices are expressed relative to the first column kept.
        new_fields={}
        for name,field_slice in selected:
            new_fields[name]=slice(field_slice.start-min_col,
                                   field_slice.stop-min_col,
                                   field_slice.step)
        return ArrayDataSet(data=self.data[:,min_col:max_col],fields=new_fields)

    def fieldNames(self):
        """Return the list of field names that are supported by getattr and getFields."""
        return list(self.fields)

    def __len__(self):
        """len(dataset) returns the number of examples in the dataset."""
        return len(self.data)

    def __getitem__(self,i):
        """
        dataset[i] returns the (i+1)-th Example of the dataset. If there are no fields
        the result is just a numpy array (for the i-th row of the dataset data matrix).
        If i is a slice object, the corresponding sub-dataset is returned, like
        __getslice__ does.
        """
        if isinstance(i,slice):
            return ArrayDataSet(data=self.data[i],fields=self.fields)
        if not self.fields:
            return self.data[i]
        fieldnames=list(self.fields)
        return Example(fieldnames,[self.data[i,self.fields[name]] for name in fieldnames])

    def __getslice__(self,*slice_args):
        """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1."""
        # Only Python 2 calls this method; Python 3 hands a slice object to
        # __getitem__ instead.
        return ArrayDataSet(data=self.data[slice(*slice_args)],fields=self.fields)

    def __array__(self,dtype=None):
        """
        Return the content of the dataset as a numpy array.

        Without fields this is the data array itself. With fields, only the
        columns covered by at least one field are returned, in column order
        and each column once even if several fields overlap on it. When these
        columns are regularly spaced the result is a view on the data,
        otherwise it is a copy. If dtype is given the result is converted to it.
        """
        if not self.fields:
            result=self.data
        else:
            # mark the columns mapped by the fields
            columns_used=numpy.zeros(self.data.shape[1],dtype=bool)
            for field_slice in self.fields.values():
                columns_used[field_slice]=True
            used=numpy.flatnonzero(columns_used)
            if len(used)==0:
                # the fields cover no column at all
                result=self.data[:,0:0]
            else:
                # try to figure out if we can map all the slices into one slice:
                # this is the case when consecutive used columns are all the
                # same distance apart (or when there is a single column).
                gaps=numpy.diff(used)
                if len(gaps)==0 or (gaps==gaps[0]).all():
                    step=1
                    if len(gaps)>0:
                        step=int(gaps[0])
                    result=self.data[:,int(used[0]):int(used[-1])+1:step]
                else:
                    # else make contiguous copy
                    result=self.data[:,used]
        if dtype is not None:
            result=numpy.asarray(result,dtype=dtype)
        return result