view dataset.py @ 16:813723310d75

commenting
author bergstrj@iro.umontreal.ca
date Wed, 26 Mar 2008 18:23:44 -0400
parents 88168361a5ab be128b9127c8
children 759d17112b23


class Example(object):
    """
    An example is something that is like a tuple but whose elements can be named, so that
    the following syntactic constructions work as one would expect:
       example.x = [1, 2, 3] # set a field
       x, y, z = example
       x = example[0]
       x = example["x"]
    """
    def __init__(self,names,values):
        assert len(values)==len(names)
        # write through __dict__ to bypass the overridden __setattr__,
        # which requires 'fields' to already exist
        self.__dict__['values']=values
        self.__dict__['fields']={}
        for i in xrange(len(values)):
            self.fields[names[i]]=i
            
    def __getitem__(self,i):
        if isinstance(i,int):
            return self.values[i]
        else:
            return self.values[self.fields[i]]
    
    def __setitem__(self,i,value):
        if isinstance(i,int):
            self.values[i]=value
        else:
            self.values[self.fields[i]]=value

    def __getattr__(self,name):
        return self.values[self.fields[name]]

    def __setattr__(self,name,value):
        self.values[self.fields[name]]=value

    def __len__(self):
        return len(self.values)
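
# A minimal usage sketch for Example (the field names 'x', 'y', 'z' are
# illustrative):
#
#     e = Example(['x','y','z'],[1,2,3])
#     assert e.x == 1        # attribute access, via __getattr__
#     assert e['y'] == 2     # string indexing, via the fields dict
#     assert e[2] == 3       # integer indexing, straight into values
#     x, y, z = e            # unpacking works via __getitem__/__len__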

    
class DataSet(object):
    """A virtual base class for datasets.

    A DataSet is a generator of iterators; these iterators can run through the
    examples in a variety of ways.  A DataSet need not necessarily have a finite
    or known length, so this class can be used to interface to a 'stream' which
    feeds on-line learning.

    To iterate over examples, there are several possibilities:
    - for i in dataset.zip(field1, field2, field3, ...)
    - for i in dataset.minibatches(N, field1, field2, ...)
    - for i in dataset
    Each of these is documented below.

    Note: For a dataset of fixed and known length, which can implement item
    random-access efficiently (e.g. indexing and slicing), and which can profit
    from the FiniteDataSetIterator, consider using base class FiniteDataSet.

    Note: Fields are not mutually exclusive, i.e. two fields can overlap in their actual content.

    Note: The content of a field can be of any type.

    """

    def __init__(self):
        pass
    
    def __iter__(self):
        """Supports the syntax "for i in dataset: ..."

        Using this syntax, "i" will be an Example instance (or equivalent) with
        all the fields of DataSet self.  Every field of "i" will give access to
        the corresponding field of a single example.  Fields should be accessible via
        i[identifier], but the derived class is free to accept any type of
        identifier, and add extra functionality to the iterator.
        """
        raise NotImplementedError

    def zip(self, *fieldnames):
        """
        Supports two forms of syntax:

            for i in dataset.zip(f1, f2, f3): ...

            for i1, i2, i3 in dataset.zip(f1, f2, f3): ...

        Using the first syntax, "i" will be an indexable object, such as a list,
        tuple, or Example instance, such that on every iteration, i[0] is the f1
        field of the current example, i[1] is the f2 field, and so on.

        Using the second syntax, i1, i2, i3 will contain the contents of the
        f1, f2, and f3 fields of a single example on each loop iteration.

        The derived class may accept fieldname arguments of any type.

        """
        raise NotImplementedError

    def minibatches(self,minibatch_size,*fieldnames):
        """
        Supports two forms of syntax:

            for i in dataset.zip(f1, f2, f3): ...

            for i1, i2, i3 in dataset.zip(f1, f2, f3): ...

        Using the first syntax, "i" will be an indexable object, such as a list,
        tuple, or Example instance, such that on every iteration, i[0] is the f1
        field of the current example, i[1] is the f2 field, and so on.

        Using the second syntax, i1, i2, i3 will contain the the contents of the
        f1, f2, and f3 fields of a single example on each loop iteration.

        The derived class may accept fieldname arguments of any type.

        Return an iterator, whose next() method returns the next example or the next 
        minibatch in the dataset. A minibatch (of length > 1) is also an example, but
        whose fields should be something one can iterate on again in order to obtain
        the individual examples.

        DataSet.zip returns an iterator over only the desired fields, and each field
        of the iterator contains one example.

        Return an iterator which sees only the specified fields (each fieldname is a
        field key, typically a string). The value returned at each iteration
        is a tuple with one element per field. Hence it can be used like this:
           for f1, f2, f3 in dataset.zip('field1','field2','field3'):
              ... use f1, f2, and f3
    If one iterates through minibatches of examples (with the minibatches() method
    or with the minibatch_size argument of the zip() method), then the fields
    returned by the iterator's next method should be iterators over the 
    individual values within the minibatch (typically these will be arrays
    with minibatch_size rows).
        Similar to zip but iterate over minibatches.
        Return a minibatch iterator, whose next() method returns an 'example'
        whose fields are iteratable objects (which can iterate over the individual
        values of that field in the minibatch).
        """
        raise NotImplementedError
    
    def fieldNames(self):
        """Return the list of field names in the examples of this dataset."""
        raise NotImplementedError

    def rename(self,*new_field_specifications):
        """
        Return a new dataset that maps old fields (of self) to new fields (of the returned 
        dataset). The minimal syntax that should be supported is the following:
           new_field_specifications = [new_field_spec1, new_field_spec2, ...]
           new_field_spec = ([old_field1, old_field2, ...], new_field)
        In general both old_field and new_field should be strings, but some datasets may also
        support additional indexing schemes within each field (e.g. column slice
        of a matrix-like field).
        """
        raise NotImplementedError
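
# A sketch of the intended calling conventions, assuming 'train_set' is some
# concrete DataSet whose examples have (illustrative) fields 'input' and 'target':
#
#     for example in train_set:                  # one Example per iteration
#         print example['input'], example['target']
#
#     for x, y in train_set.zip('input','target'):
#         print x, y                             # fields of a single example
#
#     for x, y in train_set.minibatches(16,'input','target'):
#         print x, y                             # each iterates over 16 values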

class FiniteDataSet(DataSet):
    """
    Virtual interface, a subclass of DataSet for datasets which have a finite, known length.
    Examples are indexed by an integer between 0 and self.length()-1,
    and a subdataset can be obtained by slicing. This may not be appropriate in general
    but only for datasets which can be thought of like ones that access rows AND fields
    in an efficient random access way. Users are encouraged to expect only the generic dataset
    interface in general. A FiniteDataSet is mainly useful when one has to obtain
    a subset of examples (e.g. for splitting a dataset into training and test sets).
    """

    def __init__(self):
        pass

    def __iter__(self):
        return FiniteDataSetIterator(self)
    
    def zip(self,*fieldnames):
        return FiniteDataSetIterator(self,1,fieldnames)

    def minibatches(self,minibatch_size,*fieldnames):
        return FiniteDataSetIterator(self,minibatch_size,fieldnames)

    def __getattr__(self,fieldname):
        """Return an that can iterate over the values of the field in this dataset."""
        return self(fieldname)

    def __call__(self,*fieldnames):
        """Return a sub-dataset containing only the given fieldnames as fields.
        
        The return value's default iterator will iterate only over the given
        fields.
        """
        raise NotImplementedError

    def __len__(self):
        """len(dataset) returns the number of examples in the dataset."""
        raise NotImplementedError
    
    def __getitem__(self,i):
        """dataset[i] returns the (i+1)-th example of the dataset."""
        raise NotImplementedError

    def __getslice__(self,*slice_args):
        """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1."""
        raise NotImplementedError
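
# The slicing interface above supports the train/test split use-case from the
# class docstring. A sketch, assuming 'ds' is a concrete 100-example
# FiniteDataSet such as the ArrayDataSet defined below:
#
#     train = ds[0:80]    # __getslice__: first 80 examples
#     test  = ds[80:100]  # __getslice__: last 20 examples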

class FiniteDataSetIterator(object):
    """
    If the fieldnames list is empty, it means that we want to see ALL the fields.
    """
    def __init__(self,dataset,minibatch_size=1,fieldnames=[]):
        self.dataset=dataset
        self.minibatch_size=minibatch_size
        assert minibatch_size>=1 and minibatch_size<=len(dataset)
        self.current = -self.minibatch_size
        self.fieldnames = fieldnames

    def __iter__(self):
        return self
    
    def next(self):
        self.current+=self.minibatch_size
        if self.current>=len(self.dataset):
            self.current=-self.minibatch_size
            raise StopIteration
        if self.minibatch_size==1:
            complete_example=self.dataset[self.current]
        else:
            complete_example=self.dataset[self.current:self.current+self.minibatch_size]
        if self.fieldnames:
            # select only the requested fields (getattr works both for single
            # Examples and for minibatch sub-datasets)
            return Example(self.fieldnames,
                           [getattr(complete_example,name) for name in self.fieldnames])
        else:
            return complete_example
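
    # Note: 'current' is reset when StopIteration is raised, so the same
    # iterator object can be re-used for another pass over the dataset.
    # A sketch, assuming 'ds' is a 4-example FiniteDataSet:
    #
    #     it = FiniteDataSetIterator(ds,minibatch_size=2)
    #     batches = list(it)  # two minibatches: ds[0:2] and ds[2:4]
    #     batches = list(it)  # a second pass works because of the reset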


# we may want ArrayDataSet defined in another python file

import numpy

class ArrayDataSet(FiniteDataSet):
    """
    An ArrayDataSet behaves like a numpy array but adds the notion of named fields
    from DataSet (and the ability to view multiple field values as an 'Example').
    It is a fixed-length and fixed-width dataset
    in which each element is a numpy array or a number, hence the whole
    dataset corresponds to a numpy array. Fields
    must correspond to a slice of array columns. If the dataset has fields,
    each 'example' is just a one-row ArrayDataSet, otherwise it is a numpy array.
    Any dataset can also be converted to a numpy array (losing the notion of fields)
    by the numpy.array(dataset) call.
    """

    def __init__(self,dataset=None,data=None,fields=None):
        """
        There are two ways to construct an ArrayDataSet: (1) from an
        existing dataset (which may result in a copy of the data in a numpy array),
        or (2) from a numpy.array (the data argument), along with an optional description
        of the fields (dictionary of column slices indexed by field names).
        """
        # use a fresh dict rather than a mutable default argument
        if fields is None:
            fields = {}
        if dataset is not None:
            assert data is None and fields=={}
            # Make ONE big minibatch with all the examples, to separate the fields.
            n_examples=len(dataset)
            batch = dataset.minibatches(n_examples).next()
            # Each field of the underlying dataset must be convertible to a numpy array of the same type
            # currently just double, but should use the smallest compatible dtype
            n_fields = len(batch)
            fieldnames = batch.fields.keys()
            total_width = 0
            common_dtype = None
            for i in xrange(n_fields):
                field = numpy.array(batch[i])
                assert field.shape[0]==n_examples
                width = field.shape[1]
                start=total_width
                total_width += width
                fields[fieldnames[i]]=slice(start,total_width,1)
            # many complicated things remain to be done:
            #  - find common dtype
            #  - decide what to do with extra dimensions if not the same in all fields
            #  - try to see if we can avoid the copy?
            raise NotImplementedError
        if data is not None:
            assert dataset is None
            self.data=data
            self.fields=fields
            self.width = data.shape[1]
            for fieldname in fields:
                fieldslice=fields[fieldname]
                # make sure fieldslice.start and fieldslice.step are defined
                if fieldslice.start is None or fieldslice.step is None:
                    start = fieldslice.start or 0
                    step = fieldslice.step or 1
                    fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step)
                # and check that the slice is coherent with the data array
                assert fieldslice.start>=0 and fieldslice.stop<=self.width

    def __getattr__(self,fieldname):
        """
        Return a numpy array with the content associated with the given field name.
        If this is a one-example dataset, then a row, i.e., numpy array (of one less dimension
        than the dataset itself) is returned.
        """
        if len(self.data)==1:
            return self.data[0,self.fields[fieldname]]
        return self.data[:,self.fields[fieldname]]

    def __call__(self,*fieldnames):
        """Return a sub-dataset containing only the given fieldnames as fields."""
        min_col=self.data.shape[1]
        max_col=0
        for fieldname in fieldnames:
            field_slice=self.fields[fieldname]
            min_col=min(min_col,field_slice.start)
            max_col=max(max_col,field_slice.stop)
        new_fields={}
        for fieldname in fieldnames:
            field_slice=self.fields[fieldname]
            # shift each slice to be relative to the extracted column range
            new_fields[fieldname]=slice(field_slice.start-min_col,field_slice.stop-min_col,field_slice.step)
        return ArrayDataSet(data=self.data[:,min_col:max_col],fields=new_fields)

    def fieldNames(self):
        """Return the list of field names that are supported by getattr and getFields."""
        return self.fields.keys()

    def __len__(self):
        """len(dataset) returns the number of examples in the dataset."""
        return len(self.data)
    
    def __getitem__(self,i):
        """
        dataset[i] returns the (i+1)-th Example of the dataset. If there are no fields
        the result is just a numpy array (for the i-th row of the dataset data matrix).
        """
        if self.fields:
            fieldnames,fieldslices=zip(*self.fields.items())
            return Example(fieldnames,[self.data[i,fieldslice] for fieldslice in fieldslices])
        else:
            return self.data[i]

    def __getslice__(self,*slice_args):
        """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1."""
        return ArrayDataSet(data=self.data[slice(*slice_args)],fields=self.fields)

    def __array__(self):
        """Return an view of this dataset which is an numpy.ndarray

        Numpy uses this special function name to retrieve an ndarray view for
        function such as numpy.sum, numpy.dot, numpy.asarray, etc.

        If this dataset has no fields, then we simply return self.data,
        otherwise things are complicated. 
        - why do we want this behaviour when there are fields? (JB)
        """
        if not self.fields:
            return self.data
        # else, select subsets of columns mapped by the fields
        columns_used = numpy.zeros((self.data.shape[1]),dtype=bool)
        for field_slice in self.fields.values():
            for c in xrange(field_slice.start,field_slice.stop,field_slice.step):
                columns_used[c]=True
        # try to figure out if we can map all the slices into one slice:
        mappable_to_one_slice = True
        start=0
        while start<len(columns_used) and not columns_used[start]:
            start+=1 # skip unused leading columns
        stop=len(columns_used)
        while stop>0 and not columns_used[stop-1]:
            stop-=1 # skip unused trailing columns
        step=0
        i=start
        while i<stop:
            # measure the distance from each used column to the next one;
            # the used columns form a single slice iff all distances are equal
            j=i+1
            while j<stop and not columns_used[j]:
                j+=1
            if step:
                if step!=j-i:
                    mappable_to_one_slice = False
                    break
            else:
                step = j-i
            i=j
        if mappable_to_one_slice:
            return self.data[:,slice(start,stop,step)]
        # else make a contiguous copy of the used columns
        n_columns = sum(columns_used)
        result = numpy.zeros((len(self.data),n_columns)+self.data.shape[2:],self.data.dtype)
        c=0
        for field_slice in self.fields.values():
            # number of columns in this field (ceiling division to handle step>1)
            slice_width=(field_slice.stop-field_slice.start+field_slice.step-1)//field_slice.step
            # copy this field into the next slice_width columns of the result
            result[:,c:c+slice_width]=self.data[:,field_slice]
            c+=slice_width
        return result
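
# A minimal construction sketch for ArrayDataSet (the field names 'input' and
# 'target' are illustrative): columns 0-2 form 'input', column 3 forms 'target'.
#
#     data = numpy.random.randn(100,4)
#     ds = ArrayDataSet(data=data,
#                       fields={'input':slice(0,3),'target':slice(3,4)})
#     ds.input.shape                  # (100, 3), via __getattr__
#     len(ds)                         # 100
#     ds[0]                           # an Example with fields 'input', 'target'
#     ds('input')                     # sub-dataset with only the 'input' columns
#     numpy.array(ds)                 # __array__: all columns form one slice here
#
#     for x, y in ds.minibatches(10,'input','target'):
#         print x.shape, y.shape      # (10, 3) and (10, 1) per minibatch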