view dataset.py @ 17:759d17112b23

more comments, looping ArrayDataSet iterator, bugfixes to lookup_list, more tests
author bergstrj@iro.umontreal.ca
date Wed, 26 Mar 2008 21:05:14 -0400
parents 813723310d75 ff4e551490f1
children 57f4015e2e09
line wrap: on
line source


from lookup_list import LookupList
# Alias: a single example is represented by a LookupList; this file builds
# one as Example(field_names, field_values).
Example = LookupList

# Raised by the placeholder methods of the abstract dataset interfaces in
# this file; concrete subclasses are expected to override those methods.
class AbstractFunction(Exception):
    """Derived class must override this function"""
        
class DataSet(object):
    """A virtual base class for datasets.

    A DataSet is a generator of iterators; these iterators can run through the
    examples in a variety of ways.  A DataSet need not necessarily have a finite
    or known length, so this class can be used to interface to a 'stream' which
    feeds on-line learning.

    To iterate over examples, there are several possibilities:
    - for i in dataset.zip(field1, field2,field3, ...)
    - for i in dataset.minibatches(N, field1, field2, ...)
    - for i in dataset
    Each of these is documented below.

    Note: For a dataset of fixed and known length, which can implement item
    random-access efficiently (e.g. indexing and slicing), and which can profit
    from the FiniteDataSetIterator, consider using base class FiniteDataSet.

    Note: Fields are not mutually exclusive, i.e. two fields can overlap in their actual content.

    Note: The content of a field can be of any type.

    """

    def __init__(self):
        pass

    def __iter__(self):
        """Supports the syntax "for i in dataset: ..."

        Using this syntax, "i" will be an Example instance (or equivalent) with
        all the fields of DataSet self.  Every field of "i" will give access to
        the field of a single example.  Fields should be accessible via
        i[identifier], but the derived class is free to accept any type of
        identifier, and add extra functionality to the iterator.

        Implemented on top of minibatches(): each size-1 minibatch is unpacked
        into an Example holding the first (only) element of every field.
        """
        for batch in self.minibatches(minibatch_size=1):
            yield Example(batch.keys(), [v[0] for v in batch.values()])

    def zip(self, *fieldnames):
        """
        Supports two forms of syntax:

            for i in dataset.zip(f1, f2, f3): ...

            for i1, i2, i3 in dataset.zip(f1, f2, f3): ...

        Using the first syntax, "i" will be an indexable object, such as a list,
        tuple, or Example instance, such that on every iteration, i[0] is the f1
        field of the current example, i[1] is the f2 field, and so on.

        Using the second syntax, i1, i2, i3 will contain the contents of the
        f1, f2, and f3 fields of a single example on each loop iteration.

        The derived class may accept fieldname arguments of any type.

        Implemented on top of minibatches(): each size-1 minibatch is unpacked
        into a plain list with one value per requested field.
        """
        for batch in self.minibatches(fieldnames, minibatch_size=1):
            yield [f[0] for f in batch]

    # Default argument values of minibatches(); subclasses reuse them so that
    # every override advertises the same signature.
    minibatches_fieldnames = None
    minibatches_minibatch_size = 1
    minibatches_n_batches = None

    def minibatches(self,
            fieldnames = minibatches_fieldnames,
            minibatch_size = minibatches_minibatch_size,
            n_batches = minibatches_n_batches):
        """
        Supports two forms of syntax:

            for i in dataset.minibatches([f1, f2, f3],**kwargs): ...

            for i1, i2, i3 in dataset.minibatches([f1, f2, f3],**kwargs): ...

        Using the first syntax, "i" will be an indexable object, such as a list,
        tuple, or Example instance, such that on every iteration, i[0] is a
        list-like container of the f1 field of a batch of current examples,
        i[1] is a list-like container of the f2 field, etc.

        Using the second syntax, i1, i2, i3 will be list-like containers of the
        f1, f2, and f3 fields of a batch of examples on each loop iteration.

        PARAMETERS
        - fieldnames (list of any type, default None):
        The loop variables i1, i2, i3 (in the example above) should contain the
        f1, f2, and f3 fields of the current batch of examples.  If None, the
        derived class can choose a default, e.g. all fields.

        - minibatch_size (integer, default 1)
        On every iteration, the variables i1, i2, i3 will have
        exactly minibatch_size elements. e.g. len(i1) == minibatch_size

        - n_batches (integer, default None)
        The iterator will loop exactly this many times, and then stop.  If None,
        the derived class can choose a default.  If (-1), then the returned
        iterator should support looping indefinitely.

        Note: A list-like container is something like a tuple, list, numpy.ndarray or
        any other object that supports integer indexing and slicing.

        Raises AbstractFunction: this base implementation must be overridden.
        """
        raise AbstractFunction()

    def fieldNames(self):
        """Return the list of field names in the examples of this dataset.

        Raises AbstractFunction: this base implementation must be overridden.
        """
        #Yoshua-
        # This list may not be finite; what would make sense in the use you have
        # in mind?
        # -JB
        raise AbstractFunction()

    def rename(self, *new_field_specifications):
        """
        Return a new dataset that maps old fields (of self) to new fields (of the returned
        dataset). The minimal syntax that should be supported is the following:
           new_field_specifications = [new_field_spec1, new_field_spec2, ...]
           new_field_spec = ([old_field1, old_field2, ...], new_field)
        In general both old_field and new_field should be strings, but some datasets may also
        support additional indexing schemes within each field (e.g. column slice
        of a matrix-like field).

        Raises AbstractFunction: this base implementation must be overridden.
        """
        #Yoshua-
        # Do you mean for this to be a virtual method?
        # Wouldn't this functionality be easier to provide via a
        # RenamingDataSet, such as the one I've written below?
        # -JB
        #
        # The explicit `self` parameter keeps the instance out of
        # new_field_specifications when this is called as a bound method.
        raise AbstractFunction()

class RenamingDataSet(DataSet):
    """A DataSet that wraps another one, and makes it look like the field names
    are different

    Renaming is done by a dictionary that maps new names to the old ones used in
    self.src.

    NOTE(review): minibatches() returns whatever self.src yields, so any field
    names carried by the yielded objects are still the names used by self.src.
    """
    def __init__(self, src, rename_dct):
        """Wrap dataset `src`; `rename_dct` maps each new name to the old name in src."""
        # Local import: the visible part of this module never imports `copy`.
        import copy
        DataSet.__init__(self)
        self.src = src
        # Shallow copy so later changes to the caller's dict do not affect us.
        self.rename_dct = copy.copy(rename_dct)

    def minibatches(self,
            fieldnames = DataSet.minibatches_fieldnames,
            minibatch_size = DataSet.minibatches_minibatch_size,
            n_batches = DataSet.minibatches_n_batches):
        """Delegate to self.src.minibatches after translating new names to old ones.

        Names absent from the renaming dictionary are passed through
        unchanged.  fieldnames=None is forwarded as None so that the wrapped
        dataset applies its own default.
        """
        if fieldnames is None:
            src_fieldnames = None
        else:
            dct = self.rename_dct
            src_fieldnames = [dct.get(f, f) for f in fieldnames]
        return self.src.minibatches(src_fieldnames, minibatch_size, n_batches)

    def fieldNames(self):
        """Return the field names of self.src as seen through the renaming.

        Each old name is replaced by the new name(s) mapped to it; old names
        that are not the target of any renaming are kept unchanged.
        """
        # rename_dct maps new -> old, so it has to be searched by value here.
        # Comparing with == avoids requiring the old names to be hashable.
        pairs = list(self.rename_dct.items())
        renamed = []
        for old in self.src.fieldNames():
            new_names = [new for new, mapped_old in pairs if mapped_old == old]
            renamed.extend(new_names or [old])
        return renamed


class FiniteDataSet(DataSet):
    """
    Virtual interface, a subclass of DataSet for datasets which have a finite, known length.
    Examples are indexed by an integer between 0 and self.length()-1,
    and a subdataset can be obtained by slicing. This may not be appropriate in general
    but only for datasets which can be thought of like ones that access rows AND fields
    in an efficient random access way. Users are encouraged to expect only the generic dataset
    interface in general. A FiniteDataSet is mainly useful when one has to obtain
    a subset of examples (e.g. for splitting a dataset into training and test sets).
    """

    class FiniteDataSetIterator(object):
        """
        Iterate over a finite dataset by indexing / slicing it, one minibatch
        at a time.

        If the fieldnames list is empty, it means that we want to see ALL the fields.

        n_batches bounds the number of minibatches served:
        - None (default): one pass over the dataset, then stop;
        - a non-negative integer: stop after that many minibatches, restarting
          from the first example if the end of the dataset is reached first;
        - a negative integer: restart from the first example forever.

        After raising StopIteration the iterator is rewound, so it can be
        iterated again.
        """
        def __init__(self, dataset, minibatch_size=1, fieldnames=(), n_batches=None):
            self.dataset = dataset
            self.minibatch_size = minibatch_size
            assert minibatch_size >= 1 and minibatch_size <= len(dataset)
            # Start one step before the first example; next() advances first.
            self.current = -self.minibatch_size
            self.fieldnames = fieldnames
            self.n_batches = n_batches
            self.n_served = 0

        def __iter__(self):
            return self

        def _rewind(self):
            # Put the iterator back in its initial state.
            self.current = -self.minibatch_size
            self.n_served = 0

        def next(self):
            """Return the next minibatch (a single example when minibatch_size is 1)."""
            limit = self.n_batches
            if limit is not None and 0 <= limit <= self.n_served:
                self._rewind()
                raise StopIteration
            self.current += self.minibatch_size
            if self.current >= len(self.dataset):
                if limit is None:
                    self._rewind()
                    raise StopIteration
                # An explicit batch budget (or a negative one) is still
                # open: start another pass over the dataset.
                self.current = 0
            self.n_served += 1
            if self.minibatch_size == 1:
                complete_example = self.dataset[self.current]
            else:
                complete_example = self.dataset[self.current:self.current + self.minibatch_size]
            if self.fieldnames:
                # NOTE(review): no field selection happens here; this assumes
                # that iterating complete_example yields one value per entry
                # of self.fieldnames, in the same order -- verify.
                return Example(self.fieldnames, list(complete_example))
            else:
                return complete_example

        # Python 3 spelling of the iterator protocol.
        __next__ = next

    def __init__(self):
        DataSet.__init__(self)

    def minibatches(self,
            fieldnames = DataSet.minibatches_fieldnames,
            minibatch_size = DataSet.minibatches_minibatch_size,
            n_batches = DataSet.minibatches_n_batches):
        """
        If the fieldnames list is empty, it means that we want to see ALL the fields.

        If the n_batches is empty, we want to see all the examples possible
        for the given minibatch_size.
        """
        # substitute the defaults:
        if fieldnames is None: fieldnames = self.fieldNames()
        # Floor division: only whole minibatches are counted.
        if n_batches is None: n_batches = len(self) // minibatch_size
        return FiniteDataSet.FiniteDataSetIterator(self, minibatch_size, fieldnames, n_batches)

    def __getattr__(self, fieldname):
        """Return the sub-dataset for the field `fieldname` (same as self(fieldname)).

        Only invoked when normal attribute lookup fails.  Special (dunder)
        names raise AttributeError so that protocol probes such as copying,
        pickling or hasattr are not mistaken for field lookups.
        """
        if fieldname.startswith('__') and fieldname.endswith('__'):
            raise AttributeError(fieldname)
        return self(fieldname)

    def __call__(self,*fieldnames):
        """Return a sub-dataset containing only the given fieldnames as fields.

        The return value's default iterator will iterate only over the given
        fields.
        """
        raise AbstractFunction()

    def __len__(self):
        """len(dataset) returns the number of examples in the dataset."""
        raise AbstractFunction()

    def __getitem__(self,i):
        """dataset[i] returns the (i+1)-th example of the dataset."""
        raise AbstractFunction()

    def __getslice__(self,*slice_args):
        """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1."""
        raise AbstractFunction()

# we may want ArrayDataSet defined in another python file

import numpy

def as_array_dataset(dataset):
    """Return an array-backed version of `dataset`.

    Generally datasets can be efficient by making data fields overlap, but
    this function doesn't know which fields overlap.  So it checks whether
    `dataset` provides an as_array_dataset member function and returns the
    result of calling it.

    Raises NotImplementedError when `dataset` has no as_array_dataset member:
    the generic conversion has not been written yet.
    """
    if hasattr(dataset, 'as_array_dataset'):
        return dataset.as_array_dataset()

    # TODO: generic fallback.  The draft that used to follow the raise below
    # was unreachable and incomplete; its plan was:
    #   1. Make ONE big minibatch with all the examples
    #      (minibatch_size = len(dataset)), to separate the fields.
    #   2. Convert each field to a numpy array with one row per example.
    #      Each field of the underlying dataset must be convertible to a
    #      numpy array of the same type (currently just double, but should
    #      use the smallest compatible dtype).
    #   3. Lay the fields side by side and record, in a LookupList keyed by
    #      field name, the column slice slice(start, start + width, 1) that
    #      each field occupies.
    # many complicated things remain to be done:
    #  - find common dtype
    #  - decide what to do with extra dimensions if not the same in all fields
    #  - try to see if we can avoid the copy?
    raise NotImplementedError(
        'as_array_dataset: the dataset does not provide an as_array_dataset '
        'member and the generic conversion is not implemented')

class ArrayDataSet(FiniteDataSet):
    """
    An ArrayDataSet behaves like a numpy array but adds the notion of named fields
    from DataSet (and the ability to view multiple field values as an 'Example').
    It is a  fixed-length and fixed-width dataset
    in which each element is a numpy array or a number, hence the whole
    dataset corresponds to a numpy array. Fields
    must correspond to a slice of array columns. If the dataset has fields,
    each 'example' is just a one-row ArrayDataSet, otherwise it is a numpy array.
    Any dataset can also be converted to a numpy array (losing the notion of fields)
    by the numpy.array(dataset) call.
    """

    class Iterator(object):
        """An iterator over a finite dataset that implements wrap-around"""
        def __init__(self, dataset, fieldnames, minibatch_size, next_max):
            """
            dataset: the ArrayDataSet to read from (its .data and .fields are used)
            fieldnames: names of the fields to return for each minibatch
            minibatch_size: number of rows per minibatch (must be > 0)
            next_max: number of minibatches to serve before stopping; a
                negative value (or None) means never stop
            """
            self.dataset=dataset
            self.fieldnames = fieldnames
            self.minibatch_size=minibatch_size
            self.next_count = 0
            self.next_max = next_max
            # Start one step before the first row; next() advances first.
            self.current = -self.minibatch_size
            assert minibatch_size > 0
            if minibatch_size >= len(dataset):
                raise NotImplementedError()

        def __iter__(self):
            # The iterator protocol requires iterators to be iterable
            # themselves, so that "for batch in iterator" works.
            return self

        @staticmethod
        def matcat(a, b):
            """Return a new 2-d array made of the rows of a followed by the rows of b."""
            a0, a1 = a.shape
            b0, b1 = b.shape
            assert a1 == b1
            # Equality, not identity: equal dtypes need not be the same object.
            assert a.dtype == b.dtype
            rval = numpy.empty( (a0 + b0, a1), dtype=a.dtype)
            rval[:a0,:] = a
            rval[a0:,:] = b
            return rval

        def next(self):
            """Return the next minibatch, wrapping around the end of the dataset.

            The result is an Example of per-field column views when
            fieldnames is non-empty, otherwise an empty list.
            """
            #check for end-of-loop, *before* counting this call, so that
            #exactly next_max minibatches are served and the iterator stays
            #exhausted afterwards
            limit = self.next_max
            if limit is not None and 0 <= limit <= self.next_count:
                raise StopIteration
            self.next_count += 1

            #determine the first and last elements of the slice we'll return
            n_rows = len(self.dataset)
            self.current += self.minibatch_size
            if self.current >= n_rows:
                self.current -= n_rows
            upper = self.current + self.minibatch_size

            if upper <= n_rows:
                #this is the easy case, we only need one slice
                dataview = self.dataset.data[self.current:upper]
            else:
                # the minibatch wraps around the end of the dataset
                dataview = self.dataset.data[self.current:]
                upper -= n_rows
                assert upper > 0
                dataview = self.matcat(dataview, self.dataset.data[:upper])

            rval = [dataview[:, self.dataset.fields[f]] for f in self.fieldnames]

            if self.fieldnames:
                rval = Example(self.fieldnames, rval)

            return rval

        # Python 3 spelling of the iterator protocol.
        __next__ = next


    def __init__(self, data, fields=None):
        """
        There are two ways to construct an ArrayDataSet: (1) from an
        existing dataset (which may result in a copy of the data in a numpy array),
        or (2) from a numpy.array (the data argument), along with an optional description
        of the fields (a LookupList of column slices indexed by field names).

        Field slices with a missing start, stop or step are replaced in
        `fields` by fully specified ones (start 0, stop = number of columns,
        step 1).
        """
        self.data=data
        self.fields=fields
        rows, cols = data.shape

        if fields:
            # list(): the entries are re-assigned while looping
            for fieldname,fieldslice in list(fields.items()):
                # make sure fieldslice.start, .stop and .step are defined
                start=fieldslice.start
                stop=fieldslice.stop
                step=fieldslice.step
                if not start:
                    start=0
                if stop is None:
                    stop=cols
                if not step:
                    step=1
                if (start,stop,step) != (fieldslice.start,fieldslice.stop,fieldslice.step):
                    fields[fieldname] = fieldslice = slice(start,stop,step)
                # and coherent with the data array
                assert fieldslice.start >= 0 and fieldslice.stop <= cols

    def minibatches(self,
            fieldnames = DataSet.minibatches_fieldnames,
            minibatch_size = DataSet.minibatches_minibatch_size,
            n_batches = DataSet.minibatches_n_batches):
        """
        If the fieldnames list is empty, it means that we want to see ALL the fields.

        If the n_batches is empty, we want to see all the examples possible
        for the given minibatch_size.
        """
        # substitute the defaults:
        if fieldnames is None: fieldnames = self.fieldNames()
        # Floor division: only whole minibatches are counted.
        if n_batches is None: n_batches = len(self) // minibatch_size
        return ArrayDataSet.Iterator(self, fieldnames, minibatch_size, n_batches)

    def __getattr__(self,fieldname):
        """
        Return a numpy array with the content associated with the given field name.
        If this is a one-example dataset, then a row, i.e., numpy array (of one less dimension
        than the dataset itself) is returned.

        Raises AttributeError when `fieldname` is not a field of this dataset.
        """
        # __getattr__ only runs when normal lookup fails.  Go through
        # __dict__ so that a missing 'data' or 'fields' attribute (e.g. on a
        # half-constructed instance) cannot trigger infinite recursion.
        state = self.__dict__
        data = state.get('data')
        fields = state.get('fields')
        if data is None or not fields:
            raise AttributeError(fieldname)
        try:
            field_slice = fields[fieldname]
        except (KeyError, IndexError, TypeError):
            # Report unknown names the way attribute access is expected to.
            raise AttributeError(fieldname)
        if len(data)==1:
            return data[0,field_slice]
        return data[:,field_slice]

    def __call__(self,*fieldnames):
        """Return a sub-dataset containing only the given fieldnames as fields.

        The sub-dataset shares the column range spanning the selected fields;
        when no fieldname is given, all the fields are selected.
        """
        if not fieldnames:
            fieldnames = self.fieldNames()
        selected = [(name, self.fields[name]) for name in fieldnames]
        # smallest column range that covers every selected field
        min_col=self.data.shape[1]
        max_col=0
        for _, field_slice in selected:
            min_col=min(min_col,field_slice.start)
            max_col=max(max_col,field_slice.stop)
        # re-express the selected slices relative to the new first column
        new_fields=LookupList()
        for fieldname,field_slice in selected:
            new_fields[fieldname]=slice(field_slice.start-min_col,field_slice.stop-min_col,field_slice.step)
        return ArrayDataSet(self.data[:,min_col:max_col],fields=new_fields)

    def fieldNames(self):
        """Return the list of field names that are supported by getattr and getFields."""
        if not self.fields:
            return []
        return self.fields.keys()

    def __len__(self):
        """len(dataset) returns the number of examples in the dataset."""
        return len(self.data)

    def __getitem__(self,i):
        """
        dataset[i] returns the (i+1)-th Example of the dataset. If there are no fields
        the result is just a numpy array (for the i-th row of the dataset data matrix).

        When i is a slice object, the sub-dataset with the selected rows is
        returned, like __getslice__ does.
        """
        if isinstance(i, slice):
            return ArrayDataSet(self.data[i], fields=self.fields)
        if self.fields:
            return Example(self.fields.keys(),[self.data[i,fieldslice] for fieldslice in self.fields.values()])
        else:
            return self.data[i]

    def __getslice__(self,*args):
        """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1."""
        # Plain slice indexing instead of ndarray.__getslice__, which not
        # every numpy array provides.
        return ArrayDataSet(self.data[slice(*args)], fields=self.fields)

    def __array__(self, dtype=None):
        """Return a view of this dataset which is a numpy.ndarray

        Numpy uses this special function name to retrieve an ndarray view for
        functions such as numpy.sum, numpy.dot, numpy.asarray, etc.

        If this dataset has no fields, then we simply return self.data,
        otherwise only the columns covered by at least one field are returned
        (in increasing column order): as a view when they can be described by
        a single slice, as a contiguous copy otherwise.
        - why do we want this behaviour when there are fields? (JB)

        dtype, when given, is the type the result is converted to.
        """
        if not self.fields:
            result = self.data
        else:
            # mark the columns mapped by the fields
            columns_used = numpy.zeros((self.data.shape[1]),dtype=bool)
            for field_slice in self.fields.values():
                columns_used[field_slice]=True
            used = numpy.flatnonzero(columns_used)
            if used.size == 0:
                # no column is covered by any field
                result = self.data[:,0:0]
            elif used.size == 1:
                first = int(used[0])
                result = self.data[:,first:first+1]
            else:
                # try to figure out if we can map all the slices into one
                # slice: that is the case when the used columns are evenly
                # spaced
                steps = numpy.diff(used)
                if (steps == steps[0]).all():
                    result = self.data[:,slice(int(used[0]),int(used[-1])+1,int(steps[0]))]
                else:
                    # else make contiguous copy (boolean indexing copies)
                    result = self.data[:,columns_used]
        if dtype is not None:
            result = numpy.asarray(result, dtype=dtype)
        return result