view dataset.py @ 25:b63e8c0bf21b

added __init__.py, fixed crossentropy_softmax_1hot function name
author bergstrj@iro.umontreal.ca
date Thu, 10 Apr 2008 20:53:44 -0400
parents 526e192b0699
children 672fe4b23032
line wrap: on
line source


from lookup_list import LookupList
Example = LookupList

class AbstractFunction (Exception): """Derived class must override this function"""
        
class DataSet(object):
    """A virtual base class for datasets.

    A DataSet is a generator of iterators; these iterators can run through the
    examples in a variety of ways.  A DataSet need not necessarily have a finite
    or known length, so this class can be used to interface to a 'stream' which
    feeds on-line learning. 

    To iterate over examples, there are several possibilities:
    - for example in dataset.zip([field1, field2,field3, ...])
    - for val1,val2,val3 in dataset.zip([field1, field2,field3])
    - for minibatch in dataset.minibatches([field1, field2, ...],minibatch_size=N)
    - for example in dataset
    Each of these is documented below. All of these iterators are expected
    to provide, in addition to the usual 'next()' method, a 'next_index()' method
    which returns a non-negative integer pointing to the position of the next
    example that will be returned by 'next()' (or of the first example in the
    next minibatch returned). This is important because these iterators
    can wrap around the dataset in order to do multiple passes through it,
    in possibly unregular ways if the minibatch size is not a divisor of the
    dataset length.

    Note: Fields are not mutually exclusive, i.e. two fields can overlap in their actual content.

    Note: The content of a field can be of any type.

    Note: A dataset can recognize a potentially infinite number of field names (i.e. the field
    values can be computed on-demand, when particular field names are used in one of the
    iterators).

    Datasets of finite length should be sub-classes of FiniteLengthDataSet.

    Datasets whose elements can be indexed and sub-datasets of consecutive
    examples (i.e. slices) can be extracted from should be sub-classes of
    SliceableDataSet.

    Datasets with a finite number of fields should be sub-classes of
    FiniteWidthDataSet.
    """

    def __init__(self):
        pass
    
    class Iterator(LookupList):
        def __init__(self, ll):
            LookupList.__init__(self, ll.keys(), ll.values())
            self.ll = ll
        def __iter__(self): #makes for loop work
            return self
        def next(self):
            self.ll.next()
            self._values = [v[0] for v in self.ll._values]
            return self
        def next_index(self):
            return self.ll.next_index()

    def __iter__(self):
        """Supports the syntax "for i in dataset: ..."

        Using this syntax, "i" will be an Example instance (or equivalent) with
        all the fields of DataSet self.  Every field of "i" will give access to
        a field of a single example.  Fields should be accessible via
        i["fielname"] or i[3] (in the order defined by the elements of the
        Example returned by this iterator), but the derived class is free
        to accept any type of identifier, and add extra functionality to the iterator.
        """
        return DataSet.Iterator(self.minibatches(None, minibatch_size = 1))

    def zip(self, *fieldnames):
        """
        Supports two forms of syntax:

            for i in dataset.zip([f1, f2, f3]): ...

            for i1, i2, i3 in dataset.zip([f1, f2, f3]): ...

        Using the first syntax, "i" will be an indexable object, such as a list,
        tuple, or Example instance, such that on every iteration, i[0] is the f1
        field of the current example, i[1] is the f2 field, and so on.

        Using the second syntax, i1, i2, i3 will contain the the contents of the
        f1, f2, and f3 fields of a single example on each loop iteration.

        The derived class may accept fieldname arguments of any type.

        """
        return DataSet.Iterator(self.minibatches(fieldnames, minibatch_size = 1))

    minibatches_fieldnames = None
    minibatches_minibatch_size = 1
    minibatches_n_batches = None
    def minibatches(self,
            fieldnames = minibatches_fieldnames,
            minibatch_size = minibatches_minibatch_size,
            n_batches = minibatches_n_batches):
        """
        Supports three forms of syntax:

            for i in dataset.minibatches(None,**kwargs): ...

            for i in dataset.minibatches([f1, f2, f3],**kwargs): ...

            for i1, i2, i3 in dataset.minibatches([f1, f2, f3],**kwargs): ...

        Using the first two syntaxes, "i" will be an indexable object, such as a list,
        tuple, or Example instance. In both cases, i[k] is a list-like container
        of a batch of current examples. In the second case, i[0] is
        list-like container of the f1 field of a batch current examples, i[1] is
        a list-like container of the f2 field, etc.

        Using the first syntax, all the fields will be returned in "i".
        Beware that some datasets may not support this syntax, if the number
        of fields is infinite (i.e. field values may be computed "on demand").

        Using the third syntax, i1, i2, i3 will be list-like containers of the
        f1, f2, and f3 fields of a batch of examples on each loop iteration.

        PARAMETERS
        - fieldnames (list of any type, default None):
        The loop variables i1, i2, i3 (in the example above) should contain the
        f1, f2, and f3 fields of the current batch of examples.  If None, the
        derived class can choose a default, e.g. all fields.

        - minibatch_size (integer, default 1)
        On every iteration, the variables i1, i2, i3 will have
        exactly minibatch_size elements. e.g. len(i1) == minibatch_size

        - n_batches (integer, default None)
        The iterator will loop exactly this many times, and then stop.  If None,
        the derived class can choose a default.  If (-1), then the returned
        iterator should support looping indefinitely.

        Note: A list-like container is something like a tuple, list, numpy.ndarray or
        any other object that supports integer indexing and slicing.

        """
        raise AbstractFunction()

    def hasFields(*fieldnames):
        """
        Return true if the given field name (or field names, if multiple arguments are
        given) is recognized by the DataSet (i.e. can be used as a field name in one
        of the iterators).
        """
        raise AbstractFunction()
        
    def merge_fields(*specifications):
        """
        Return a new dataset that maps old fields (of self) to new fields (of the returned 
        dataset). The minimal syntax that should be supported is the following:
           new_field_specifications = [new_field_spec1, new_field_spec2, ...]
           new_field_spec = ([old_field1, old_field2, ...], new_field)
        In general both old_field and new_field should be strings, but some datasets may also
        support additional indexing schemes within each field (e.g. column slice
        of a matrix-like field).
        """
        raise AbstractFunction()

    def merge_field_values(*field_value_pairs)
        """
        Return the value that corresponds to merging the values of several fields,
        given as arguments (field_name, field_value) pairs with self.hasField(field_name).
        This may be used by implementations of merge_fields.
        Raise a ValueError if the operation is not possible.
        """
        fieldnames,fieldvalues = zip(*field_value_pairs)
        raise ValueError("Unable to merge values of these fields:"+repr(fieldnames))

    def examples2minibatch(examples):
        """
        Combine a list of Examples into a minibatch. A minibatch is an Example whose fields
        are iterable over the examples of the minibatch.
        """
        raise AbstractFunction()
    
    def rename(rename_dict):
        """
        Return a new dataset that renames fields, using a dictionnary that maps old field
        names to new field names. The only fields visible by the returned dataset are those
        whose names are keys of the rename_dict.
        """
        return RenamingDataSet(self,rename_dict)

    def applyFunction(function, input_fields, output_fields, copy_inputs=True, accept_minibatches=True, cache=True):
        """
        Return a dataset that contains as fields the results of applying
        the given function (example-wise) to the specified input_fields. The
        function should return a sequence whose elements will be stored in
        fields whose names are given in the output_fields list. If copy_inputs
        is True then the resulting dataset will also contain the fields of self.
        If accept_minibatches, then the function may be called
        with minibatches as arguments (what is returned by the minibatches
        iterator). In any case, the computations may be delayed until the examples
        of the resulting dataset are requested. If cache is True, then
        once the output fields for some examples have been computed, then
        are cached (to avoid recomputation if the same examples are again
        requested).
        """
        return ApplyFunctionDataSet(function, input_fields, output_fields, copy_inputs, accept_minibatches, cache)

class RenamingDataSet(DataSet):
    """A DataSet that wraps another one, and makes it look like the field names
    are different

    Renaming is done by a dictionary that maps new names to the old ones used in
    self.src.
    """
    def __init__(self, src, rename_dct):
        DataSet.__init__(self)
        self.src = src
        self.rename_dct = copy.copy(rename_dct)

    def minibatches(self,
            fieldnames = DataSet.minibatches_fieldnames,
            minibatch_size = DataSet.minibatches_minibatch_size,
            n_batches = DataSet.minibatches_n_batches):
        dct = self.rename_dct
        new_fieldnames = [dct.get(f, f) for f in fieldnames]
        return self.src.minibatches(new_fieldnames, minibatches_size, n_batches)

class FiniteLengthDataSet(DataSet):
    """
    Virtual interface for datasets that have a finite length (number of examples),
    and thus recognize a len(dataset) call.
    """
    def __init__(self):
        DataSet.__init__(self)

    def __len__(self):
        """len(dataset) returns the number of examples in the dataset."""
        raise AbstractFunction()
    
                 
class SliceableDataSet(DataSet):
    """
    Virtual interface, a subclass of DataSet for datasets which are sliceable
    and whose individual elements can be accessed, generally respecting the
    python semantics for [spec], where spec is either a non-negative integer
    (for selecting one example), or a python slice (for selecting a sub-dataset
    comprising the specified examples). This is useful for obtaining
    sub-datasets, e.g. for splitting a dataset into training and test sets.
    """
    def __init__(self):
        DataSet.__init__(self)
        
    def minibatches(self,
            fieldnames = DataSet.minibatches_fieldnames,
            minibatch_size = DataSet.minibatches_minibatch_size,
            n_batches = DataSet.minibatches_n_batches):
        """
        If the n_batches is empty, we want to see all the examples possible
        for the given minibatch_size (possibly missing a few at the end of the dataset).
        """
        # substitute the defaults:
        if n_batches is None: n_batches = len(self) / minibatch_size
        return DataSet.Iterator(self, fieldnames, minibatch_size, n_batches)

    def __getitem__(self,i):
        """dataset[i] returns the (i+1)-th example of the dataset."""
        raise AbstractFunction()

    def __getslice__(self,*slice_args):
        """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1."""
        raise AbstractFunction()


class FiniteWidthDataSet(DataSet):
    """
    Virtual interface for datasets that have a finite width (number of fields),
    and thus return a list of fieldNames. 
    """
    def __init__(self):
        DataSet.__init__(self)

    def hasFields(*fieldnames):
        has_fields=True
        for fieldname in fieldnames:
            if fieldname not in self.fields.keys():
                has_fields=False
        return has_fields
                
    def fieldNames(self):
        """Return the list of field names that are supported by the iterators,
        and for which hasFields(fieldname) would return True."""
        raise AbstractFunction()


# we may want ArrayDataSet defined in another python file

import numpy

def as_array_dataset(dataset):
    # Generally datasets can be efficient by making data fields overlap, but
    # this function doesn't know which fields overlap.  So, it should check if
    # dataset supports an as_array_dataset member function, and return that if
    # possible.
    if hasattr(dataset, 'as_array_dataset'):
        return dataset.as_array_dataset()

    raise NotImplementedError

    # Make ONE big minibatch with all the examples, to separate the fields.
    n_examples = len(dataset)
    batch = dataset.minibatches( minibatch_size = len(dataset)).next()

    # Each field of the underlying dataset must be convertible to a numpy array of the same type
    # currently just double, but should use the smallest compatible dtype
    n_fields = len(batch)
    fieldnames = batch.fields.keys()
    total_width = 0
    type = None
    fields = LookupList()
    for i in xrange(n_fields):
        field = array(batch[i])
        assert field.shape[0]==n_examples
        width = field.shape[1]
        start=total_width
        total_width += width
        fields[fieldnames[i]]=slice(start,total_width,1)
    # many complicated things remain to be done:
    #  - find common dtype
    #  - decide what to do with extra dimensions if not the same in all fields
    #  - try to see if we can avoid the copy?

class ArrayDataSet(FiniteLengthDataSet,FiniteWidthDataSet,SliceableDataSet):
    """
    An ArrayDataSet behaves like a numpy array but adds the notion of named fields
    from DataSet (and the ability to view the values of multiple fields as an 'Example').
    It is a  fixed-length and fixed-width dataset 
    in which each element is a fixed dimension numpy array or a number, hence the whole 
    dataset corresponds to a numpy array. Fields
    must correspond to a slice of array columns. If the dataset has fields,
    each 'example' is just a one-row ArrayDataSet, otherwise it is a numpy array.
    Any dataset can also be converted to a numpy array (losing the notion of fields
    by the numpy.array(dataset) call.
    """

    class Iterator(LookupList):
        """An iterator over a finite dataset that implements wrap-around"""
        def __init__(self, dataset, fieldnames, minibatch_size, next_max):
            if fieldnames is None: fieldnames = dataset.fieldNames()
            LookupList.__init__(self, fieldnames, [0]*len(fieldnames))
            self.dataset=dataset
            self.minibatch_size=minibatch_size
            self.next_count = 0
            self.next_max = next_max
            self.current = -self.minibatch_size
            assert minibatch_size > 0
            if minibatch_size >= len(dataset):
                raise NotImplementedError()

        def __iter__(self): #makes for loop work
            return self

        @staticmethod
        def matcat(a, b):
            a0, a1 = a.shape
            b0, b1 = b.shape
            assert a1 == b1
            assert a.dtype is b.dtype
            rval = numpy.empty( (a0 + b0, a1), dtype=a.dtype)
            rval[:a0,:] = a
            rval[a0:,:] = b
            return rval

        def next_index(self):
            n_rows = self.dataset.data.shape[0]
            next_i = self.current+self.minibatch_size
            if next_i >= n_rows:
                next_i -= n_rows
            return next_i
        
        def next(self):

            #check for end-of-loop
            self.next_count += 1
            if self.next_count == self.next_max:
                raise StopIteration

            #determine the first and last elements of the slice we'll return
            n_rows = self.dataset.data.shape[0]
            self.current = self.next_index()
            upper = self.current + self.minibatch_size

            data = self.dataset.data

            if upper <= n_rows:
                #this is the easy case, we only need once slice
                dataview = data[self.current:upper]
            else:
                # the minibatch wraps around the end of the dataset
                dataview = data[self.current:]
                upper -= n_rows
                assert upper > 0
                dataview = self.matcat(dataview, data[:upper])

            self._values = [dataview[:, self.dataset.fields[f]]\
                    for f in self._names]
            return self


    def __init__(self, data, fields=None):
        """
        There are two ways to construct an ArrayDataSet: (1) from an
        existing dataset (which may result in a copy of the data in a numpy array),
        or (2) from a numpy.array (the data argument), along with an optional description
        of the fields (a LookupList of column slices indexed by field names).
        """
        self.data=data
        self.fields=fields
        rows, cols = data.shape

        if fields:
            for fieldname,fieldslice in fields.items():
                # make sure fieldslice.start and fieldslice.step are defined
                start=fieldslice.start
                step=fieldslice.step
                if not start:
                    start=0
                if not step:
                    step=1
                if not fieldslice.start or not fieldslice.step:
                    fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step)
                # and coherent with the data array
                assert fieldslice.start >= 0 and fieldslice.stop <= cols

    def minibatches(self,
            fieldnames = DataSet.minibatches_fieldnames,
            minibatch_size = DataSet.minibatches_minibatch_size,
            n_batches = DataSet.minibatches_n_batches):
        """
        If the fieldnames list is None, it means that we want to see ALL the fields.

        If the n_batches is None, we want to see all the examples possible
        for the given minibatch_size (possibly missing some near the end).
        """
        # substitute the defaults:
        if n_batches is None: n_batches = len(self) / minibatch_size
        return ArrayDataSet.Iterator(self, fieldnames, minibatch_size, n_batches)

    def __getattr__(self,fieldname):
        """
        Return a numpy array with the content associated with the given field name.
        If this is a one-example dataset, then a row, i.e., numpy array (of one less dimension
        than the dataset itself) is returned.
        """
        if len(self.data)==1:
            return self.data[0,self.fields[fieldname]]
        return self.data[:,self.fields[fieldname]]

    def __call__(self,*fieldnames):
        """Return a sub-dataset containing only the given fieldnames as fields."""
        min_col=self.data.shape[1]
        max_col=0
        for field_slice in self.fields.values():
            min_col=min(min_col,field_slice.start)
            max_col=max(max_col,field_slice.stop)
        new_fields=LookupList()
        for fieldname,fieldslice in self.fields.items():
            new_fields[fieldname]=slice(fieldslice.start-min_col,fieldslice.stop-min_col,fieldslice.step)
        return ArrayDataSet(self.data[:,min_col:max_col],fields=new_fields)

    def fieldNames(self):
        """Return the list of field names that are supported by getattr and hasField."""
        return self.fields.keys()

    def __len__(self):
        """len(dataset) returns the number of examples in the dataset."""
        return len(self.data)
    
    def __getitem__(self,i):
        """
        dataset[i] returns the (i+1)-th Example of the dataset. If there are no fields
        the result is just a numpy array (for the i-th row of the dataset data matrix).
        """
        if self.fields:
            fieldnames,fieldslices=zip(*self.fields.items())
            return Example(self.fields.keys(),[self.data[i,fieldslice] for fieldslice in self.fields.values()])
        else:
            return self.data[i]

    def __getslice__(self,*args):
        """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1."""
        return ArrayDataSet(self.data.__getslice__(*args), fields=self.fields)

    def __array__(self):
        """Return a view of this dataset which is an numpy.ndarray (i.e. losing
        the identity and name of fields within the dataset).

        Numpy uses this special function name to retrieve an ndarray view for
        function such as numpy.sum, numpy.dot, numpy.asarray, etc.

        If this dataset has no fields, then we simply return self.data,
        otherwise things are complicated. 
        - why do we want this behaviour when there are fields? (JB)
        - for convenience and completeness (but maybe it would make
          more sense to implement this through a 'field-merging'
          dataset). (YB)
        """
        if not self.fields:
            return self.data
        # else, select subsets of columns mapped by the fields
        columns_used = numpy.zeros((self.data.shape[1]),dtype=bool)
        overlapping_fields = False
        n_columns = 0
        for field_slice in self.fields.values():
            for c in xrange(field_slice.start,field_slice.stop,field_slice.step):
                n_columns += 1
                if columns_used[c]: overlapping_fields=True
                columns_used[c]=True
        # try to figure out if we can map all the slices into one slice:
        mappable_to_one_slice = not overlapping_fields
        if not overlapping_fields:
            start=0
            while start<len(columns_used) and not columns_used[start]:
                start+=1
            stop=len(columns_used)
            while stop>0 and not columns_used[stop-1]:
                stop-=1
            step=0
            i=start
            while i<stop:
                j=i+1
                while j<stop and not columns_used[j]:
                    j+=1
                if step:
                    if step!=j-i:
                        mappable_to_one_slice = False
                        break
                else:
                    step = j-i
                i=j
        if mappable_to_one_slice:
            return self.data[:,slice(start,stop,step)]
        # else make contiguous copy (copying the overlapping columns)
        result = numpy.zeros((len(self.data),n_columns)+self.data.shape[2:],self.data.dtype)
        c=0
        for field_slice in self.fields.values():
            slice_width=(field_slice.stop-field_slice.start)/field_slice.step
            # copy the field here
            result[:,slice(c,c+slice_width)]=self.data[:,field_slice]
            c+=slice_width
        return result

    def rename(*new_field_specifications):
        """
        Return a new dataset that maps old fields (of self) to new fields (of the returned 
        dataset). The minimal syntax that should be supported is the following:
           new_field_specifications = [new_field_spec1, new_field_spec2, ...]
           new_field_spec = ([old_field1, old_field2, ...], new_field)
        In general both old_field and new_field should be strings, but some datasets may also
        support additional indexing schemes within each field (e.g. column slice
        of a matrix-like field).
        """
        # if all old fields of each spec are 
        raise NotImplementedError()

class ApplyFunctionDataSet(DataSet):
    """
    A dataset that contains as fields the results of applying
    a given function (example-wise) to specified input_fields of a source
    dataset. The function should return a sequence whose elements will be stored in
    fields whose names are given in the output_fields list. If copy_inputs
    is True then the resulting dataset will also contain the fields of the source.
    dataset. If accept_minibatches, then the function expects 
    minibatches as arguments (what is returned by the minibatches
    iterator). In any case, the computations may be delayed until the examples
    of self are requested. If cache is True, then
    once the output fields for some examples have been computed, then
    are cached (to avoid recomputation if the same examples are again requested).
    """
    def __init__(src,function, input_fields, output_fields, copy_inputs=True, accept_minibatches=True, cache=True, compute_now=False):
        DataSet.__init__(self)
        self.src=src
        self.function=function
        assert src.hasFields(input_fields)
        self.input_fields=input_fields
        self.output_fields=output_fields
        assert not (copy_inputs and compute_now and not hasattr(src,'fieldNames'))
        self.copy_inputs=copy_inputs
        self.accept_minibatches=accept_minibatches
        self.cache=cache
        self.compute_now=compute_now
        if compute_now:
            assert hasattr(src,'__len__') and len(src)>=0
            fieldnames = output_fields
            if copy_inputs: fieldnames = src.fieldNames() + output_fields
            if accept_minibatches:
                # make a single minibatch with all the inputs
                inputs = src.minibatches(input_fields,len(src)).next()
                # and apply the function to it, and transpose into a list of examples (field values, actually)
                self.cached_examples = zip(*Example(output_fields,function(*inputs)))
            else:
                # compute a list with one tuple per example, with the function outputs
                self.cached_examples = [ function(input) for input in src.zip(input_fields) ]
        else if cache:
            # maybe a fixed-size array kind of structure would be more efficient than a list
            # in the case where src is FiniteDataSet. -YB
            self.cached_examples = []

    def minibatches(self,
                    fieldnames = DataSet.minibatches_fieldnames,
                    minibatch_size = DataSet.minibatches_minibatch_size,
                    n_batches = DataSet.minibatches_n_batches):
        
        class Iterator(LookupList):

            def __init__(self,dataset):
                if fieldnames is None:
                    assert hasattr(dataset,"fieldNames")
                    fieldnames = dataset.fieldNames()
                self.example_index=0
                LookupList.__init__(self, fieldnames, [0]*len(fieldnames))
                self.dataset=dataset
                self.src_iterator=self.src.minibatches(list(set.union(set(fieldnames),set(dataset.input_fields))),
                                                       minibatch_size,n_batches)
                self.fieldnames_not_in_input = []
                if self.copy_inputs:
                    self.fieldnames_not_in_input = filter(lambda x: not x in dataset.input_fields, fieldnames)
                                                       
            def __iter__(self):
                return self

            def next_index(self):
                return self.src_iterator.next_index()
            
            def next(self):
                example_index = self.src_iterator.next_index()
                src_examples = self.src_iterator.next()
                if self.dataset.copy_inputs:
                    function_inputs = [src_examples[field_name] for field_name in self.dataset.input_fields]
                else:
                    function_inputs = src_examples
                if self.dataset.cached_examples:
                    cache_len=len(self.cached_examples)
                    if example_index<cache_len+minibatch_size:
                        outputs_list = self.cached_examples[example_index:example_index+minibatch_size]
                        # convert the minibatch list of examples 
                        # into a list of fields each of which iterate over the minibatch
                        outputs = zip(*outputs_list)
                    else:
                        outputs = self.dataset.function(*function_inputs)
                        if self.dataset.cache:
                            # convert the list of fields, each of which can iterate over the minibatch
                            # into a list of examples in the minibatch (each of which is a list of field values)
                            outputs_list = zip(*outputs)
                            # copy the outputs_list into the cache
                            for i in xrange(cache_len,example_index):
                                self.cached_examples.append(None)
                            self.cached_examples += outputs_list
                else:
                    outputs = self.dataset.function(*function_inputs)
                
                return Example(self.fieldnames_not_in_input+self.dataset.output_fields,
                               [src_examples[field_name] for field_name in self.fieldnames_not_in_input]+outputs)
                               

        for fieldname in fieldnames:
            assert fieldname in self.output_fields or self.src.hasFields(fieldname)
        return Iterator(self)

    
def supervised_learning_dataset(src_dataset,input_fields,target_fields,weight_field=None):
    """
    Wraps an arbitrary DataSet into one for supervised learning tasks by forcing the
    user to define a set of fields as the 'input' field and a set of fields
    as the 'target' field. Optionally, a single weight_field can also be defined.
    """
    args = ((input_fields,'input'),(output_fields,'target'))
    if weight_field: args+=(([weight_field],'weight'))
    return src_dataset.rename(*args)