changeset 273:fa8abc813bd2

Automated merge with ssh://projects@lgcm.iro.umontreal.ca/hg/pylearn
author Frederic Bastien <bastienf@iro.umontreal.ca>
date Thu, 05 Jun 2008 11:47:44 -0400
parents fdce496c3b56 (diff) 6226ebafefc3 (current diff)
children ed70580f2324
files dataset.py
diffstat 5 files changed, 337 insertions(+), 101 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/amat.py	Thu Jun 05 11:47:44 2008 -0400
@@ -0,0 +1,123 @@
+"""load PLearn AMat files"""
+
+import sys, numpy, array
+
+path_MNIST = '/u/bergstrj/pub/data/mnist.amat'
+
+
+class AMat:
+    """DataSource to access a plearn amat file as a periodic unrandomized stream.
+
+    Attributes:
+
+    input -- minibatch of input
+    target -- minibatch of target
+    weight -- minibatch of weight
+    extra -- minibatch of extra
+
+    all -- the entire data contents of the amat file
+    n_examples -- the number of training examples in the file
+
+    AMat stands for Ascii Matri[x,ces]
+
+    """
+
+    marker_size = '#size:'
+    marker_sizes = '#sizes:'
+    marker_col_names = '#:'
+
+    def __init__(self, path, head=None, update_interval=0, ofile=sys.stdout):
+
+        """Load the amat at <path> into memory.
+        
+        path - str: location of amat file
+        head - int: stop reading after this many data rows
+        update_interval - int: print '.' to ofile every <this many> lines
+        ofile - file: print status, msgs, etc. to this file
+
+        """
+        self.all = None
+        self.input = None
+        self.target = None
+        self.weight = None
+        self.extra = None
+
+        self.header = False
+        self.header_size = None
+        self.header_rows = None
+        self.header_cols = None
+        self.header_sizes = None
+        self.header_col_names = []
+
+        data_started = False
+        data = array.array('d')
+        
+        f = open(path)
+        n_data_lines = 0
+        len_float_line = None
+
+        for i,line in enumerate(f):
+            if n_data_lines == head:
+                #we've read enough data, 
+                # break even if there's more in the file
+                break
+            if len(line) == 0 or line == '\n':
+                continue
+            if line[0] == '#':
+                if not data_started:
+                    #the condition means that the file has a header, and we're on 
+                    # some header line
+                    self.header = True
+                    if line.startswith(AMat.marker_size):
+                        info = line[len(AMat.marker_size):]
+                        self.header_size = [int(s) for s in info.split()]
+                        self.header_rows, self.header_cols = self.header_size
+                    if line.startswith(AMat.marker_col_names):
+                        info = line[len(AMat.marker_col_names):]
+                        self.header_col_names = info.split()
+                    elif line.startswith(AMat.marker_sizes):
+                        info = line[len(AMat.marker_sizes):]
+                        self.header_sizes = [int(s) for s in info.split()]
+            else:
+                #the first non-commented line tells us that the header is done
+                data_started = True
+                float_line = [float(s) for s in line.split()]
+                if len_float_line is None:
+                    len_float_line = len(float_line)
+                    if (self.header_cols is not None) \
+                            and self.header_cols != len_float_line:
+                        print >> sys.stderr, \
+                                'WARNING: header declared %i cols but first line has %i, using %i' \
+                                % (self.header_cols, len_float_line, len_float_line)
+                else:
+                    if len_float_line != len(float_line):
+                        raise IOError('wrong line length', i, line)
+                data.extend(float_line)
+                n_data_lines += 1
+
+                if update_interval > 0 and (ofile is not None) \
+                        and n_data_lines % update_interval == 0:
+                    ofile.write('.')
+                    ofile.flush()
+
+        if update_interval > 0:
+            ofile.write('\n')
+        f.close()
+
+        # convert from array.array to numpy.ndarray
+        nshape = (len(data) / len_float_line, len_float_line)
+        self.all = numpy.frombuffer(data).reshape(nshape)
+        self.n_examples = self.all.shape[0]
+
+        # assign
+        if self.header_sizes is not None:
+            if len(self.header_sizes) > 4:
+                print >> sys.stderr, 'WARNING: ignoring sizes after 4th in %s' % path
+            leftmost = 0
+            #here we make use of the fact that if header_sizes has len < 4
+            # the loop will exit before 4 iterations
+            attrlist = ['input', 'target', 'weight', 'extra']
+            for attr, ncols in zip(attrlist, self.header_sizes): 
+                setattr(self, attr, self.all[:, leftmost:leftmost+ncols])
+                leftmost += ncols
+
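
A minimal usage sketch for the loader above. The path, the head value, and the
presence of '#size:'/'#sizes:' headers are illustrative assumptions, not part of
this changeset; only this module and numpy are required.

    # Hypothetical example: load an amat file and pick out the declared blocks.
    from amat import AMat

    data = AMat('/tmp/example.amat', head=1000)    # stop after 1000 data rows
    print data.n_examples                          # rows actually read
    print data.all.shape                           # (n_examples, n_columns)
    if data.input is not None:                     # set only when a '#sizes:' header was found
        print data.input.shape, data.target.shape
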
--- a/dataset.py	Tue Jun 03 16:13:42 2008 -0400
+++ b/dataset.py	Thu Jun 05 11:47:44 2008 -0400
@@ -109,10 +109,6 @@
 
      - dataset[[i1,i2,...in]] returns a dataset with examples i1,i2,...in.
 
-     - dataset[fieldname] an iterable over the values of the field fieldname across
-     the dataset (the iterable is obtained by default by calling valuesVStack
-     over the values for individual examples).
-
      - dataset.<property> returns the value of a property associated with
      the name <property>. The following properties should be supported:
           - 'description': a textual description or name for the dataset
@@ -151,9 +147,9 @@
        - __len__ if it is not a stream
        - fieldNames
        - minibatches_nowrap (called by DataSet.minibatches())
+    For efficiency of implementation, a sub-class might also want to redefine
        - valuesHStack
        - valuesVStack
-    For efficiency of implementation, a sub-class might also want to redefine
        - hasFields
        - __getitem__ may not be feasible with some streams
        - __iter__
@@ -278,7 +274,7 @@
                     first_part = self.dataset.minibatches_nowrap(self.fieldnames,self.L-self.next_row,1,self.next_row).next()
                     second_part = self.dataset.minibatches_nowrap(self.fieldnames,upper-self.L,1,0).next()
                     minibatch = Example(self.fieldnames,
-                                        [self.dataset.valuesAppend(name,[first_part[name],second_part[name]])
+                                        [self.dataset.valuesVStack(name,[first_part[name],second_part[name]])
                                          for name in self.fieldnames])
             self.next_row=upper
             self.n_batches_done+=1
@@ -412,6 +408,20 @@
         """
         return DataSetFields(self,fieldnames)
 
+    def getitem_key(self, fieldname):
+        """A not-so-well thought-out place to put code that used to be in
+        getitem.
+        """
+        #removing as per discussion June 4. --JSB
+
+        i = fieldname
+        # else check for a fieldname
+        if self.hasFields(i):
+            return self.minibatches(fieldnames=[i],minibatch_size=len(self),n_batches=1,offset=0).next()[0]
+        # else we are trying to access a property of the dataset
+        assert i in self.__dict__ # else it means we are trying to access a non-existing property
+        return self.__dict__[i]
+
     def __getitem__(self,i):
         """
         dataset[i] returns the (i+1)-th example of the dataset.
@@ -460,12 +470,7 @@
                                             for fieldname,field_values
                                             in zip(self.fieldNames(),fields_values)]),
                 self.valuesVStack,self.valuesHStack)
-        # else check for a fieldname
-        if self.hasFields(i):
-            return self.minibatches(fieldnames=[i],minibatch_size=len(self),n_batches=1,offset=0).next()[0]
-        # else we are trying to access a property of the dataset
-        assert i in self.__dict__ # else it means we are trying to access a non-existing property
-        return self.__dict__[i]
+        raise TypeError(i, type(i))
 
     def valuesHStack(self,fieldnames,fieldvalues):
         """
@@ -953,25 +958,16 @@
     Virtual super-class of datasets whose field values are numpy array,
     thus defining valuesHStack and valuesVStack for sub-classes.
     """
-    def __init__(self, description=None, field_types=None):
-        DataSet.__init__(self, description, field_types)
-    def valuesHStack(self, fieldnames, fieldvalues):
+    def __init__(self,description=None,field_types=None):
+        DataSet.__init__(self,description,field_types)
+    def valuesHStack(self,fieldnames,fieldvalues):
         """Concatenate field values horizontally, e.g. two vectors
         become a longer vector, two matrices become a wider matrix, etc."""
         return numpy.hstack(fieldvalues)
-    def valuesVStack(self, fieldname, values):
+    def valuesVStack(self,fieldname,values):
         """Concatenate field values vertically, e.g. two vectors
         become a two-row matrix, two matrices become a longer matrix, etc."""
         return numpy.vstack(values)
-    def valuesAppend(self, fieldname, values):
-        s0 = sum([v.shape[0] for v in values])
-        #TODO: there's gotta be a better way to do this!
-        rval = numpy.ndarray([s0] + values[0].shape[1:],dtype=values[0].dtype)
-        cur_row = 0
-        for v in values:
-            rval[cur_row:cur_row+v.shape[0]] = v
-            cur_row += v.shape[0]
-        return rval
 
 class ArrayDataSet(ArrayFieldsDataSet):
     """
@@ -996,7 +992,7 @@
         for fieldname, fieldcolumns in self.fields_columns.items():
             if type(fieldcolumns) is int:
                 assert fieldcolumns>=0 and fieldcolumns<data_array.shape[1]
-                if 0:
+                if 1:
                     #I changed this because it didn't make sense to me,
                     # and it made it more difficult to write my learner.
                     # If it breaks stuff, let's talk about it.
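
For numpy-backed datasets, the removed valuesAppend helper is now covered by
valuesVStack, which delegates to numpy directly. A minimal sketch of what
ArrayFieldsDataSet.valuesHStack/valuesVStack return; the arrays and shapes are
illustrative only.

    import numpy

    a = numpy.ones((2, 3))
    b = numpy.zeros((3, 3))
    # valuesVStack: stack values of one field along the example axis
    assert numpy.vstack([a, b]).shape == (5, 3)
    # valuesHStack: concatenate several fields of the same examples side by side
    assert numpy.hstack([a, numpy.ones((2, 2))]).shape == (2, 5)
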
--- a/learner.py	Tue Jun 03 16:13:42 2008 -0400
+++ b/learner.py	Thu Jun 05 11:47:44 2008 -0400
@@ -3,24 +3,24 @@
 from exceptions import *
 from dataset import AttributesHolder
 
-class LearningAlgorithm(object):
+class OfflineLearningAlgorithm(object):
     """
-    Base class for learning algorithms, provides an interface
+    Base class for offline learning algorithms, provides an interface
     that allows various algorithms to be applicable to generic learning
     algorithms. It is only given here to define the expected semantics.
 
-    A L{Learner} can be seen as a learning algorithm, a function that when
+    An offline learning algorithm can be seen as a function that when
     applied to training data returns a learned function (which is an object that
     can be applied to other data and return some output data).
 
-    There are two main ways of using a learning algorithms, and some learning
-    algorithms only support one of them. The first is the way of the standard
-    machine learning framework, in which a learning algorithm is applied
+    The offline learning scenario is the standard and most common one 
+    in machine learning:  an offline learning algorithm is applied
     to a training dataset, 
 
        model = learning_algorithm(training_set)
         
-    resulting in a fully trained model that can be applied to another dataset:
+    resulting in a fully trained model that can be applied to another dataset
+    in order to perform some desired computation:
 
         output_dataset = model(input_dataset)
 
@@ -28,10 +28,58 @@
     In that example, the training set may for example have 'input' and 'target'
     fields while the input dataset may have only 'input' (or both 'input' and
     'target') and the output dataset would contain some default output fields defined
-    by the learning algorithm (e.g. 'output' and 'error').
+    by the learning algorithm (e.g. 'output' and 'error'). The user may specify
+    what the output dataset should contain either by setting options in the
+    model, by the presence of particular fields in the input dataset, or with
+    keyword options of the __call__ method of the model (see TrainedModel.__call__).
+
+    """
+
+    def __init__(self): pass
+
+    def __call__(self, training_dataset):
+        """
+        Return a fully trained TrainedModel.
+        """
+        raise AbstractFunction()
+    
+class TrainedModel(AttributesHolder):
+    """
+    TrainedModel is a base class for models returned by instances of an
+    OfflineLearningAlgorithm subclass. It is only given here to define the expected semantics.
+    """
+    def __init__(self):
+        pass
 
-    The second way of using a learning algorithm is in the online or
-    adaptive framework, where the training data are only revealed in pieces
+    def __call__(self,input_dataset,output_fieldnames=None,
+                 test_stats_collector=None,copy_inputs=False,
+                 put_stats_in_output_dataset=True,
+                 output_attributes=[]):
+        """
+        A L{TrainedModel} can be used with one or more calls to it.
+        The main argument is an input L{DataSet} (possibly
+        containing a single example) and the result is an output L{DataSet} of the same length.
+        If output_fieldnames is specified, it may be used to indicate which fields should
+        be constructed in the output L{DataSet} (for example ['output','classification_error']).
+        Otherwise, some default output fields are produced (possibly depending on the input
+        fields available in the input_dataset).
+        Optionally, if copy_inputs, the input fields (of the input_dataset) can be made
+        visible in the output L{DataSet} returned by this method.
+        Optionally, attributes of the learner can be copied in the output dataset,
+        and statistics computed by the stats collector also put in the output dataset.
+        Note the distinction between fields (which are example-wise quantities, e.g. 'input')
+        and attributes (which are not, e.g. 'regularization_term').
+        """
+        raise AbstractFunction()
+
+
+class OnlineLearningAlgorithm(object):
+    """
+    Base class for online learning algorithms, provides an interface
+    that allows various algorithms to be used interchangeably by generic
+    online learning code. It is only given here to define the expected semantics.
+
+    The basic setting is that the training data are only revealed in pieces
     (maybe one example or a batch of examples at a time):
 
        model = learning_algorithm()
@@ -49,6 +97,9 @@
     
        output_dataset = model(input_dataset)
 
+    The model should be a LearnerModel subclass instance, and LearnerModel
+    is a subclass of TrainedModel.
+
     """
 
     def __init__(self): pass
@@ -59,7 +110,7 @@
         """
         raise AbstractFunction()
     
-class LearnerModel(AttributesHolder):
+class LearnerModel(TrainedModel):
     """
     LearnerModel is a base class for models returned by instances of a LearningAlgorithm subclass.
     It is only given here to define the expected semantics.
@@ -69,7 +120,7 @@
 
     def update(self,training_set,train_stats_collector=None):
         """
-        Continue training a learner, with the evidence provided by the given training set.
+        Continue training a learner model, with the evidence provided by the given training set.
         Hence update can be called multiple times. This is the main method used for training in the
         on-line setting or the sequential (Bayesian or not) settings.
 
@@ -82,23 +133,3 @@
         """
         raise AbstractFunction()
     
-    def __call__(self,input_dataset,output_fieldnames=None,
-                 test_stats_collector=None,copy_inputs=False,
-                 put_stats_in_output_dataset=True,
-                 output_attributes=[]):
-        """
-        A trained or partially trained L{Model} can be used with
-        with one or more calls to it. The argument is an input L{DataSet} (possibly
-        containing a single example) and the result is an output L{DataSet} of the same length.
-        If output_fieldnames is specified, it may be use to indicate which fields should
-        be constructed in the output L{DataSet} (for example ['output','classification_error']).
-        Otherwise, some default output fields are produced (possibly depending on the input
-        fields available in the input_dataset).
-        Optionally, if copy_inputs, the input fields (of the input_dataset) can be made
-        visible in the output L{DataSet} returned by this method.
-        Optionally, attributes of the learner can be copied in the output dataset,
-        and statistics computed by the stats collector also put in the output dataset.
-        Note the distinction between fields (which are example-wise quantities, e.g. 'input')
-        and attributes (which are not, e.g. 'regularization_term').
-        """
-        raise AbstractFunction()
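
The two base classes above describe distinct calling conventions. A minimal
sketch of both; MyOfflineAlgo, MyOnlineAlgo, training_set, training_stream and
input_dataset are placeholders, not part of this changeset.

    # Offline: a single call to the algorithm yields a fully trained model.
    learn = MyOfflineAlgo()                  # hypothetical OfflineLearningAlgorithm subclass
    model = learn(training_set)              # returns a TrainedModel
    output_dataset = model(input_dataset)

    # Online: the algorithm yields a model that is trained incrementally.
    learn = MyOnlineAlgo()                   # hypothetical OnlineLearningAlgorithm subclass
    model = learn()                          # returns a LearnerModel
    for piece in training_stream:            # training data revealed one piece at a time
        model.update(piece)
    output_dataset = model(input_dataset)
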
--- a/mlp_factory_approach.py	Tue Jun 03 16:13:42 2008 -0400
+++ b/mlp_factory_approach.py	Thu Jun 05 11:47:44 2008 -0400
@@ -1,10 +1,11 @@
-import copy, sys
+import copy, sys, os
 import numpy
 
 import theano
 from theano import tensor as T
 
-from pylearn import dataset, nnet_ops, stopper, LookupList
+from pylearn import dataset, nnet_ops, stopper, LookupList, filetensor
+
 
 class AbstractFunction (Exception): pass
 
@@ -35,6 +36,17 @@
             raise Exception('why not called?') 
             return GraphLearner.Model(self.algo, [copy.copy(p) for p in params])
 
+        def __eq__(self,other,tolerance=0.) :
+            """ Only compares weights of matrices and bias vector. """
+            if not isinstance(other,GraphLearner.Model) :
+                return False
+            for p in range(4) :
+                if self.params[p].shape != other.params[p].shape :
+                    return False
+                if not numpy.all( numpy.abs(self.params[p] - other.params[p]) <= tolerance ) :                    
+                    return False
+            return True
+
         def _cache(self, key, valfn):
             d = self._fn_cache
             if key not in d:
@@ -42,7 +54,7 @@
             return d[key]
 
         def update_minibatch(self, minibatch):
-            assert isinstance(minibatch, LookupList)
+            #assert isinstance(minibatch, LookupList) # why false???
             self.update_fn(minibatch['input'], minibatch['target'], *self.params)
 
         def update(self, dataset, 
@@ -53,6 +65,9 @@
             for mb in dataset.minibatches(['input', 'target'], minibatch_size=minibatch_size):
                 self.update_minibatch(mb)
 
+        def save(self, f):
+            self.algo.graph.save(f, self)
+
         def __call__(self, testset, fieldnames=['output_class']):
             """Apply this model (as a function) to new data.
 
@@ -111,6 +126,13 @@
             raise AbstractFunction
         optimizer = Opt()
 
+        def load(self,f) :
+            raise AbstractFunction
+
+        def save(self,f,model) :
+            raise AbstractFunction
+
+
     def __init__(self, graph):
         self.graph = graph
 
@@ -145,7 +167,15 @@
         @rtype: GraphLearner.Model instance
         
         """
+        
         iparams = self.graph.iparams() if iparams is None else iparams
+
+        # when loading a saved model, trainset is a filename (str) or an open file
+        if isinstance(trainset,str) or isinstance(trainset,file):
+            #loadmodel = GraphLearner.Model(self, iparams)
+            loadmodel = self.graph.load(self,trainset)
+            return loadmodel
+
         curmodel = GraphLearner.Model(self, iparams)
         best = curmodel
         
@@ -166,7 +196,10 @@
                 curmodel = best
         return curmodel
 
+
 def graphMLP(ninputs, nhid, nclass, lr_val, l2coef_val=0.0):
+
+
     def wrapper(i, node, thunk):
         if 0:
             print i, node
@@ -199,7 +232,39 @@
         g_params = T.grad(nll, params)
         new_params = [T.sub_inplace(p, lr * gp) for p,gp in zip(params, g_params)]
 
+            
+        def __eq__(self,other) :
+            print 'G.__eq__ from graphMLP(), not implemented yet'
+            return NotImplemented
+
+
+        def load(self, algo, f):
+            """ Load from file the 2 matrices and bias vectors """
+            cloase_at_end = False
+            if isinstance(f,str) :
+                f = open(f,'r')
+                close_at_end = True
+            params = []
+            for i in xrange(4):
+                params.append(filetensor.read(f))
+            if close_at_end :
+                f.close()
+            return GraphLearner.Model(algo, params)
+
+        def save(self, f, model):
+            """ Save params to file, so 2 matrices and 2 bias vectors. Same order as iparams. """
+            cloase_at_end = False
+            if isinstance(f,str) :
+                f = open(f,'w')
+                close_at_end = True
+            for p in model.params:
+                filetensor.write(f,p)
+            if close_at_end :
+                f.close()
+
+
         def iparams(self):
+            """ init params. """
             def randsmall(*shape): 
                 return (numpy.random.rand(*shape) -0.5) * 0.001
             return [randsmall(ninputs, nhid)
@@ -250,6 +315,26 @@
         self.failUnless(n_match ==  (numpy.sum(training_set1.fields()['target'] ==
                 training_set2.fields()['target'])), omatch)
 
+        model1.save('/tmp/model1')
+        
+        #denoising_aa = GraphLearner(denoising_g)
+        #model1 = denoising_aa(trainset)
+        #hidset = model(trainset, fieldnames=['hidden'])
+        #model2 = denoising_aa(hidset)
+        
+        #f = open('blah', 'w')
+        #for m in model:
+        #    m.save(f)
+        #filetensor.write(f, initial_classification_weights)
+        #f.flush()
+
+        #deep_sigmoid_net = GraphLearner(deepnetwork_g)
+        #deep_model = deep_sigmoid_net.load('blah')
+        #deep_model.update(trainset)  #do some fine tuning
+
+        model1_dup = learn_algo('/tmp/model1')
+
+
     def equiv(self, g0, g1):
         training_set1 = dataset.ArrayDataSet(numpy.array([[0, 0, 0],
                                                          [0, 1, 1],
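
The save/load path added above can be exercised end to end, roughly as the test
does. A minimal sketch, assuming graphMLP(...) returns the Graph instance consumed
by GraphLearner; the file name, layer sizes and trainset are placeholders.

    graph = graphMLP(ninputs=3, nhid=10, nclass=2, lr_val=0.01)
    learn_algo = GraphLearner(graph)
    model1 = learn_algo(trainset)            # trainset: any dataset with 'input'/'target' fields
    model1.save('/tmp/model1')               # Model.save -> graph.save -> filetensor.write x4
    model1_dup = learn_algo('/tmp/model1')   # passing a str makes __call__ go through graph.load
    assert model1 == model1_dup              # Model.__eq__ compares the 4 parameter arrays
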
--- a/test_dataset.py	Tue Jun 03 16:13:42 2008 -0400
+++ b/test_dataset.py	Thu Jun 05 11:47:44 2008 -0400
@@ -305,49 +305,52 @@
 #ds[fieldname]# an iterable over the values of the field fieldname across
   #the ds (the iterable is obtained by default by calling valuesVStack
   #over the values for individual examples).
-    assert have_raised("ds['h']")  # h is not defined...
-    assert have_raised("ds[['x']]")  # bad syntax
-    assert not have_raised("var['ds']['x']",ds=ds)
-    isinstance(ds['x'],DataSetFields)
-    ds2=ds['x']
-    assert len(ds['x'])==10
-    assert len(ds['y'])==10
-    assert len(ds['z'])==10
-    i=0
-    for example in ds['x']:
-        assert (example==array[i][:3]).all()
-        i+=1
-    assert i==len(ds)
-    i=0
-    for example in ds['y']:
-        assert (example==array[i][3]).all()
-        i+=1
-    assert i==len(ds)
-    i=0
-    for example in ds['z']:
-        assert (example==array[i,0:3:2]).all()
-        i+=1
-    assert i==len(ds)
-    del ds2,i
+    if 0:
+        assert have_raised("ds['h']")  # h is not defined...
+        assert have_raised("ds[['x']]")  # bad syntax
+        assert not have_raised("var['ds']['x']",ds=ds)
+        isinstance(ds['x'],DataSetFields)
+        ds2=ds['x']
+        assert len(ds['x'])==10
+        assert len(ds['y'])==10
+        assert len(ds['z'])==10
+        i=0
+        for example in ds['x']:
+            assert (example==array[i][:3]).all()
+            i+=1
+        assert i==len(ds)
+        i=0
+        for example in ds['y']:
+            assert (example==array[i][3]).all()
+            i+=1
+        assert i==len(ds)
+        i=0
+        for example in ds['z']:
+            assert (example==array[i,0:3:2]).all()
+            i+=1
+        assert i==len(ds)
+        del ds2,i
+    else:
+        print 'warning: ds[fieldname] is deprecated... Fred could you fix this test?'
 
-#ds.<property># returns the value of a property associated with
-  #the name <property>. The following properties should be supported:
-  #    - 'description': a textual description or name for the ds
-  #    - 'fieldtypes': a list of types (one per field)
+    #ds.<property># returns the value of a property associated with
+      #the name <property>. The following properties should be supported:
+      #    - 'description': a textual description or name for the ds
+      #    - 'fieldtypes': a list of types (one per field)
 
-#* ds1 | ds2 | ds3 == ds.hstack([ds1,ds2,ds3])#????
-    #assert hstack([ds('x','y'),ds('z')])==ds
-    #hstack([ds('z','y'),ds('x')])==ds
+    #* ds1 | ds2 | ds3 == ds.hstack([ds1,ds2,ds3])#????
+        #assert hstack([ds('x','y'),ds('z')])==ds
+        #hstack([ds('z','y'),ds('x')])==ds
     assert have_raised2(hstack,[ds('x'),ds('x')])
     assert have_raised2(hstack,[ds('y','x'),ds('x')])
     assert not have_raised2(hstack,[ds('x'),ds('y')])
-    
-#        i=0
-#        for example in hstack([ds('x'),ds('y'),ds('z')]):
-#            example==ds[i]
-#            i+=1 
-#        del i,example
-#* ds1 & ds2 & ds3 == ds.vstack([ds1,ds2,ds3])#????
+        
+    #        i=0
+    #        for example in hstack([ds('x'),ds('y'),ds('z')]):
+    #            example==ds[i]
+    #            i+=1 
+    #        del i,example
+    #* ds1 & ds2 & ds3 == ds.vstack([ds1,ds2,ds3])#????
 
 def test_fields_fct(ds):
     #@todo, fill correctly
@@ -544,8 +547,6 @@
     f_array_iter(array)
 
     f_ds_index(ds)
-    f_ds_index(ds)
-    f_ds_iter(ds)
     f_ds_iter(ds)
 
     f_ds_mb1(ds,10)