Mercurial > pylearn
changeset 226:3595ba2610f7
merged
author | James Bergstra <bergstrj@iro.umontreal.ca> |
---|---|
date | Fri, 23 May 2008 17:12:12 -0400 |
parents | 8bc16220b29a (current diff) 517364d48ae0 (diff) |
children | 17c5d080964b |
files | mlp_factory_approach.py |
diffstat | 5 files changed, 145 insertions(+), 24 deletions(-) [+] |
line wrap: on
line diff
--- a/_test_dataset.py Fri May 23 17:11:39 2008 -0400
+++ b/_test_dataset.py Fri May 23 17:12:12 2008 -0400
@@ -1,6 +1,8 @@
 from dataset import *
 from math import *
 import unittest
+import sys
+import numpy as N
 
 def _sum_all(a):
     s=a
@@ -92,6 +94,90 @@
     print b('x+y')
 
 
+
+
+# to be used with a any new dataset
+class T_dataset_tester(object):
+    """
+    This class' goal is to test any new dataset that is created
+    Tests are (will be!) designed to check the normal behaviours
+    of a dataset, as defined in dataset.py
+    """
+
+
+    def __init__(self,ds,runall=True) :
+        """if interested in only a subset of test, init with runall=False"""
+        self.ds = ds
+
+        if runall :
+            self.test1_basicstats(ds)
+            self.test2_slicing(ds)
+            self.test3_fields_iterator_consistency(ds)
+
+    def test1_basicstats(self,ds) :
+        """print basics stats on a dataset, like length"""
+
+        print 'len(ds) = ',len(ds)
+        print 'num fields = ', len(ds.fieldNames())
+        print 'types of field: ',
+        for k in ds.fieldNames() :
+            print type(ds[0](k)[0]),
+        print ''
+
+    def test2_slicing(self,ds) :
+        """test if slicing works properly"""
+        print 'testing slicing...',
+        sys.stdout.flush()
+
+        middle = len(ds) / 2
+        tenpercent = int(len(ds) * .1)
+        set1 = ds[:middle+tenpercent]
+        set2 = ds[middle-tenpercent:]
+        for k in range(tenpercent + tenpercent -1):
+            for k2 in ds.fieldNames() :
+                if type(set1[middle-tenpercent+k](k2)[0]) == N.ndarray :
+                    for k3 in range(len(set1[middle-tenpercent+k](k2)[0])) :
+                        assert set1[middle-tenpercent+k](k2)[0][k3] == set2[k](k2)[0][k3]
+                else :
+                    assert set1[middle-tenpercent+k](k2)[0] == set2[k](k2)[0]
+        assert tenpercent > 1
+        set3 = ds[middle-tenpercent:middle+tenpercent:2]
+        for k2 in ds.fieldNames() :
+            if type(set2[2](k2)[0]) == N.ndarray :
+                for k3 in range(len(set2[2](k2)[0])) :
+                    assert set2[2](k2)[0][k3] == set3[1](k2)[0][k3]
+            else :
+                assert set2[2](k2)[0] == set3[1](k2)[0]
+
+        print 'done'
+
+
+    def test3_fields_iterator_consistency(self,ds) :
+        """ check if the number of iterator corresponds to the number of fields"""
+        print 'testing fields/iterator consistency...',
+        sys.stdout.flush()
+
+        # basic test
+        maxsize = min(len(ds)-1,100)
+        for iter in ds[:maxsize] :
+            assert len(iter) == len(ds.fieldNames())
+        if len(ds.fieldNames()) == 1 :
+            print 'done'
+            return
+
+        # with minibatches iterator
+        ds2 = ds.minibatches[:maxsize]([ds.fieldNames()[0],ds.fieldNames()[1]],minibatch_size=2)
+        for iter in ds2 :
+            assert len(iter) == 2
+
+        print 'done'
+
+
+
+
+
+###################################################################
+# main
 
 if __name__ == '__main__':
     unittest.main()
--- a/dataset.py Fri May 23 17:11:39 2008 -0400
+++ b/dataset.py Fri May 23 17:12:12 2008 -0400
@@ -245,8 +245,7 @@
             if n_batches is not None:
                 ds_nbatches = min(n_batches,ds_nbatches)
             if fieldnames:
-                if not dataset.hasFields(*fieldnames):
-                    raise ValueError('field not present', fieldnames)
+                assert dataset.hasFields(*fieldnames)
             else:
                 self.fieldnames=dataset.fieldNames()
             self.iterator = self.dataset.minibatches_nowrap(self.fieldnames,self.minibatch_size,
@@ -670,6 +669,11 @@
         assert len(fields_lookuplist)>0
         self.length=len(fields_lookuplist[0])
         for field in fields_lookuplist[1:]:
+            if self.length != len(field) :
+                print 'self.length = ',self.length
+                print 'len(field) = ', len(field)
+                print 'self._fields.keys() = ', self._fields.keys()
+                print 'field=',field
             assert self.length==len(field)
         self.values_vstack=values_vstack
         self.values_hstack=values_hstack
@@ -698,8 +702,13 @@
         return True
 
     def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
+        #@TODO bug somewhere here, fieldnames doesnt seem to be well handled
         class Iterator(object):
-            def __init__(self,ds):
+            def __init__(self,ds,fieldnames):
+                # tbm: added two next lines to handle fieldnames
+                if fieldnames is None: fieldnames = ds._fields.keys()
+                self.fieldnames = fieldnames
+
                 self.ds=ds
                 self.next_example=offset
                 assert minibatch_size > 0
@@ -710,13 +719,21 @@
             def next(self):
                 upper = self.next_example+minibatch_size
                 assert upper<=self.ds.length
-                minibatch = Example(self.ds._fields.keys(),
-                                    [field[self.next_example:upper]
-                                     for field in self.ds._fields])
+                #minibatch = Example(self.ds._fields.keys(),
+                #                    [field[self.next_example:upper]
+                #                     for field in self.ds._fields])
+                # tbm: modif to use fieldnames
+                values = []
+                for f in self.fieldnames :
+                    #print 'we have field',f,'in fieldnames'
+                    values.append( self.ds._fields[f][self.next_example:upper] )
+                minibatch = Example(self.fieldnames,values)
+                #print minibatch
                 self.next_example+=minibatch_size
                 return minibatch
 
-        return Iterator(self)
+        # tbm: added fieldnames to handle subset of fieldnames
+        return Iterator(self,fieldnames)
 
     def valuesVStack(self,fieldname,fieldvalues):
         return self.values_vstack(fieldname,fieldvalues)
@@ -970,16 +987,7 @@
         for fieldname, fieldcolumns in self.fields_columns.items():
             if type(fieldcolumns) is int:
                 assert fieldcolumns>=0 and fieldcolumns<data_array.shape[1]
-
-                if 0:
-                    #I changed this because it didn't make sense to me,
-                    # and it made it more difficult to write my learner.
-                    # If it breaks stuff, let's talk about it.
-                    # - James 22/05/2008
-                    self.fields_columns[fieldname]=[fieldcolumns]
-                else:
-                    self.fields_columns[fieldname]=fieldcolumns
-
+                self.fields_columns[fieldname]=[fieldcolumns]
             elif type(fieldcolumns) is slice:
                 start,step=None,None
                 if not fieldcolumns.start:
--- a/denoising_aa.py Fri May 23 17:11:39 2008 -0400
+++ b/denoising_aa.py Fri May 23 17:12:12 2008 -0400
@@ -31,6 +31,7 @@
 
 def squash_affine_formula(squash_function=sigmoid):
     """
+    Simply does: squash_function(b + xW)
     By convention prefix the parameters by _
     """
     class SquashAffineFormula(Formulas):
@@ -53,7 +54,7 @@
     class ProbabilisticClassifierLossFormula(Formulas):
         a = t.matrix() # of dimensions minibatch_size x n_classes, pre-softmax output
         target_class = t.ivector() # dimension (minibatch_size)
-        nll, probability_predictions = crossentropy_softmax_1hot(a, target_class)
+        nll, probability_predictions = crossentropy_softmax_1hot(a, target_class) # defined in nnet_ops.py
     return ProbabilisticClassifierLossFormula()
 
 def binomial_cross_entropy_formula():
@@ -64,6 +65,8 @@
         # using the identity softplus(a) - softplus(-a) = a,
         # we obtain that q log(p) + (1-q) log(1-p) = q a - softplus(a)
         nll = -t.sum(q*a - softplus(-a))
+    # next line was missing... hope it's all correct above
+    return BinomialCrossEntropyFormula()
 
 def squash_affine_autoencoder_formula(hidden_squash=t.tanh,
                                       reconstruction_squash=sigmoid,
@@ -102,9 +105,33 @@
         self.denoising_autoencoder_formula = corruption_formula + autoencoder.rename(x='corrupted_x')
 
     def __call__(self, training_set=None):
+        """ Allocate and optionnaly train a model"""
         model = DenoisingAutoEncoderModel(self)
         if training_set:
-            print 'what do I do if training set????'
+            print 'DenoisingAutoEncoder(): what do I do if training_set????'
+            # copied from mlp_factory_approach:
+            if len(trainset) == sys.maxint:
+                raise NotImplementedError('Learning from infinite streams is not supported')
+            nval = int(self.validation_portion * len(trainset))
+            nmin = len(trainset) - nval
+            assert nmin >= 0
+            minset = trainset[:nmin] #real training set for minimizing loss
+            valset = trainset[nmin:] #validation set for early stopping
+            best = model
+            for stp in self.early_stopper():
+                model.update(
+                    minset.minibatches([input, target], minibatch_size=min(32,
+                        len(trainset))))
+                #print 'mlp.__call__(), we did an update'
+                if stp.set_score:
+                    stp.score = model(valset, ['loss_01'])
+                    if (stp.score < stp.best_score):
+                        best = copy.copy(model)
+            model = best
+            # end of the copy from mlp_factory_approach
+
+        return model
+
 
     def compile(self, inputs, outputs):
         return theano.function(inputs,outputs,unpack_single=False,linker=self.linker)
--- a/mlp_factory_approach.py Fri May 23 17:11:39 2008 -0400
+++ b/mlp_factory_approach.py Fri May 23 17:12:12 2008 -0400
@@ -17,7 +17,7 @@
 import theano
 from theano import tensor as t
 
-import dataset, nnet_ops, stopper
+from pylearn import dataset, nnet_ops, stopper
 
 
 def _randshape(*shape):
@@ -44,18 +44,19 @@
             """Update this model from more training data."""
             params = self.params
             #TODO: why should we have to unpack target like this?
+            # tbm : creates problem...
             for input, target in input_target:
                 rval= self.update_fn(input, target[:,0], *params)
                 #print rval[0]
 
-        def __call__(self, testset, fieldnames=['output_class']):
+        def __call__(self, testset, fieldnames=['output_class'],input='input',target='target'):
             """Apply this model (as a function) to new data"""
             #TODO: cache fn between calls
-            assert 'input' == testset.fieldNames()[0]
+            assert input == testset.fieldNames()[0] # why first one???
             assert len(testset.fieldNames()) <= 2
             v = self.algo.v
             outputs = [getattr(v, name) for name in fieldnames]
-            inputs = [v.input] + ([v.target] if 'target' in testset else [])
+            inputs = [v.input] + ([v.target] if target in testset else [])
             inputs.extend(v.params)
             theano_fn = _cache(self._fn_cache, (tuple(inputs), tuple(outputs)),
                     lambda: self.algo._fn(inputs, outputs))