changeset 231:38beb81f4e8b
Automated merge with ssh://projects@lgcm.iro.umontreal.ca/hg/pylearn
| author | Frederic Bastien <bastienf@iro.umontreal.ca> |
|---|---|
| date | Tue, 27 May 2008 13:46:03 -0400 |
| parents | 17c5d080964b (diff) 4d1bd2513e06 (current diff) |
| children | c047238e5b3f 9e96fe8b955c |
| files | dataset.py test_dataset.py |
| diffstat | 9 files changed, 375 insertions(+), 17 deletions(-) |
```diff
--- a/__init__.py	Fri May 23 10:21:52 2008 -0400
+++ b/__init__.py	Tue May 27 13:46:03 2008 -0400
@@ -1,2 +1,5 @@
 import filetensor
 import nnet_ops
+
+from lookup_list import LookupList
+
```
```diff
--- a/_test_dataset.py	Fri May 23 10:21:52 2008 -0400
+++ b/_test_dataset.py	Tue May 27 13:46:03 2008 -0400
@@ -1,6 +1,8 @@
 from dataset import *
 from math import *
 import unittest
+import sys
+import numpy as N
 
 def _sum_all(a):
     s=a
@@ -92,6 +94,90 @@
     print b('x+y')
 
 
+
+
+# to be used with a any new dataset
+class T_dataset_tester(object):
+    """
+    This class' goal is to test any new dataset that is created
+    Tests are (will be!) designed to check the normal behaviours
+    of a dataset, as defined in dataset.py
+    """
+
+
+    def __init__(self,ds,runall=True) :
+        """if interested in only a subset of test, init with runall=False"""
+        self.ds = ds
+
+        if runall :
+            self.test1_basicstats(ds)
+            self.test2_slicing(ds)
+            self.test3_fields_iterator_consistency(ds)
+
+    def test1_basicstats(self,ds) :
+        """print basics stats on a dataset, like length"""
+
+        print 'len(ds) = ',len(ds)
+        print 'num fields = ', len(ds.fieldNames())
+        print 'types of field: ',
+        for k in ds.fieldNames() :
+            print type(ds[0](k)[0]),
+        print ''
+
+    def test2_slicing(self,ds) :
+        """test if slicing works properly"""
+        print 'testing slicing...',
+        sys.stdout.flush()
+
+        middle = len(ds) / 2
+        tenpercent = int(len(ds) * .1)
+        set1 = ds[:middle+tenpercent]
+        set2 = ds[middle-tenpercent:]
+        for k in range(tenpercent + tenpercent -1):
+            for k2 in ds.fieldNames() :
+                if type(set1[middle-tenpercent+k](k2)[0]) == N.ndarray :
+                    for k3 in range(len(set1[middle-tenpercent+k](k2)[0])) :
+                        assert set1[middle-tenpercent+k](k2)[0][k3] == set2[k](k2)[0][k3]
+                else :
+                    assert set1[middle-tenpercent+k](k2)[0] == set2[k](k2)[0]
+        assert tenpercent > 1
+        set3 = ds[middle-tenpercent:middle+tenpercent:2]
+        for k2 in ds.fieldNames() :
+            if type(set2[2](k2)[0]) == N.ndarray :
+                for k3 in range(len(set2[2](k2)[0])) :
+                    assert set2[2](k2)[0][k3] == set3[1](k2)[0][k3]
+            else :
+                assert set2[2](k2)[0] == set3[1](k2)[0]
+
+        print 'done'
+
+
+    def test3_fields_iterator_consistency(self,ds) :
+        """ check if the number of iterator corresponds to the number of fields"""
+        print 'testing fields/iterator consistency...',
+        sys.stdout.flush()
+
+        # basic test
+        maxsize = min(len(ds)-1,100)
+        for iter in ds[:maxsize] :
+            assert len(iter) == len(ds.fieldNames())
+        if len(ds.fieldNames()) == 1 :
+            print 'done'
+            return
+
+        # with minibatches iterator
+        ds2 = ds.minibatches[:maxsize]([ds.fieldNames()[0],ds.fieldNames()[1]],minibatch_size=2)
+        for iter in ds2 :
+            assert len(iter) == 2
+
+        print 'done'
+
+
+
+
+
+###################################################################
+# main
 if __name__ == '__main__':
     unittest.main()
```
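For reference, a minimal, hypothetical driver for the `T_dataset_tester` class added above. It is a sketch only: the `ArrayDataSet` keyword names (`data_array`, `fields_columns`) are assumptions inferred from the `dataset.py` hunks below, not something this changeset guarantees.

```python
# Hypothetical usage sketch -- ArrayDataSet's constructor keywords are assumed.
import numpy

from dataset import ArrayDataSet
from _test_dataset import T_dataset_tester

data = numpy.random.rand(50, 3)
ds = ArrayDataSet(data_array=data,
                  fields_columns={'x': slice(0, 2), 'y': 2})
T_dataset_tester(ds)   # runall=True runs the three checks defined above
```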
```diff
--- a/dataset.py	Fri May 23 10:21:52 2008 -0400
+++ b/dataset.py	Tue May 27 13:46:03 2008 -0400
@@ -442,7 +442,9 @@
             rows=None
             # or a slice
             if type(i) is slice:
+                #print 'i=',i
                 if not i.start: i=slice(0,i.stop,i.step)
+                if not i.stop: i=slice(i.start,len(self),i.step)
                 if not i.step: i=slice(i.start,i.stop,1)
                 if i.step is 1:
                     return self.minibatches(minibatch_size=i.stop-i.start,n_batches=1,offset=i.start).next().examples()
@@ -662,10 +664,16 @@
         and a values_hstack(fieldnames,fieldvalues) functions
         behaving with the same semantics as the DataSet methods of the same name
         (but without the self argument).
         """
+        self._fields=fields_lookuplist
         assert len(fields_lookuplist)>0
         self.length=len(fields_lookuplist[0])
         for field in fields_lookuplist[1:]:
+            if self.length != len(field) :
+                print 'self.length = ',self.length
+                print 'len(field) = ', len(field)
+                print 'self._fields.keys() = ', self._fields.keys()
+                print 'field=',field
             assert self.length==len(field)
         self.values_vstack=values_vstack
         self.values_hstack=values_hstack
@@ -694,8 +702,13 @@
         return True
 
     def minibatches_nowrap(self,fieldnames,minibatch_size,n_batches,offset):
+        #@TODO bug somewhere here, fieldnames doesnt seem to be well handled
         class Iterator(object):
-            def __init__(self,ds):
+            def __init__(self,ds,fieldnames):
+                # tbm: added two next lines to handle fieldnames
+                if fieldnames is None: fieldnames = ds._fields.keys()
+                self.fieldnames = fieldnames
+
                 self.ds=ds
                 self.next_example=offset
                 assert minibatch_size > 0
@@ -706,13 +719,21 @@
             def next(self):
                 upper = self.next_example+minibatch_size
                 assert upper<=self.ds.length
-                minibatch = Example(self.ds._fields.keys(),
-                                    [field[self.next_example:upper]
-                                     for field in self.ds._fields])
+                #minibatch = Example(self.ds._fields.keys(),
+                #                    [field[self.next_example:upper]
+                #                     for field in self.ds._fields])
+                # tbm: modif to use fieldnames
+                values = []
+                for f in self.fieldnames :
+                    #print 'we have field',f,'in fieldnames'
+                    values.append( self.ds._fields[f][self.next_example:upper] )
+                minibatch = Example(self.fieldnames,values)
+                #print minibatch
                 self.next_example+=minibatch_size
                 return minibatch
 
-        return Iterator(self)
+        # tbm: added fieldnames to handle subset of fieldnames
+        return Iterator(self,fieldnames)
 
     def valuesVStack(self,fieldname,fieldvalues):
         return self.values_vstack(fieldname,fieldvalues)
@@ -966,7 +987,14 @@
         for fieldname, fieldcolumns in self.fields_columns.items():
             if type(fieldcolumns) is int:
                 assert fieldcolumns>=0 and fieldcolumns<data_array.shape[1]
-                self.fields_columns[fieldname]=[fieldcolumns]
+                if 0:
+                    #I changed this because it didn't make sense to me,
+                    # and it made it more difficult to write my learner.
+                    # If it breaks stuff, let's talk about it.
+                    # - James 22/05/2008
+                    self.fields_columns[fieldname]=[fieldcolumns]
+                else:
+                    self.fields_columns[fieldname]=fieldcolumns
             elif type(fieldcolumns) is slice:
                 start,step=None,None
                 if not fieldcolumns.start:
@@ -1165,6 +1193,9 @@
         Note that the expected semantics of the function differs in minibatch mode
         (it takes minibatches of inputs and produces minibatches of outputs, as
         documented in the class comment).
+
+        TBM: are filedtypes the old field types (from input_dataset) or the new ones
+        (for the new dataset created)?
         """
         self.input_dataset=input_dataset
         self.function=function
@@ -1207,6 +1238,7 @@
                     return all_outputs
                 return Example(fieldnames,[all_outputs[name] for name in fieldnames])
+            return ApplyFunctionIterator(self)
 
     def __iter__(self): # only implemented for increased efficiency
```
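The first `dataset.py` hunk fixes open-ended slices such as `ds[middle:]` by substituting `len(self)` for the missing `stop`. A stand-alone sketch of that normalization logic (mirroring the patch's truthiness checks, quirks included):

```python
# Mirrors the slice normalization in __getitem__ after this patch:
# missing start/stop/step are filled in before minibatch bounds are computed.
def normalize_slice(i, length):
    if not i.start: i = slice(0, i.stop, i.step)
    if not i.stop:  i = slice(i.start, length, i.step)   # the newly added case
    if not i.step:  i = slice(i.start, i.stop, 1)
    return i

s = normalize_slice(slice(5, None), length=10)
assert (s.start, s.stop, s.step) == (5, 10, 1)
```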
```diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/denoising_aa.py	Tue May 27 13:46:03 2008 -0400
@@ -0,0 +1,216 @@
+"""
+A denoising auto-encoder
+"""
+
+import theano
+from theano.formula import *
+from learner import *
+from theano import tensor as t
+from nnet_ops import *
+import math
+from misc import *
+from theano.tensor_random import binomial
+
+def hiding_corruption_formula(seed,average_fraction_hidden):
+    """
+    Return a formula for the corruption process, in which a random
+    subset of the input numbers are hidden (mapped to 0).
+
+    @param seed: seed of the random generator
+    @type seed: anything that numpy.random.RandomState accepts
+
+    @param average_fraction_hidden: the probability with which each
+                                    input number is hidden (set to 0).
+    @type average_fraction_hidden: 0 <= real number <= 1
+    """
+    class HidingCorruptionFormula(Formulas):
+        x = t.matrix()
+        corrupted_x = x * binomial(seed,x,1,fraction_sampled)
+
+    return HidingCorruptionFormula()
+
+def squash_affine_formula(squash_function=sigmoid):
+    """
+    Simply does: squash_function(b + xW)
+    By convention prefix the parameters by _
+    """
+    class SquashAffineFormula(Formulas):
+        x = t.matrix() # of dimensions minibatch_size x n_inputs
+        _b = t.row() # of dimensions 1 x n_outputs
+        _W = t.matrix() # of dimensions n_inputs x n_outputs
+        a = _b + t.dot(x,_W) # of dimensions minibatch_size x n_outputs
+        y = squash_function(a)
+    return SquashAffineFormula()
+
+def gradient_descent_update_formula():
+    class GradientDescentUpdateFormula(Formula):
+        param = t.matrix()
+        learning_rate = t.scalar()
+        cost = t.column() # cost of each example in a minibatch
+        param_update = t.add_inplace(param, -learning_rate*t.sgrad(cost))
+    return gradient_descent_update_formula()
+
+def probabilistic_classifier_loss_formula():
+    class ProbabilisticClassifierLossFormula(Formulas):
+        a = t.matrix() # of dimensions minibatch_size x n_classes, pre-softmax output
+        target_class = t.ivector() # dimension (minibatch_size)
+        nll, probability_predictions = crossentropy_softmax_1hot(a, target_class) # defined in nnet_ops.py
+    return ProbabilisticClassifierLossFormula()
+
+def binomial_cross_entropy_formula():
+    class BinomialCrossEntropyFormula(Formulas):
+        a = t.matrix() # pre-sigmoid activations, minibatch_size x dim
+        p = sigmoid(a) # model prediction
+        q = t.matrix() # target binomial probabilities, minibatch_size x dim
+        # using the identity softplus(a) - softplus(-a) = a,
+        # we obtain that q log(p) + (1-q) log(1-p) = q a - softplus(a)
+        nll = -t.sum(q*a - softplus(-a))
+        # next line was missing... hope it's all correct above
+    return BinomialCrossEntropyFormula()
+
+def squash_affine_autoencoder_formula(hidden_squash=t.tanh,
+                                      reconstruction_squash=sigmoid,
+                                      share_weights=True,
+                                      reconstruction_nll_formula=binomial_cross_entropy_formula(),
+                                      update_formula=gradient_descent_update_formula):
+    if share_weights:
+        autoencoder = squash_affine_formula(hidden_squash).rename(a='code_a') + \
+                      squash_affine_formula(reconstruction_squash).rename(x='hidden',y='reconstruction',_b='_c') + \
+                      reconstruction_nll_formula
+    else:
+        autoencoder = squash_affine_formula(hidden_squash).rename(a='code_a',_W='_W1') + \
+                      squash_affine_formula(reconstruction_squash).rename(x='hidden',y='reconstruction',_b='_c',_W='_W2') + \
+                      reconstruction_nll_formula
+    autoencoder = autoencoder + [update_formula().rename(cost = 'nll',
+                                                         param = p)
+                                 for p in autoencoder.get_all('_.*')]
+    return autoencoder
+
+
+# @todo: try other corruption formulae. The above is the default one.
+# not quite used in the ICML paper... (had a fixed number of 0s).
+
+class DenoisingAutoEncoder(LearningAlgorithm):
+
+    def __init__(self,n_inputs,n_hidden_per_layer,
+                 learning_rate=0.1,
+                 max_n_epochs=100,
+                 L1_regularizer=0,
+                 init_range=1.,
+                 corruption_formula = hiding_corruption_formula(),
+                 autoencoder = squash_affine_autoencoder_formula(),
+                 minibatch_size=None,linker = "c|py"):
+        for name,val in locals().items():
+            if val is not self: self.__setattribute__(name,val)
+        self.denoising_autoencoder_formula = corruption_formula + autoencoder.rename(x='corrupted_x')
+
+    def __call__(self, training_set=None):
+        """ Allocate and optionnaly train a model"""
+        model = DenoisingAutoEncoderModel(self)
+        if training_set:
+            print 'DenoisingAutoEncoder(): what do I do if training_set????'
+            # copied from mlp_factory_approach:
+            if len(trainset) == sys.maxint:
+                raise NotImplementedError('Learning from infinite streams is not supported')
+            nval = int(self.validation_portion * len(trainset))
+            nmin = len(trainset) - nval
+            assert nmin >= 0
+            minset = trainset[:nmin] #real training set for minimizing loss
+            valset = trainset[nmin:] #validation set for early stopping
+            best = model
+            for stp in self.early_stopper():
+                model.update(
+                    minset.minibatches([input, target], minibatch_size=min(32,
+                        len(trainset))))
+                #print 'mlp.__call__(), we did an update'
+                if stp.set_score:
+                    stp.score = model(valset, ['loss_01'])
+                    if (stp.score < stp.best_score):
+                        best = copy.copy(model)
+            model = best
+            # end of the copy from mlp_factory_approach
+
+        return model
+
+
+    def compile(self, inputs, outputs):
+        return theano.function(inputs,outputs,unpack_single=False,linker=self.linker)
+
+class DenoisingAutoEncoderModel(LearnerModel):
+    def __init__(self,learning_algorithm,params):
+        self.learning_algorithm=learning_algorithm
+        self.params=params
+        v = learning_algorithm.v
+        self.update_fn = learning_algorithm.compile(learning_algorithm.denoising_autoencoder_formula.inputs,
+                                                    learning_algorithm.denoising_autoencoder_formula.outputs)
+
+    def update(self, training_set, train_stats_collector=None):
+
+        print 'dont update you crazy frog!'
+
+# old stuff
+
+#         self._learning_rate = t.scalar('learning_rate') # this is the symbol
+#         self.L1_regularizer = L1_regularizer
+#         self._L1_regularizer = t.scalar('L1_regularizer')
+#         self._input = t.matrix('input') # n_examples x n_inputs
+#         self._W = t.matrix('W')
+#         self._b = t.row('b')
+#         self._c = t.row('b')
+#         self._regularization_term = self._L1_regularizer * t.sum(t.abs(self._W))
+#         self._corrupted_input = corruption_process(self._input)
+#         self._hidden = t.tanh(self._b + t.dot(self._input, self._W.T))
+#         self._reconstruction_activations =self._c+t.dot(self._hidden,self._W)
+#         self._nll,self._output = crossentropy_softmax_1hot(Print("output_activations")(self._output_activations),self._target_vector)
+#         self._output_class = t.argmax(self._output,1)
+#         self._class_error = t.neq(self._output_class,self._target_vector)
+#         self._minibatch_criterion = self._nll + self._regularization_term / t.shape(self._input)[0]
+#         OnlineGradientTLearner.__init__(self)
+
+#     def attributeNames(self):
+#         return ["parameters","b1","W2","b2","W2", "L2_regularizer","regularization_term"]
+
+#     def parameterAttributes(self):
+#         return ["b1","W1", "b2", "W2"]
+
+#     def updateMinibatchInputFields(self):
+#         return ["input","target"]
+
+#     def updateEndOutputAttributes(self):
+#         return ["regularization_term"]
+
+#     def lossAttribute(self):
+#         return "minibatch_criterion"
+
+#     def defaultOutputFields(self, input_fields):
+#         output_fields = ["output", "output_class",]
+#         if "target" in input_fields:
+#             output_fields += ["class_error", "nll"]
+#         return output_fields
+
+#     def allocate(self,minibatch):
+#         minibatch_n_inputs = minibatch["input"].shape[1]
+#         if not self._n_inputs:
+#             self._n_inputs = minibatch_n_inputs
+#             self.b1 = numpy.zeros((1,self._n_hidden))
+#             self.b2 = numpy.zeros((1,self._n_outputs))
+#             self.forget()
+#         elif self._n_inputs!=minibatch_n_inputs:
+#             # if the input changes dimension on the fly, we resize and forget everything
+#             self.forget()
+
+#     def forget(self):
+#         if self._n_inputs:
+#             r = self._init_range/math.sqrt(self._n_inputs)
+#             self.W1 = numpy.random.uniform(low=-r,high=r,
+#                                            size=(self._n_hidden,self._n_inputs))
+#             r = self._init_range/math.sqrt(self._n_hidden)
+#             self.W2 = numpy.random.uniform(low=-r,high=r,
+#                                            size=(self._n_outputs,self._n_hidden))
+#             self.b1[:]=0
+#             self.b2[:]=0
+#             self._n_epochs=0
+
+#     def isLastEpoch(self):
+#         self._n_epochs +=1
+#         return self._n_epochs>=self._max_n_epochs
```
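The comment in `binomial_cross_entropy_formula` leans on the identity `softplus(a) - softplus(-a) = a`. A quick stand-alone numpy check of the derived expression `q*a - softplus(a)` (note the committed `nll` line uses `softplus(-a)`, which does not match the comment and may deserve a second look):

```python
# Numerical check: with p = sigmoid(a),
#   q*log(p) + (1-q)*log(1-p) == q*a - softplus(a)
import numpy

def sigmoid(a):
    return 1.0 / (1.0 + numpy.exp(-a))

def softplus(a):
    return numpy.log1p(numpy.exp(a))

a = numpy.random.randn(1000)
q = numpy.random.rand(1000)
p = sigmoid(a)
lhs = q * numpy.log(p) + (1.0 - q) * numpy.log(1.0 - p)
rhs = q * a - softplus(a)
assert numpy.allclose(lhs, rhs)
```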
```diff
--- a/learner.py	Fri May 23 10:21:52 2008 -0400
+++ b/learner.py	Tue May 27 13:46:03 2008 -0400
@@ -1,6 +1,7 @@
+
 from exceptions import *
-
+from dataset import AttributesHolder
 
 class LearningAlgorithm(object):
     """
```
```diff
--- a/mlp_factory_approach.py	Fri May 23 10:21:52 2008 -0400
+++ b/mlp_factory_approach.py	Tue May 27 13:46:03 2008 -0400
@@ -1,10 +1,24 @@
+"""
+
+
+
+This file is deprecated. I'm continuing development in hpu/models.py.
+
+Get that project like this: hg clone ssh://user@lgcm/../bergstrj/hpu
+
+
+
+
+
+"""
 import copy, sys
 import numpy
 
 import theano
 from theano import tensor as t
 
-from tlearn import dataset, nnet_ops, stopper
+from pylearn import dataset, nnet_ops, stopper
+
 
 def _randshape(*shape):
     return (numpy.random.rand(*shape) -0.5) * 0.001
@@ -30,17 +44,19 @@
         """Update this model from more training data."""
         params = self.params
         #TODO: why should we have to unpack target like this?
+        # tbm : creates problem...
         for input, target in input_target:
-            self.update_fn(input, target[:,0], *params)
+            rval= self.update_fn(input, target[:,0], *params)
+            #print rval[0]
 
-    def __call__(self, testset, fieldnames=['output_class']):
+    def __call__(self, testset, fieldnames=['output_class'],input='input',target='target'):
         """Apply this model (as a function) to new data"""
         #TODO: cache fn between calls
-        assert 'input' == testset.fieldNames()[0]
+        assert input == testset.fieldNames()[0] # why first one???
         assert len(testset.fieldNames()) <= 2
         v = self.algo.v
         outputs = [getattr(v, name) for name in fieldnames]
-        inputs = [v.input] + ([v.target] if 'target' in testset else [])
+        inputs = [v.input] + ([v.target] if target in testset else [])
         inputs.extend(v.params)
         theano_fn = _cache(self._fn_cache, (tuple(inputs), tuple(outputs)),
                 lambda: self.algo._fn(inputs, outputs))
@@ -102,7 +118,7 @@
         # prefer caching in _Model.__call__
         return theano.function(inputs, outputs, unpack_single=False, linker=self.linker)
 
-    def __call__(self, trainset=None, iparams=None):
+    def __call__(self, trainset=None, iparams=None, input='input', target='target'):
         """Allocate and optionally train a model"""
         if iparams is None:
             iparams = [_randshape(self.nhid, self.nclass), _randshape(self.nclass)]\
@@ -119,8 +135,9 @@
             best = rval
             for stp in self.early_stopper():
                 rval.update(
-                    trainset.minibatches(['input', 'target'], minibatch_size=min(32,
+                    minset.minibatches([input, target], minibatch_size=min(32,
                         len(trainset))))
+                #print 'mlp.__call__(), we did an update'
                 if stp.set_score:
                     stp.score = rval(valset, ['loss_01'])
                     if (stp.score < stp.best_score):
@@ -154,7 +171,7 @@
             , linker='c&py'
             , early_stopper = lambda:stopper.NStages(100,1))
 
-    model1 = learn_algo(training_set1)
+    model1 = learn_algo(training_set1,input='input',target='target')
 
     model2 = learn_algo(training_set2)
```
```diff
--- a/nnet_ops.py	Fri May 23 10:21:52 2008 -0400
+++ b/nnet_ops.py	Tue May 27 13:46:03 2008 -0400
@@ -44,7 +44,7 @@
         return ScalarSoftplus.static_impl(x)
     def grad(self, (x,), (gz,)):
         return [gz * scalar_sigmoid(x)]
-    def c_code(self, name, node, (x,), (z,), sub):
+    def c_code(self, node, name, (x,), (z,), sub):
         if node.inputs[0].type in [scalar.float32, scalar.float64]:
             return """%(z)s =
                 %(x)s < -30.0
```
```diff
--- a/statscollector.py	Fri May 23 10:21:52 2008 -0400
+++ b/statscollector.py	Tue May 27 13:46:03 2008 -0400
@@ -77,7 +77,7 @@
     total_loss = regularizer+t.examplewise_sum(nll)
     avg_nll = t.examplewise_mean(nll)
     avg_class_error = t.examplewise_mean(class_error)
-    for name,val in locals(): val.name = name
+    for name,val in locals().items(): val.name = name
 
     return StatsCollector([regularizer],[nll,class_error],[total_loss,avg_nll,avg_class_error])
```
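The `statscollector.py` change matters because iterating a dict yields only its keys; `.items()` is what provides the `(name, value)` pairs that `val.name = name` needs. A minimal stand-alone illustration (the `Sym` class here is just a stand-in for the Theano variables being named):

```python
# Iterating locals() directly yields names (strings); .items() yields
# (name, value) pairs, which is what "val.name = name" requires.
class Sym(object):
    name = None            # stand-in for a Theano variable's .name slot

def build():
    total_loss = Sym()
    avg_nll = Sym()
    for name, val in locals().items():
        val.name = name    # tag each symbol with its Python variable name
    return total_loss, avg_nll

total_loss, avg_nll = build()
assert total_loss.name == 'total_loss' and avg_nll.name == 'avg_nll'
```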
```diff
--- a/stopper.py	Fri May 23 10:21:52 2008 -0400
+++ b/stopper.py	Tue May 27 13:46:03 2008 -0400
@@ -75,6 +75,9 @@
     E_set_score = 'when iter.set_score is True, caller must assign a score to iter.score'
 
     def next(self):
+
+        #print 'ICML08 stopper, were doing a next'
+
         if self.set_score: #left over from last time
             if self.score is None:
                 raise Exception(ICML08Stopper.E_set_score)
```