changeset 791:166a89917669
changed behavior of DAAig and StackedDAAig, added a cost scaling parameter
author   | Xavier Glorot <glorotxa@iro.umontreal.ca>
date     | Fri, 10 Jul 2009 16:56:52 -0400
parents  | d98117100166
children | 961dc1a7921b
files    | pylearn/algorithms/sandbox/DAA_inputs_groups.py
diffstat | 1 files changed, 123 insertions(+), 102 deletions(-)
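The headline change is the new scale_cost parameter of DAAig (and its per-layer counterpart in StackedDAAig). When a layer has both a main input and auxiliary inputs, the two partial reconstruction costs are no longer simply summed: as the diff below shows, they are weighted by min(1, 1+scale_cost) and min(1, 1-scale_cost). A minimal plain-Python sketch of that weighting, not part of the changeset (the patch builds the same expression with Theano T.constant; the helper name here is hypothetical):

def combined_reconstruction_cost(cost_in, cost_aux, scale_cost=0.0):
    # scale_cost = 0  -> both costs keep weight 1 (previous behaviour)
    # scale_cost > 0  -> auxiliary cost down-weighted by (1 - scale_cost)
    # scale_cost < 0  -> main-input cost down-weighted by (1 + scale_cost)
    w_in = min(1.0, 1.0 + scale_cost)
    w_aux = min(1.0, 1.0 - scale_cost)
    return w_in * cost_in + w_aux * cost_aux

# e.g. scale_cost = 0.3 keeps the input cost at full weight and scales the
# auxiliary cost by 0.7:
print(combined_reconstruction_cost(1.0, 1.0, 0.3))   # -> 1.7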
--- a/pylearn/algorithms/sandbox/DAA_inputs_groups.py	Fri Jul 10 12:16:20 2009 -0400
+++ b/pylearn/algorithms/sandbox/DAA_inputs_groups.py	Fri Jul 10 16:56:52 2009 -0400
@@ -10,8 +10,8 @@
 from pylearn.algorithms.logistic_regression import LogRegN
 
-# used to initialize containers
-class ScratchPad:
+# Initialize containers:
+class CreateContainer:
     pass
 
 # regularisation utils:-------------------------------------------
@@ -19,8 +19,7 @@
     if type == 'l1':
         return T.sum(T.abs(param))
     if type == 'l2':
-        return T.sum(param*param)#faster...
-        return T.sum(T.pow(param,2))
+        return T.sum(param*param)
     raise NotImplementedError('Only l1 and l2 regularization are currently implemented')
 
 def get_reg_cost(params, type):
@@ -33,14 +32,13 @@
 def sigmoid_act(x):
     return theano.tensor.nnet.sigmoid(x)
 
+#tanh is scaled by 2 to have the same gradient than sigmoid [sigmoid(x)=(tanh(x/2.0)+1)/2.0]
 def tanh_act(x):
     return theano.tensor.tanh(x/2.0)
 
 # costs utils:---------------------------------------------------
-
 # in order to fix numerical instability of the cost and gradient calculation for the cross entropy we calculate it
 # with the following functions direclty from the activation:
-
 def sigmoid_cross_entropy(target, output_act, mean_axis, sum_axis):
     XE =-target * T.log(1 + T.exp(-output_act)) + (1 - target) * (- T.log(1 + T.exp(output_act)))
     return -T.mean(T.sum(XE, axis=sum_axis),axis=mean_axis)
@@ -60,25 +58,30 @@
 def quadratic(target, output, act, axis = 1):
     return pylearn.algorithms.cost.quadratic(target, output, axis)
 
-
 # DAAig module----------------------------------------------------------------
 class DAAig(module.Module):
-    """De-noising Auto-encoder
+    """De-noising Auto-encoder with inputs groups and missing values
     """
 
     def __init__(self, input = None, auxinput = None,
                 in_size=None, auxin_size= None, n_hid=1,
-                regularize = False, tie_weights = False, hid_fn = 'sigmoid_act',
-                reconstruction_cost_function='cross_entropy', interface = True,
-                ignore_missing=None, reconstruct_missing=False,
-                corruption_pattern=None,
-                **init):
+                regularize = False, tie_weights = False, hid_fn = 'tanh_act',
+                rec_fn = 'tanh_act',reconstruction_cost_function='cross_entropy',
+                scale_cost = False, interface = True, ignore_missing=None, reconstruct_missing=False,
+                corruption_pattern=None, **init):
         """
+        :param input: WRITEME
+        :param auxinput: WRITEME
+        :param in_size: WRITEME
+        :param auxin_size: WRITEME
+        :param n_hid: WRITEME
         :param regularize: WRITEME
         :param tie_weights: WRITEME
         :param hid_fn: WRITEME
-        :param reconstruction_cost: Should return one cost per example (row)
+        :param rec_fn: WRITEME
+        :param reconstruction_cost_function: WRITEME
+        :param scale_cost: WRITEME
+        :param interface: WRITEME
        :param ignore_missing: if not None, the input will be scanned in order
             to detect missing values, and these values will be replaced. Also,
             the reconstruction cost's gradient will be computed only on non
@@ -93,7 +96,7 @@
             in the current implementation, auxiliary inputs cannot be used
             when this option is True.
         :param corruption_pattern: if not None, may specify a particular way to
-            corrupt the input with missing values. Valid choices are:
+            corrupt the input with missing values. Valid choices are:
             - 'by_pair': consider that features are given as pairs, and corrupt
             (or not) the whole pair instead of considering them independently.
             Elements in a pair are not consecutive, instead they are assumed to
@@ -108,6 +111,12 @@
         print '\t\tin_size = ', in_size
         print '\t\tauxin_size = ', auxin_size
         print '\t\tn_hid = ', n_hid
+        print '\t\tregularize = ', regularize
+        print '\t\ttie_weights = ', tie_weights
+        print '\t\thid_fn = ', hid_fn
+        print '\t\trec_fn = ', rec_fn
+        print '\t\treconstruction_cost_function = ', reconstruction_cost_function
+        print '\t\tscale_cost = ', scale_cost
 
         super(DAAig, self).__init__()
         self.random = T.RandomStreams()
@@ -122,11 +131,14 @@
         self.ignore_missing = ignore_missing
         self.reconstruct_missing = reconstruct_missing
         self.corruption_pattern = corruption_pattern
-
+        self.scale_cost = scale_cost
 
         assert hid_fn in ('sigmoid_act','tanh_act')
         self.hid_fn = eval(hid_fn)
-        self.hid_name = hid_fn
+
+        assert rec_fn in ('sigmoid_act','tanh_act')
+        self.rec_fn = eval(rec_fn)
+        self.rec_name = rec_fn
 
         assert reconstruction_cost_function in ('cross_entropy','quadratic')
         self.reconstruction_cost_function = eval(reconstruction_cost_function)
@@ -134,13 +146,13 @@
         ### DECLARE MODEL VARIABLES and default
         self.input = input
+        self.noisy_input = None
         if self.ignore_missing is not None and self.input is not None:
             no_missing = FillMissing(self.ignore_missing)(self.input)
             self.input = no_missing[0] # With missing values replaced.
             self.input_missing_mask = no_missing[1] # Missingness pattern.
         else:
             self.input_missing_mask = None
-        self.noisy_input = None
 
         self.auxinput = auxinput
         self.idx_list = T.ivector('idx_list') if self.auxinput is not None else None
         self.noisy_idx_list, self.noisy_auxinput = None, None
@@ -160,6 +172,7 @@
         #hyper-parameters
         if self.interface:
             self.lr = T.scalar('lr')
+
         self.noise_level = T.scalar('noise_level')
         self.noise_level_group = T.scalar('noise_level_group')
@@ -174,29 +187,31 @@
             self.noisy_input = self.corrupt_input()
         if self.auxinput is not None:
             self.noisy_idx_list , self.noisy_auxinput = \
-                    scannoise(self.idx_list, self.auxinput,self.noise_level,
-                    self.noise_level_group)
+                    scannoise(self.idx_list, self.auxinput,self.noise_level, self.noise_level_group)
 
-        self.noise = ScratchPad()
-        self.clean = ScratchPad()
+        self.noise = CreateContainer()
+        self.clean = CreateContainer()
 
         self.define_behavioural(self.clean, self.input, self.idx_list, self.auxinput)
         self.define_behavioural(self.noise, self.noisy_input, self.noisy_idx_list, self.noisy_auxinput)
         self.define_regularization()  # call before cost
-        self.define_cost(self.clean)
-        self.define_cost(self.noise)
+        self.define_cost(self.noise) # the cost is only needed for the noise (not used for the clean part)
         self.define_params()
         if self.interface:
             self.define_gradients()
             self.define_interface()
-
+
     def define_behavioural(self, container, input, idx_list, auxinput):
         self.define_propup(container, input, idx_list , auxinput)
         container.hidden = self.hid_fn(container.hidden_activation)
         self.define_propdown(container, idx_list , auxinput)
-        container.rec = self.hid_fn(container.rec_activation)
-
+        if self.input is not None:
+            container.rec_in = self.rec_fn(container.rec_activation_in)
+        if (self.auxinput is not None):
+            container.rec_aux = self.rec_fn(container.rec_activation_aux)
+        container.rec = self.rec_fn(container.rec_activation)
+
     def define_propup(self, container, input, idx_list, auxinput):
         if self.input is not None:
             container.hidden_activation = self.filter_up(input, self.wenc, self.benc)
@@ -205,36 +220,33 @@
         else:
             if self.auxinput is not None:
                 container.hidden_activation = scandotenc(idx_list,auxinput,self.wauxenc) + self.benc
-
-    # DEPENDENCY: define_propup
+
     def define_propdown(self, container, idx_list, auxinput):
         if self.input is not None:
-            rec_activation1 = self.filter_down(container.hidden,self.wdec,self.bdec)
+            container.rec_activation_in = self.filter_down(container.hidden,self.wdec,self.bdec)
         if self.auxinput is not None:
-            rec_activation2 = scandotdec(idx_list,auxinput,container.hidden,self.wauxdec) +\
+            container.rec_activation_aux = scandotdec(idx_list,auxinput,container.hidden,self.wauxdec) +\
                     scanbiasdec(idx_list,auxinput,self.bauxdec)
+        if (self.ignore_missing is not None and self.input is not None and not self.reconstruct_missing):
+            # Apply mask to gradient to ensure we do not backpropagate on the
+            # cost computed on missing inputs (that have been imputed).
+            container.rec_activation_in = mask_gradient(container.rec_activation_in,
+                    self.input_missing_mask)
+
         if (self.input is not None) and (self.auxinput is not None):
-            container.rec_activation = T.join(1,rec_activation1,rec_activation2)
+            container.rec_activation = T.join(1,container.rec_activation_in,container.rec_activation_aux)
         else:
             if self.input is not None:
-                container.rec_activation = rec_activation1
-            else:
-                container.rec_activation = rec_activation2
-
-        if (self.ignore_missing is not None and self.input is not None and not
-                self.reconstruct_missing):
-            # Apply mask to gradient to ensure we do not backpropagate on the
-            # cost computed on missing inputs (that have been imputed).
-            container.rec_activation = mask_gradient(container.rec_activation,
-                    self.input_missing_mask)
-
+                container.rec_activation = container.rec_activation_in
+            if (self.auxinput is not None):
+                container.rec_activation = container.rec_activation_aux
+
     def filter_up(self, vis, w, b=None):
         out = T.dot(vis, w)
         return out + b if b else out
     filter_down = filter_up
 
-    # TODO: fix regularization type (outside parameter ?)
     def define_regularization(self):
         self.reg_coef = T.scalar('reg_coef')
         if self.auxinput is not None:
@@ -255,20 +267,38 @@
         self.regularization = self.reg_coef * get_reg_cost(listweights,'l2')
         self.regularizationenc = self.reg_coef * get_reg_cost(listweightsenc,'l2')
-
-    # DEPENDENCY: define_behavioural, define_regularization
     def define_cost(self, container):
         if self.reconstruction_cost_function_name == 'cross_entropy':
-            container.reconstruction_cost = self.reconstruction_costs(container.rec_activation)
+            if (self.input is not None):
+                container.reconstruction_cost_in = \
+                        self.reconstruction_cost_function(self.input,container.rec_activation_in,self.rec_name)
+            if (self.auxinput is not None):
+                container.reconstruction_cost_aux = \
+                        self.reconstruction_cost_function(scaninputs(self.idx_list,self.auxinput),container.rec_activation_aux,\
+                        self.rec_name)
         else:
-            container.reconstruction_cost = self.reconstruction_costs(container.rec)
+            if (self.input is not None):
+                container.reconstruction_cost_in = \
+                        self.reconstruction_cost_function(self.input,container.rec_in,self.rec_name)
+            if (self.auxinput is not None):
+                container.reconstruction_cost_aux = \
+                        self.reconstruction_cost_function(scaninputs(self.idx_list,self.auxinput),container.rec_aux,\
+                        self.rec_name)
         # TOTAL COST
+        if (self.input is not None) and (self.auxinput is not None):
+            container.reconstruction_cost = (T.constant(min(1,1+self.scale_cost)) *container.reconstruction_cost_in +\
+                    T.constant(min(1,1-self.scale_cost)) * container.reconstruction_cost_aux )
+        else:
+            if self.input is not None:
+                container.reconstruction_cost = container.reconstruction_cost_in
+            if (self.auxinput is not None):
+                container.reconstruction_cost = container.reconstruction_cost_aux
+
         if self.regularize: #if stacked don't merge regularization and cost here but in the stackeddaaig module
             container.cost = container.reconstruction_cost + self.regularization
         else:
             container.cost = container.reconstruction_cost
 
-    # DEPENDENCY: define_cost
     def define_params(self):
         if not hasattr(self,'params'):
             self.params = []
@@ -286,14 +316,11 @@
         if self.auxinput is not None:
             self.params += self.wauxdec
 
-    # DEPENDENCY: define_cost, define_gradients
     def define_gradients(self):
         self.gradients = T.grad(self.noise.cost, self.params)
         self.updates = dict((p, p - self.lr * g) for p, g in \
                 zip(self.params, self.gradients))
-
-    # DEPENDENCY: define_behavioural, define_regularization, define_cost, define_gradients
     def define_interface(self):
         # declare function to interface with module (if not stacked)
         if self.input is None:
@@ -310,7 +337,6 @@
             self.auxnoisify = theano.Method(listin, self.noisy_auxinput)
         self.reconstruction = theano.Method(listin, self.clean.rec)
         self.representation = theano.Method(listin, self.clean.hidden)
-        self.validate = theano.Method(listin, [self.clean.cost, self.clean.rec])
 
     def corrupt_input(self):
         if self.corruption_pattern is None:
@@ -327,17 +353,6 @@
                              % self.corruption_pattern)
         return mask * self.input
 
-    def reconstruction_costs(self, rec):
-        if (self.input is not None) and (self.auxinput is not None):
-            return self.reconstruction_cost_function(T.join(1,self.input,scaninputs(self.idx_list,self.auxinput)),\
-                    rec, self.hid_name)
-        if self.input is not None:
-            return self.reconstruction_cost_function(self.input, rec, self.hid_name)
-        if self.auxinput is not None:
-            return self.reconstruction_cost_function(scaninputs(self.idx_list,self.auxinput), rec, self.hid_name)
-        # All cases should be covered above. If not, something is wrong!
-        assert False
-
     def _instance_initialize(self, obj, lr = 1 , reg_coef = 0, noise_level = 0 , noise_level_group = 0,
                              seed=1, alloc=True, **init):
         super(DAAig, self)._instance_initialize(obj, **init)
@@ -365,14 +380,12 @@
             self.inf = 1/numpy.sqrt(sum(self.auxin_size)+self.in_size)
         self.hif = 1/numpy.sqrt(self.n_hid)
-
         if alloc:
             if self.input is not None:
                 wencshp = (self.in_size, self.n_hid)
                 wdecshp = tuple(reversed(wencshp))
                 print 'wencshp = ', wencshp
                 print 'wdecshp = ', wdecshp
-
                 obj.wenc = self.R.uniform(size=wencshp, low = -self.inf, high = self.inf)
                 if not(self.tie_weights):
                     obj.wdec = self.R.uniform(size=wdecshp, low=-self.hif, high=self.hif)
@@ -383,7 +396,6 @@
                 wauxdecshp = [tuple(reversed(i)) for i in wauxencshp]
                 print 'wauxencshp = ', wauxencshp
                 print 'wauxdecshp = ', wauxdecshp
-
                 obj.wauxenc = [self.R.uniform(size=i, low = -self.inf, high = self.inf) for i in wauxencshp]
                 if not(self.tie_weights):
                     obj.wauxdec = [self.R.uniform(size=i, low=-self.hif, high=self.hif) for i in wauxdecshp]
@@ -400,20 +412,21 @@
 class StackedDAAig(module.Module):
     def __init__(self, depth = 1, input = T.dmatrix('input'), auxinput = [None],
                 in_size = None, auxin_size = [None], n_hid = [1],
-                regularize = False, tie_weights = False, hid_fn = 'sigmoid_act',
-                reconstruction_cost_function='cross_entropy',
+                regularize = False, tie_weights = False, hid_fn = 'tanh_act',
+                rec_fn = 'tanh_act',reconstruction_cost_function='cross_entropy', scale_cost=False,
                 n_out = 2, target = None, debugmethod = False, totalupdatebool=False,
                 ignore_missing=None, reconstruct_missing=False,
                 corruption_pattern=None, **init):
 
         super(StackedDAAig, self).__init__()
-        print '\t**** StackedDAAig.__init__ ****'
-        print '\tinput = ', input
-        print '\tauxinput = ', auxinput
-        print '\tin_size = ', in_size
-        print '\tauxin_size = ', auxin_size
-        print '\tn_hid = ', n_hid
+
+        # utils
+        def listify(param,depth):
+            if type(param) is list:
+                return param if len(param)==depth else [param[0]]*depth
+            else:
+                return [param]*depth
 
         # save parameters
         self.depth = depth
@@ -421,11 +434,13 @@
         self.auxinput = auxinput
         self.in_size = in_size
         auxin_size = auxin_size
-        self.n_hid = n_hid
+        self.n_hid = listify(n_hid,depth)
         self.regularize = regularize
-        self.tie_weights = tie_weights
-        self.hid_fn = hid_fn
-        self.reconstruction_cost_function = reconstruction_cost_function
+        self.tie_weights = listify(tie_weights,depth)
+        self.hid_fn = listify(hid_fn,depth)
+        self.rec_fn = listify(rec_fn,depth)
+        self.reconstruction_cost_function = listify(reconstruction_cost_function,depth)
+        self.scale_cost = listify(scale_cost,depth)
         self.n_out = n_out
         self.target = target if target is not None else T.lvector('target')
         self.debugmethod = debugmethod
@@ -434,6 +449,21 @@
         self.reconstruct_missing = reconstruct_missing
         self.corruption_pattern = corruption_pattern
 
+        print '\t**** StackedDAAig.__init__ ****'
+        print '\tdepth = ', self.depth
+        print '\tinput = ', self.input
+        print '\tauxinput = ', self.auxinput
+        print '\tin_size = ', self.in_size
+        print '\tauxin_size = ', auxin_size
+        print '\tn_hid = ', self.n_hid
+        print '\tregularize = ', self.regularize
+        print '\ttie_weights = ', self.tie_weights
+        print '\thid_fn = ', self.hid_fn
+        print '\trec_fn = ', self.rec_fn
+        print '\treconstruction_cost_function = ', self.reconstruction_cost_function
+        print '\tscale_cost = ', self.scale_cost
+        print '\tn_out = ', self.n_out
+
         # init for model construction
         inputprec = input
         in_sizeprec = in_size
@@ -448,14 +478,11 @@
         self.globalupdate = [None] * (self.depth+1)#update wrt the layer cost backproped untill the input layer
         if self.totalupdatebool:
             self.totalupdate = [None] * (self.depth+1) #update wrt all the layers cost backproped untill the input layer
-        #
-        self.classify = None
 
-        #others methods
+        # facultative methods
         if self.debugmethod:
             self.representation = [None] * (self.depth)
             self.reconstruction = [None] * (self.depth)
-            self.validate = [None] * (self.depth)
             self.noisyinputs = [None] * (self.depth)
             self.compute_localcost = [None] * (self.depth+1)
             self.compute_localgradients = [None] * (self.depth+1)
@@ -464,7 +491,6 @@
             if self.totalupdatebool:
                 self.compute_totalcost = [None] * (self.depth+1)
                 self.compute_totalgradients = [None] * (self.depth+1)
-        #
 
         # some theano Variables we want to keep track on
         if self.regularize:
@@ -481,7 +507,6 @@
         paramstot = []
         paramsenc = []
         self.inputs = [None] * (self.depth+1)
-
         if self.input is not None:
             self.inputs[0] = [self.input]
         else:
@@ -490,20 +515,19 @@
         offset = 0
         for i in range(self.depth):
+            dict_params = dict(input = inputprec, in_size = in_sizeprec, auxin_size = auxin_size[i],
+                    n_hid = self.n_hid[i], regularize = False, tie_weights = self.tie_weights[i], hid_fn = self.hid_fn[i],
+                    rec_fn = self.rec_fn[i], reconstruction_cost_function = self.reconstruction_cost_function[i],
+                    scale_cost = self.scale_cost[i], interface = False, ignore_missing = self.ignore_missing,
+                    reconstruct_missing = self.reconstruct_missing,corruption_pattern = self.corruption_pattern)
             if auxin_size[i] is None:
                 offset +=1
-                param = [inputprec, None, in_sizeprec, auxin_size[i], self.n_hid[i],\
-                        False, self.tie_weights, self.hid_fn, self.reconstruction_cost_function,False]
+                dict_params.update({'auxinput' : None})
             else:
-                param = [inputprec, self.auxinput[i-offset], in_sizeprec, auxin_size[i], self.n_hid[i],\
-                        False, self.tie_weights, self.hid_fn, self.reconstruction_cost_function,False]
-
-            dict_params = dict(ignore_missing = self.ignore_missing,
-                    reconstruct_missing = self.reconstruct_missing,
-                    corruption_pattern = self.corruption_pattern)
+                dict_params.update({'auxinput' : self.auxinput[i-offset]})
 
             print '\tLayer init= ', i+1
-            self.daaig[i] = DAAig(*param, **dict_params)
+            self.daaig[i] = DAAig(**dict_params)
 
             # method input, outputs and parameters update
             if i:
@@ -533,7 +557,6 @@
                     self.regularizationenccost[i] = self.regularizationenccost[i-1]+self.daaig[i-1].regularizationenc
                 else:
                     self.regularizationenccost[i] = 0
-
                 self.localcost[i] += self.daaig[i].regularization
                 self.globalcost[i] += self.regularizationenccost[i]
                 if self.totalupdatebool:
@@ -556,11 +579,10 @@
             self.globalupdate[i] = theano.Method(self.inputs[i],self.globalcost[i],global_grads)
             if self.totalupdatebool:
                 self.totalupdate[i] = theano.Method(self.inputs[i],self.totalcost[i],total_grads)
-            #
+
             if self.debugmethod:
                 self.representation[i] = theano.Method(self.inputs[i],self.daaig[i].clean.hidden_activation)
                 self.reconstruction[i] = theano.Method(self.inputs[i],self.daaig[i].clean.rec)
-                self.validate[i] =theano.Method(self.inputs[i], [self.daaig[i].clean.cost, self.daaig[i].clean.rec])
                 self.noisyinputs[i] =theano.Method(self.inputs[i], noisyout)
                 self.compute_localcost[i] = theano.Method(self.inputs[i],self.localcost[i])
                 self.compute_localgradients[i] = theano.Method(self.inputs[i],self.localgradients[i])
@@ -569,13 +591,12 @@
                 if self.totalupdatebool:
                     self.compute_totalcost[i] = theano.Method(self.inputs[i],self.totalcost[i])
                     self.compute_totalgradients[i] = theano.Method(self.inputs[i],self.totalgradients[i])
-            #
 
             paramsenc += self.daaig[i].paramsenc
             inputprec = self.daaig[i].clean.hidden
             in_sizeprec = self.n_hid[i]
 
-        # supervised layer
+        # supervised layer------------------------------------------------------------------------
         print '\tLayer supervised init'
         self.inputs[-1] = copy.copy(self.inputs[-2])+[self.target]
         self.daaig[-1] = LogRegN(in_sizeprec,self.n_out,sigmoid_act(self.daaig[-2].clean.hidden_activation),self.target)
@@ -608,7 +629,7 @@
         self.globalupdate[-1] = theano.Method(self.inputs[-1],self.globalcost[-1],global_grads)
         if self.totalupdatebool:
             self.totalupdate[-1] = theano.Method(self.inputs[-1],self.totalcost[-1],total_grads)
-
+
         totallocal_grads={}
         for k in range(self.depth):
             totallocal_grads.update(dict((j, j - self.unsup_lr * g) for j,g in
@@ -616,9 +637,9 @@
         totallocal_grads.update(dict((j, j - self.sup_lr * g) for j,g in
             zip(self.daaig[-1].params,self.localgradients[-1])))
         self.totallocalupdate = theano.Method(self.inputs[-1],self.localcost,totallocal_grads)
+        # interface for the user
         self.classify = theano.Method(self.inputs[-2],self.daaig[-1].argmax_standalone)
         self.NLL = theano.Method(self.inputs[-1],self.daaig[-1]._xent)
-
        if self.debugmethod:
             self.compute_localcost[-1] = theano.Method(self.inputs[-1],self.localcost[-1])
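After this changeset, StackedDAAig accepts either a single value or a per-layer list for n_hid, tie_weights, hid_fn, rec_fn, reconstruction_cost_function and scale_cost, broadcast by the small listify helper added in __init__. Below is a standalone copy of that helper; the print lines are a usage example added here for illustration and are not part of the patch:

def listify(param, depth):
    # Broadcast a hyper-parameter to one value per layer.
    if type(param) is list:
        return param if len(param) == depth else [param[0]] * depth
    else:
        return [param] * depth

print(listify('tanh_act', 3))        # ['tanh_act', 'tanh_act', 'tanh_act']
print(listify([100, 50, 25], 3))     # per-layer list of matching length is kept as given
print(listify([0.5], 3))             # [0.5, 0.5, 0.5] (first element repeated)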