# HG changeset patch
# User James Bergstra
# Date 1243988495 14400
# Node ID 390d8c5a1fee04b81bee86dd2e0f8255571fd268
# Parent 0eee6693f149ed67cdf5c51fad432c2c28a14b16
# Parent 84d22b7d835aa2626b9a10f2f780fe62cc41454b
merge

diff -r 0eee6693f149 -r 390d8c5a1fee pylearn/algorithms/sandbox/DAA_inputs_groups.py
--- a/pylearn/algorithms/sandbox/DAA_inputs_groups.py	Tue Jun 02 20:21:21 2009 -0400
+++ b/pylearn/algorithms/sandbox/DAA_inputs_groups.py	Tue Jun 02 20:21:35 2009 -0400
@@ -1,18 +1,20 @@
 import numpy
 import theano
-import os, copy
+import copy
 
 from theano import tensor as T
 from theano.compile import module
-from theano.tensor.nnet import sigmoid
 
 from pylearn.sandbox.scan_inputs_groups import scaninputs, scandotdec, scandotenc, scannoise, scanbiasdec, \
-        scanmaskenc,scanmaskdec, fill_missing_with_zeros, mask_gradient
+        scanmaskenc,scanmaskdec, FillMissing, mask_gradient
 
-from pylearn.algorithms import cost
 from pylearn.algorithms.logistic_regression import LogRegN
 
+# used to initialize containers
+class ScratchPad:
+    pass
 
+# regularisation utils:-------------------------------------------
 def lnorm(param, type='l2'):
     if type == 'l1':
         return T.sum(T.abs(param))
@@ -26,19 +28,40 @@
         rcost += lnorm(param, type)
     return rcost
 
-
+# activations utils:----------------------------------------------
 def sigmoid_act(x):
     return theano.tensor.nnet.sigmoid(x)
 
 def tanh_act(x):
-    return (theano.tensor.tanh((x-0.5)*2))/2.0+0.5
+    return theano.tensor.tanh(x)
+
+# costs utils:---------------------------------------------------
+
+# in order to fix numerical instability of the cost and gradient calculation for the cross entropy we calculate it
+# with the following functions directly from the activation:
+
+def sigmoid_cross_entropy(target, output_act, mean_axis, sum_axis):
+    XE =-target * T.log(1 + T.exp(-output_act)) + (1 - target) * (- T.log(1 + T.exp(output_act)))
+    return -T.mean(T.sum(XE, axis=sum_axis),axis=mean_axis)
 
-def softsign_act(x):
-    return theano.sandbox.softsign.softsign(x)
+def tanh_cross_entropy(target, output_act, mean_axis, sum_axis):
+    XE =-(target+1)/2.0 * T.log(1 + T.exp(-2 * output_act)) + \
+            (1 - (target+1)/2.0) * (- T.log(1 + T.exp(2 * output_act)))
+    return -T.mean(T.sum(XE, axis=sum_axis),axis=mean_axis)
 
-class ScratchPad:
-    pass
+def cross_entropy(target, output_act, act, mean_axis=0, sum_axis=1):
+    if act == 'sigmoid_act':
+        return sigmoid_cross_entropy(target, output_act, mean_axis, sum_axis)
+    if act == 'tanh_act':
+        return tanh_cross_entropy(target, output_act, mean_axis, sum_axis)
+    assert False
 
+def quadratic(target, output, act, axis = 1):
+    return pylearn.algorithms.cost.quadratic(target, output, axis)
+
+
+
+# DAAig module----------------------------------------------------------------
 class DAAig(module.Module):
     """De-noising Auto-encoder
     """
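The helpers added above compute the cross-entropy directly from the pre-activation instead of from the squashed output, so the log never sees a saturated sigmoid or tanh value. A plain-numpy sketch of the same idea for the sigmoid case; the function name stable_sigmoid_xe and the toy values are illustrative, not part of the patch:

import numpy

def stable_sigmoid_xe(target, act):
    # cross-entropy of sigmoid(act) against target, written from the
    # pre-activation: log1p(exp(-act)) equals -log(sigmoid(act)) without ever
    # forming the (possibly saturated) sigmoid output itself
    return (target * numpy.log1p(numpy.exp(-act))
            + (1 - target) * numpy.log1p(numpy.exp(act)))

target = numpy.array([0.0, 1.0])
act = numpy.array([40.0, -40.0])
xe = stable_sigmoid_xe(target, act)
# both entries are ~40, as expected for confident wrong predictions; computing
# -log(1 - sigmoid(40)) on the output instead would evaluate log(0), because
# sigmoid(40) rounds to exactly 1.0 in float64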
@@ -46,24 +69,36 @@
     def __init__(self, input = None, auxinput = None,
                  in_size=None, auxin_size= None, n_hid=1,
                  regularize = False, tie_weights = False, hid_fn = 'sigmoid_act',
-                 reconstruction_cost_function=cost.cross_entropy, interface = True,
-                 ignore_missing=False,
+                 reconstruction_cost_function='cross_entropy', interface = True,
+                 ignore_missing=None, reconstruct_missing=False,
+                 corruption_pattern=None,
                  **init):
         """
         :param regularize: WRITEME
         :param tie_weights: WRITEME
         :param hid_fn: WRITEME
         :param reconstruction_cost: Should return one cost per example (row)
-        :param ignore_missing: if True, the input will be scanned in order to
-            detect missing values, and these values will be replaced by zeros.
-            Also, the reconstruction cost's gradient will be computed only on
-            non missing components.
-            If False, the presence of missing values may cause crashes or other
+        :param ignore_missing: if not None, the input will be scanned in order
+            to detect missing values, and these values will be replaced. Also,
+            the reconstruction cost's gradient will be computed only on non
+            missing components. The value of this parameter indicates how to
+            replace missing values:
+            - some numpy.ndarray: value of this array at the same index
+            - a constant: this same value everywhere
+            If None, the presence of missing values may cause crashes or other
             weird and unexpected behavior. Please note that this option only
             affects the permanent input, not auxilary ones (that should never
             contain missing values). In fact, in the current implementation,
             auxiliary inputs cannot be used when this option is True.
+        :param corruption_pattern: if not None, may specify a particular way to
+            corrupt the input with missing values. Valid choices are:
+            - 'by_pair': consider that features are given as pairs, and corrupt
+              (or not) the whole pair instead of considering them independently.
+              Elements in a pair are not consecutive; instead, they are assumed
+              to be at distance (total number of features / 2) of each other.
+        :param reconstruct_missing: if True, then the reconstruction cost on
+            missing inputs will be backpropagated. Otherwise, it will not.
         :todo: Default noise level for all daa levels
         """
         print '\t\t**** DAAig.__init__ ****'
@@ -82,18 +117,25 @@
         self.n_hid = n_hid
         self.regularize = regularize
         self.tie_weights = tie_weights
-        self.reconstruction_cost_function = reconstruction_cost_function
         self.interface = interface
         self.ignore_missing = ignore_missing
+        self.reconstruct_missing = reconstruct_missing
+        self.corruption_pattern = corruption_pattern
 
-        assert hid_fn in ('sigmoid_act','tanh_act','softsign_act')
+
+        assert hid_fn in ('sigmoid_act','tanh_act')
         self.hid_fn = eval(hid_fn)
+        self.hid_name = hid_fn
+
+        assert reconstruction_cost_function in ('cross_entropy','quadratic')
+        self.reconstruction_cost_function = eval(reconstruction_cost_function)
+        self.reconstruction_cost_function_name = reconstruction_cost_function
 
         ### DECLARE MODEL VARIABLES and default
         self.input = input
-        if self.ignore_missing and self.input is not None:
-            no_missing = fill_missing_with_zeros(self.input)
-            self.input = no_missing[0] # Missing values replaced by zeros.
+        if self.ignore_missing is not None and self.input is not None:
+            no_missing = FillMissing(self.ignore_missing)(self.input)
+            self.input = no_missing[0] # With missing values replaced.
             self.input_missing_mask = no_missing[1] # Missingness pattern.
         else:
             self.input_missing_mask = None
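FillMissing(self.ignore_missing)(self.input) in the hunk above yields the input with NaNs substituted plus a missingness mask. A rough numpy equivalent of that behaviour for a 2-D input with a scalar or per-column fill value; fill_missing_sketch and the toy arrays are illustrative names only, not code from the patch:

import numpy

def fill_missing_sketch(x, fill_with=0.0):
    # mask is 0 exactly where the input was NaN, 1 elsewhere
    missing = numpy.isnan(x)
    mask = numpy.where(missing, 0.0, 1.0)
    # fill_with may be a scalar or an array broadcast against the trailing axis
    filled = numpy.where(missing, fill_with, x)
    return filled, mask

x = numpy.array([[1.0, numpy.nan], [numpy.nan, 4.0]])
filled, mask = fill_missing_sketch(x, fill_with=numpy.array([10.0, 20.0]))
# filled -> [[1., 20.], [10., 4.]],  mask -> [[1., 0.], [0., 1.]]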
@@ -131,7 +173,8 @@
             self.noisy_input = self.corrupt_input()
         if self.auxinput is not None:
             self.noisy_idx_list , self.noisy_auxinput = \
-                scannoise(self.idx_list,self.auxinput,self.noise_level,self.noise_level_group)
+                scannoise(self.idx_list, self.auxinput,self.noise_level,
+                        self.noise_level_group)
 
         self.noise = ScratchPad()
         self.clean = ScratchPad()
@@ -152,7 +195,8 @@
         container.hidden = self.hid_fn(container.hidden_activation)
         self.define_propdown(container, idx_list , auxinput)
         container.rec = self.hid_fn(container.rec_activation)
-        if self.ignore_missing and self.input is not None:
+        if (self.ignore_missing is not None and self.input is not None and not
+                self.reconstruct_missing):
             # Apply mask to gradient to ensure we do not backpropagate on the
             # cost computed on missing inputs (that were replaced with zeros).
             container.rec = mask_gradient(container.rec,
@@ -212,11 +256,15 @@
     # DEPENDENCY: define_behavioural, define_regularization
     def define_cost(self, container):
-        container.reconstruction_cost = self.reconstruction_costs(container.rec)
+        if self.reconstruction_cost_function_name == 'cross_entropy':
+            container.reconstruction_cost = self.reconstruction_costs(container.rec_activation)
+        else:
+            container.reconstruction_cost = self.reconstruction_costs(container.rec)
         # TOTAL COST
-        container.cost = container.reconstruction_cost
         if self.regularize: #if stacked don't merge regularization and cost here but in the stackeddaaig module
             container.cost = container.cost + self.regularization
+        else:
+            container.cost = container.reconstruction_cost
 
     # DEPENDENCY: define_cost
     def define_params(self):
@@ -263,15 +311,28 @@
             self.validate = theano.Method(listin, [self.clean.cost, self.clean.rec])
 
     def corrupt_input(self):
-        return self.random.binomial(T.shape(self.input), 1, 1 - self.noise_level) * self.input
+        if self.corruption_pattern is None:
+            mask = self.random.binomial(T.shape(self.input), 1, 1 - self.noise_level)
+        elif self.corruption_pattern == 'by_pair':
+            shape = T.shape(self.input)
+            scale = numpy.ones(2)
+            scale[1] = 2
+            shape = shape / scale
+            mask = self.random.binomial(shape, 1, 1 - self.noise_level)
+            mask = T.hstack((mask, mask))
+        else:
+            raise ValueError('Unknown value for corruption_pattern: %s'
+                    % self.corruption_pattern)
+        return mask * self.input
 
     def reconstruction_costs(self, rec):
         if (self.input is not None) and (self.auxinput is not None):
-            return self.reconstruction_cost_function(T.join(1,self.input,scaninputs(self.idx_list,self.auxinput)), rec)
+            return self.reconstruction_cost_function(T.join(1,self.input,scaninputs(self.idx_list,self.auxinput)),\
+                    rec, self.hid_name)
         if self.input is not None:
-            return self.reconstruction_cost_function(self.input, rec)
+            return self.reconstruction_cost_function(self.input, rec, self.hid_name)
         if self.auxinput is not None:
-            return self.reconstruction_cost_function(scaninputs(self.idx_list,self.auxinput), rec)
+            return self.reconstruction_cost_function(scaninputs(self.idx_list,self.auxinput), rec, self.hid_name)
         # All cases should be covered above. If not, something is wrong!
         assert False
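For corruption_pattern == 'by_pair', corrupt_input above halves the column count before sampling the binomial mask and duplicates it with hstack, so a feature and its partner at column j + n/2 are always kept or dropped together. A small numpy sketch of that mask construction; the helper name and toy shapes are assumptions, not code from the patch:

import numpy

def by_pair_corruption_mask(n_examples, n_features, noise_level, rng):
    # one keep/drop draw per pair, duplicated so that columns j and
    # j + n_features/2 are always corrupted together
    half = n_features // 2
    half_mask = rng.binomial(1, 1 - noise_level, size=(n_examples, half))
    return numpy.hstack((half_mask, half_mask))

rng = numpy.random.RandomState(0)
mask = by_pair_corruption_mask(4, 6, 0.25, rng)
noisy_input = mask * numpy.ones((4, 6))   # whole pairs are zeroed at once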
@@ -338,9 +399,10 @@
     def __init__(self, depth = 1, input = T.dmatrix('input'), auxinput = [None],
                  in_size = None, auxin_size = [None], n_hid = [1],
                  regularize = False, tie_weights = False, hid_fn = 'sigmoid_act',
-                 reconstruction_cost_function=cost.cross_entropy,
+                 reconstruction_cost_function='cross_entropy',
                  n_out = 2, target = None, debugmethod = False, totalupdatebool=False,
-                 ignore_missing=False,
+                 ignore_missing=None, reconstruct_missing=False,
+                 corruption_pattern=None,
                  **init):
 
         super(StackedDAAig, self).__init__()
@@ -367,6 +429,8 @@
         self.debugmethod = debugmethod
         self.totalupdatebool = totalupdatebool
         self.ignore_missing = ignore_missing
+        self.reconstruct_missing = reconstruct_missing
+        self.corruption_pattern = corruption_pattern
 
         # init for model construction
         inputprec = input
@@ -432,7 +496,9 @@
                 param = [inputprec, self.auxinput[i-offset], in_sizeprec, auxin_size[i], self.n_hid[i],\
                         False, self.tie_weights, self.hid_fn, self.reconstruction_cost_function,False]
-            dict_params = dict(ignore_missing = self.ignore_missing)
+            dict_params = dict(ignore_missing = self.ignore_missing,
+                    reconstruct_missing = self.reconstruct_missing,
+                    corruption_pattern = self.corruption_pattern)
 
             print '\tLayer init= ', i+1
             self.daaig[i] = DAAig(*param, **dict_params)
diff -r 0eee6693f149 -r 390d8c5a1fee pylearn/algorithms/sgd.py
--- a/pylearn/algorithms/sgd.py	Tue Jun 02 20:21:21 2009 -0400
+++ b/pylearn/algorithms/sgd.py	Tue Jun 02 20:21:35 2009 -0400
@@ -69,7 +69,7 @@
 
     :returns: standard minimizer constructor f(args, cost, params, gradient=None)
     """
-    def f(args, cost, params, gradient=None, updates=None, auxout=None):
-        return StochasticGradientDescent(args, cost, params, gradient, stepsize,
+    def f(args, cost, params, gradients=None, updates=None, auxout=None):
+        return StochasticGradientDescent(args, cost, params, gradients=gradients, stepsize=stepsize,
                 updates=updates, auxout=auxout)
     return f
diff -r 0eee6693f149 -r 390d8c5a1fee pylearn/algorithms/stopper.py
--- a/pylearn/algorithms/stopper.py	Tue Jun 02 20:21:21 2009 -0400
+++ b/pylearn/algorithms/stopper.py	Tue Jun 02 20:21:35 2009 -0400
@@ -1,3 +1,4 @@
+import time
 """Early stopping iterators
 
 The idea here is to supply early-stopping heuristics that can be used in the
@@ -65,12 +66,14 @@
         return ICML08Stopper(30*ntrain/batchsize,
                 ntrain/batchsize, 0.96, 2.0, 100000000)
 
-    def __init__(self, i_wait, v_int, min_improvement, patience, hard_limit):
+    def __init__(self, i_wait, v_int, min_improvement, patience, hard_limit, hard_time_limit=None):
         self.initial_wait = i_wait
         self.set_score_interval = v_int
         self.min_improvement = min_improvement
         self.patience = patience
         self.hard_limit = hard_limit
+        self.hard_limit_seconds = hard_time_limit
+        self.start_time = time.time()
 
         self.best_score = float('inf')
         self.best_iter = -1
@@ -97,7 +100,8 @@
 
         starting = self.iter < self.initial_wait
         waiting = self.iter < (self.patience * self.best_iter)
-        if starting or waiting:
+        times_up = (time.time() - self.start_time) > self.hard_limit_seconds if self.hard_limit_seconds != None else False
+        if (starting or waiting) and not times_up:
             # continue to iterate
             self.iter += 1
             if self.iter == self.hard_limit:
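The ICML08Stopper change above adds an optional wall-clock budget (hard_time_limit, stored as hard_limit_seconds) on top of the usual patience rule. A stripped-down sketch of that stopping logic, with an invented class name and without the score-tracking details of the real stopper:

import time

class TimeBudgetedStopper(object):
    """Keep iterating while patience allows, but stop once a time budget is spent."""
    def __init__(self, patience_iters, hard_time_limit=None):
        self.patience_iters = patience_iters
        self.hard_limit_seconds = hard_time_limit
        self.start_time = time.time()
        self.iter = 0

    def keep_going(self):
        times_up = (self.hard_limit_seconds is not None and
                    time.time() - self.start_time > self.hard_limit_seconds)
        return self.iter < self.patience_iters and not times_up

stopper = TimeBudgetedStopper(patience_iters=1000, hard_time_limit=4*3600)
while stopper.keep_going():
    stopper.iter += 1   # one training / validation step would run here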
diff -r 0eee6693f149 -r 390d8c5a1fee pylearn/datasets/norb_small.py
--- a/pylearn/datasets/norb_small.py	Tue Jun 02 20:21:21 2009 -0400
+++ b/pylearn/datasets/norb_small.py	Tue Jun 02 20:21:35 2009 -0400
@@ -68,8 +68,8 @@
     path = Paths()
 
     def __init__(self, ntrain=19440, nvalid=4860, ntest=24300,
-                 downsample_amt=1, seed=1, normalize=True,
-                 mode='stereo', dtype='float64'):
+                 downsample_amt=1, seed=1, normalize=False,
+                 mode='stereo', dtype='int8'):
 
         self.n_classes = 5
         self.nsamples = 24300
diff -r 0eee6693f149 -r 390d8c5a1fee pylearn/sandbox/scan_inputs_groups.py
--- a/pylearn/sandbox/scan_inputs_groups.py	Tue Jun 02 20:21:21 2009 -0400
+++ b/pylearn/sandbox/scan_inputs_groups.py	Tue Jun 02 20:21:35 2009 -0400
@@ -567,23 +567,37 @@
     """
     Given an input, output two elements:
         - a copy of the input where missing values (NaN) are replaced by some
-          constant (zero by default)
+          other value (zero by default)
        - a mask of the same size and type as input, where each element is zero
          iff the corresponding input is missing
-    Currently, the gradient is computed as if the input value was really zero.
-    It may be safer to replace the gradient w.r.t. missing values with either
-    zeros or missing values (?).
+    The 'fill_with' parameter may either be:
+        - a scalar: all missing values are replaced with this value
+        - a Numpy array: a missing value is replaced by the value in this array
+          at the same position (ignoring the first k dimensions if 'fill_with'
+          has k fewer dimensions than the input)
+    Currently, the gradient is computed as if the input value was really what
+    it was replaced with. It may be safer to replace the gradient w.r.t.
+    missing values with either zeros or missing values (?).
     """
 
-    def __init__(self, constant_val=0):
+    def __init__(self, fill_with=0):
         super(Op, self).__init__()
-        self.constant_val = constant_val
+        self.fill_with = fill_with
+        self.fill_with_is_array = isinstance(self.fill_with, numpy.ndarray)
 
     def __eq__(self, other):
-        return type(self) == type(other) and (self.constant_val == other.constant_val)
+        return (type(self) == type(other) and
+                self.fill_with_is_array == other.fill_with_is_array and
+                ((self.fill_with_is_array and
+                    (self.fill_with == other.fill_with).all()) or
+                    self.fill_with == other.fill_with))
 
-    def __hash__(self):
-        return hash(type(self))^hash(self.constant_val)
+    def __hash__(self):
+        if self.fill_with_is_array:
+            fill_hash = self.fill_with.__hash__()
+        else:
+            fill_hash = hash(self.fill_with)
+        return hash(type(self))^hash(self.fill_with_is_array)^fill_hash
 
     def make_node(self, input):
         return Apply(self, [input], [input.type(), input.type()])
@@ -595,9 +609,15 @@
         mask = output_storage[1]
         mask[0] = numpy.ones(input.shape)
         mask = mask[0]
+        if self.fill_with_is_array:
+            ignore_k = len(out.shape) - len(self.fill_with.shape)
+            assert ignore_k >= 0
         for (idx, v) in numpy.ndenumerate(out):
             if numpy.isnan(v):
-                out[idx] = self.constant_val
+                if self.fill_with_is_array:
+                    out[idx] = self.fill_with[idx[ignore_k:]]
+                else:
+                    out[idx] = self.fill_with
                 mask[idx] = 0
 
     def grad(self, inputs, (out_grad, mask_grad, )):
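The updated __eq__ above has to treat an ndarray fill_with specially because == between arrays is elementwise rather than a single boolean. A minimal illustration of that comparison pattern outside Theano; FillSpec is a made-up name that only mirrors the equality logic, not the op itself:

import numpy

class FillSpec(object):
    def __init__(self, fill_with=0):
        self.fill_with = fill_with
        self.fill_with_is_array = isinstance(fill_with, numpy.ndarray)

    def __eq__(self, other):
        # array parameters need an explicit .all() reduction; a bare
        # `self.fill_with == other.fill_with` would be an array, not a bool
        if type(self) != type(other) or self.fill_with_is_array != other.fill_with_is_array:
            return False
        if self.fill_with_is_array:
            return (self.fill_with.shape == other.fill_with.shape and
                    (self.fill_with == other.fill_with).all())
        return self.fill_with == other.fill_with

assert FillSpec(0) == FillSpec(0)
assert FillSpec(numpy.zeros(3)) == FillSpec(numpy.zeros(3))
assert not FillSpec(numpy.zeros(3)) == FillSpec(numpy.ones(3))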