# HG changeset patch
# User Olivier Breuleux
# Date 1225208387 14400
# Node ID 6dfdcee64e9bda256329d8c72ec29c690f5a9c31
# Parent  180d125dc7e2c01280cc832df82c3972aab534b9
# Parent  4f3c66146f170d5481cc4ce22a8006c92ea72252
merge

diff -r 180d125dc7e2 -r 6dfdcee64e9b algorithms/daa.py
--- a/algorithms/daa.py	Tue Oct 28 11:39:27 2008 -0400
+++ b/algorithms/daa.py	Tue Oct 28 11:39:47 2008 -0400
@@ -4,14 +4,23 @@
 from theano.tensor import nnet as NN
 import numpy as N
 
+from pylearn import cost as cost
+
 class DenoisingAA(T.RModule):
 
-    def __init__(self, input = None, regularize = True, tie_weights = True):
+    def __init__(self, input = None, regularize = True, tie_weights = True,
+            activation_function=NN.sigmoid, reconstruction_cost_function=cost.cross_entropy):
+        """
+        @param reconstruction_cost_function: Should return one cost per example (row)
+        @todo: Default noise level for all daa levels
+        """
         super(DenoisingAA, self).__init__()
 
         # MODEL CONFIGURATION
         self.regularize = regularize
         self.tie_weights = tie_weights
+        self.activation_function = activation_function
+        self.reconstruction_cost_function = reconstruction_cost_function
 
         # ACQUIRE/MAKE INPUT
         if not input:
@@ -94,7 +103,7 @@
 
     def _instance_initialize(self, obj, input_size = None, hidden_size = None, seed = None, **init):
         if (input_size is None) ^ (hidden_size is None):
-            raise ValueError("Must specify hidden_size and target_size or neither.")
+            raise ValueError("Must specify input_size and hidden_size or neither.")
         super(DenoisingAA, self)._instance_initialize(obj, **init)
         if seed is not None:
             R = N.random.RandomState(seed)
@@ -114,31 +123,37 @@
         obj.__hide__ = ['params']
 
     def build_regularization(self):
+        """
+        @todo: Why do we need this function?
+        """
         return T.zero() # no regularization!
 
 
 class SigmoidXEDenoisingAA(DenoisingAA):
+    """
+    @todo: Merge this into the above.
+    @todo: Default noise level for all daa levels
+    """
 
     def build_corrupted_input(self):
         self.noise_level = theano.Member(T.scalar())
         return self.random.binomial(T.shape(self.input), 1, 1 - self.noise_level) * self.input
 
     def hid_activation_function(self, activation):
-        return NN.sigmoid(activation)
+        return self.activation_function(activation)
 
     def out_activation_function(self, activation):
-        return NN.sigmoid(activation)
+        return self.activation_function(activation)
 
     def build_reconstruction_costs(self, output):
-        reconstruction_cost_matrix = -(self.input * T.log(output) + (1 - self.input) * T.log(1 - output))
-        return T.sum(reconstruction_cost_matrix, axis=1)
+        return self.reconstruction_cost_function(self.input, output)
 
     def build_regularization(self):
         self.l2_coef = theano.Member(T.scalar())
         if self.tie_weights:
            return self.l2_coef * T.sum(self.w1 * self.w1)
        else:
-            return self.l2_coef * T.sum(self.w1 * self.w1) + T.sum(self.w2 * self.w2)
+            return self.l2_coef * (T.sum(self.w1 * self.w1) + T.sum(self.w2 * self.w2))
 
     def _instance_initialize(self, obj, input_size = None, hidden_size = None, seed = None, **init):
         init.setdefault('noise_level', 0)
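[Note on the daa.py change] The new reconstruction_cost_function hook expects a callable with the same contract as the functions in pylearn's cost module: given a target and an output, it returns one cost per example (row). A minimal sketch of a custom cost that satisfies this contract (the import path for DenoisingAA is assumed here, and the surrounding training code is not shown):

    import theano.tensor as T
    from pylearn.algorithms.daa import DenoisingAA   # hypothetical import path

    def squared_reconstruction_cost(target, output):
        # One cost per example (row), as reconstruction_cost_function requires.
        return T.sum(T.sqr(target - output), axis=1)

    # Plug a different activation and the custom cost into the generalized class.
    model = DenoisingAA(activation_function=T.tanh,
                        reconstruction_cost_function=squared_reconstruction_cost)
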
diff -r 180d125dc7e2 -r 6dfdcee64e9b algorithms/stacker.py
--- a/algorithms/stacker.py	Tue Oct 28 11:39:27 2008 -0400
+++ b/algorithms/stacker.py	Tue Oct 28 11:39:47 2008 -0400
@@ -5,6 +5,9 @@
 import numpy as N
 
 class Stacker(T.RModule):
+    """
+    @todo: Maybe compile functions on demand, rather than immediately.
+    """
 
     def __init__(self, submodules, input = None, regularize = False):
         super(Stacker, self).__init__()
diff -r 180d125dc7e2 -r 6dfdcee64e9b cost.py
--- a/cost.py	Tue Oct 28 11:39:27 2008 -0400
+++ b/cost.py	Tue Oct 28 11:39:47 2008 -0400
@@ -3,13 +3,15 @@
 
 @note: All of these functions return one cost per example. So it is your
 job to perform a tensor.sum over the individual example losses.
+
+@todo: It would be nice to implement a hinge loss, with a particular margin.
 """
 
 import theano.tensor as T
 from xlogx import xlogx
 
 def quadratic(target, output, axis=1):
-    return T.mean(T.sqr(target - output), axis)
+    return T.mean(T.sqr(target - output), axis=axis)
 
 def cross_entropy(target, output, axis=1):
     """
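[Note on the cost.py change] As the cost.py docstring says, these functions return one cost per example and leave the reduction over examples to the caller. A small illustrative sketch (the variable names target, output, per_example, and total are made up for the example):

    import theano.tensor as T
    from pylearn import cost

    target = T.matrix()   # one example per row
    output = T.matrix()

    per_example = cost.quadratic(target, output)   # vector: one cost per example
    total = T.sum(per_example)                     # caller reduces to a scalar
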
diff -r 180d125dc7e2 -r 6dfdcee64e9b embeddings/parameters.py
--- a/embeddings/parameters.py	Tue Oct 28 11:39:27 2008 -0400
+++ b/embeddings/parameters.py	Tue Oct 28 11:39:47 2008 -0400
@@ -1,8 +1,10 @@
 """
 Locations of the embedding data files.
 """
-WEIGHTSFILE = "/home/fringant2/lisa/data/word_embeddings.collobert-and-weston/lm-weights.txt"
-VOCABFILE = "/home/fringant2/lisa/data/word_embeddings.collobert-and-weston/words.asc"
+#WEIGHTSFILE = "/home/fringant2/lisa/data/word_embeddings.collobert-and-weston/lm-weights.txt"
+#VOCABFILE = "/home/fringant2/lisa/data/word_embeddings.collobert-and-weston/words.asc"
+WEIGHTSFILE = "/home/joseph/data/word_embeddings.collobert-and-weston/lm-weights.txt"
+VOCABFILE = "/home/joseph/data/word_embeddings.collobert-and-weston/words.asc"
 NUMBER_OF_WORDS = 30000
 DIMENSIONS = 50
 UNKNOWN = "UNKNOWN"
diff -r 180d125dc7e2 -r 6dfdcee64e9b sandbox/weights.py
--- a/sandbox/weights.py	Tue Oct 28 11:39:27 2008 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,25 +0,0 @@
-"""
-Routine to initialize weights.
-
-@note: We assume that numpy.random.seed() has already been performed.
-"""
-
-from math import sqrt
-import numpy.random
-def random_weights(nin, nout, scale_by=sqrt(3)):
-    """
-    Generate an initial weight matrix with nin inputs (rows) and nout
-    outputs (cols).
-    Each weight is chosen uniformly at random to be in range:
-        [-scale_by/sqrt(nin), +scale_by/sqrt(nin)]
-    @note: Play with scale_by!
-    Ronan derives scale_by=sqrt(3) because that gives variance of
-    1 to something (I forget, ask Yoshua for the derivation). However,
-    Ronan got better results by accidentally using scale_by=1. Yoshua
-    hypothesizes this is because the variance will get telescopically
-    smaller as we go up the layers [need more explanation of this
-    argument].
-    @note: Things may get even trickier if the same weights are being
-    shared in multiple places.
-    """
-    return (numpy.random.rand(nin, nout) * 2.0 - 1) * scale_by / sqrt(nin)
diff -r 180d125dc7e2 -r 6dfdcee64e9b weights.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/weights.py	Tue Oct 28 11:39:47 2008 -0400
@@ -0,0 +1,40 @@
+"""
+Routine to initialize weights.
+
+@note: We assume that numpy.random.seed() has already been performed.
+"""
+
+from math import pow, sqrt
+import numpy.random
+
+sqrt3 = sqrt(3.0)
+def random_weights(nin, nout, scale_by=1./sqrt3, power=0.5):
+    """
+    Generate an initial weight matrix with nin inputs (rows) and nout
+    outputs (cols).
+    Each weight is chosen uniformly at random to be in range:
+        [-scale_by*sqrt(3)/pow(nin,power), +scale_by*sqrt(3)/pow(nin,power)]
+    @note: Play with scale_by, but reasonable values are <= 1; maybe 1./sqrt3.
+    power=0.5 is strongly recommended (see below).
+
+    Suppose these weights w are used in dot products as follows:
+        output = w' input
+    If w ~ Uniform(-r, r), Var[input_i] = 1, and the input_i's are independent, then
+        Var[w] = r^2 / 3
+        Var[output] = Var[ sum_{i=1}^d w_i input_i ] = d r^2 / 3
+    To make sure that the variance is not changed by the dot product,
+    we therefore want Var[output] = 1 and r = sqrt(3)/sqrt(d). This choice
+    corresponds to scale_by=1 and power=0.5.
+    More generally, Var[output] = Var[input] * scale_by^2.
+
+    Now, if these are the weights of a deep multi-layer neural network,
+    we would like the top layers to be initially more linear, so as to let
+    gradients flow back more easily (this is an explanation by Ronan Collobert).
+    To achieve this we want scale_by smaller than 1.
+    Ronan used scale_by=1/sqrt(3) (by mistake!) and got better results than scale_by=1
+    in the experiment of his ICML'2008 paper.
+    Note that if we have a multi-layer network, ignoring the effect of the tanh non-linearity,
+    the variance of the layer outputs would go down roughly by a factor scale_by^2 at each
+    layer (making the layers more linear as we go up towards the output).
+    """
+    return (numpy.random.rand(nin, nout) * 2.0 - 1) * scale_by * sqrt3 / pow(nin,power)
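
[Note on the weights.py change] A quick, hypothetical sanity check of the variance argument in the new docstring, assuming the new weights.py is importable as weights: with scale_by=1 and power=0.5, the entries of the product x.w should have variance close to Var[input].

    import numpy
    from weights import random_weights   # assumes the new weights.py is on the path

    numpy.random.seed(0)
    nin, nout = 1000, 200
    w = random_weights(nin, nout, scale_by=1., power=0.5)   # r = sqrt(3)/sqrt(nin)
    x = numpy.random.randn(5000, nin)                       # unit-variance inputs
    out = numpy.dot(x, w)
    print(out.var())   # should be close to 1.0, i.e. scale_by^2 * Var[input]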