# HG changeset patch
# User Joseph Turian
# Date 1225172007 14400
# Node ID 3daabc7f94ff786af5460f00212fbc231b0ba1b5
# Parent 2c4738e5e4b2c906621a4038ffbb17065cce8111
Added Yoshua's explanation

diff -r 2c4738e5e4b2 -r 3daabc7f94ff cost.py
--- a/cost.py	Tue Oct 28 00:23:53 2008 -0400
+++ b/cost.py	Tue Oct 28 01:33:27 2008 -0400
@@ -3,6 +3,8 @@
 @note: All of these functions return one cost per example. So it is your
 job to perform a tensor.sum over the individual example losses.
+
+@todo: It would be nice to implement a hinge loss, with a particular margin.
 """
 
 import theano.tensor as T
diff -r 2c4738e5e4b2 -r 3daabc7f94ff embeddings/parameters.py
--- a/embeddings/parameters.py	Tue Oct 28 00:23:53 2008 -0400
+++ b/embeddings/parameters.py	Tue Oct 28 01:33:27 2008 -0400
@@ -1,8 +1,10 @@
 """
 Locations of the embedding data files.
 """
-WEIGHTSFILE = "/home/fringant2/lisa/data/word_embeddings.collobert-and-weston/lm-weights.txt"
-VOCABFILE = "/home/fringant2/lisa/data/word_embeddings.collobert-and-weston/words.asc"
+#WEIGHTSFILE = "/home/fringant2/lisa/data/word_embeddings.collobert-and-weston/lm-weights.txt"
+#VOCABFILE = "/home/fringant2/lisa/data/word_embeddings.collobert-and-weston/words.asc"
+WEIGHTSFILE = "/home/joseph/data/word_embeddings.collobert-and-weston/lm-weights.txt"
+VOCABFILE = "/home/joseph/data/word_embeddings.collobert-and-weston/words.asc"
 NUMBER_OF_WORDS = 30000
 DIMENSIONS = 50
 UNKNOWN = "UNKNOWN"
diff -r 2c4738e5e4b2 -r 3daabc7f94ff sandbox/weights.py
--- a/sandbox/weights.py	Tue Oct 28 00:23:53 2008 -0400
+++ b/sandbox/weights.py	Tue Oct 28 01:33:27 2008 -0400
@@ -4,22 +4,37 @@
 @note: We assume that numpy.random.seed() has already been performed.
 """
 
-from math import sqrt
+from math import pow, sqrt
 import numpy.random
-def random_weights(nin, nout, scale_by=sqrt(3)):
+
+sqrt3 = sqrt(3.0)
+def random_weights(nin, nout, scale_by=1./sqrt3, power=0.5):
     """
     Generate an initial weight matrix with nin inputs (rows) and nout
     outputs (cols).
     Each weight is chosen uniformly at random to be in range:
-        [-scale_by/sqrt(nin), +scale_by/sqrt(nin)]
-    @note: Play with scale_by!
-    Ronan derives scale_by=sqrt(3) because that gives variance of
-    1 to something (I forget, ask Yoshua for the derivation). However,
-    Ronan got better results by accidentally using scale_by=1. Yoshua
-    hypothesizes this is because the variance will get telescopically
-    smaller as we go up the layers [need more explanation of this
-    argument].
-    @note: Things may get even trickier if the same weights are being
-    shared in multiple places.
+        [-scale_by*sqrt(3)/pow(nin,power), +scale_by*sqrt(3)/pow(nin,power)]
+    @note: Play with scale_by, but reasonable values are <= 1 (e.g. 1./sqrt3);
+    power=0.5 is strongly recommended (see below).
+
+    Suppose these weights w are used in dot products as follows:
+        output = w' input
+    If w_i ~ Uniform(-r, r) and Var[input_i] = 1 and the input_i's are independent, then
+        Var[w_i] = r^2 / 3
+        Var[output] = Var[sum_{i=1}^d w_i input_i] = d r^2 / 3
+    To make sure that the variance is not changed by the dot product,
+    we therefore want Var[output] = 1 and r = sqrt(3)/sqrt(d). This choice
+    corresponds to scale_by=1 and power=0.5.
+    More generally, we see that Var[output] = Var[input] * scale_by^2.
+
+    Now, if these are weights in a deep multi-layer neural network,
+    we would like the top layers to be initially more linear, so as to let
+    gradients flow back more easily (this is an explanation by Ronan Collobert).
+    To achieve this we want scale_by smaller than 1.
+    Ronan used scale_by=1/sqrt(3) (by mistake!) and got better results than scale_by=1
+    in the experiments of his ICML'2008 paper.
+    Note that if we have a multi-layer network and ignore the effect of the tanh
+    non-linearity, the variance of the layer outputs goes down roughly by a factor
+    scale_by^2 at each layer (making the layers more linear as we go up towards the output).
     """
-    return (numpy.random.rand(nin, nout) * 2.0 - 1) * scale_by / sqrt(nin)
+    return (numpy.random.rand(nin, nout) * 2.0 - 1) * scale_by * sqrt3 / pow(nin, power)
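
A quick numpy check (not part of the patch) of the variance argument in the random_weights
docstring above. It assumes sandbox/weights.py is importable as `weights`, and the printed
variances are only approximate.

import numpy
from weights import random_weights, sqrt3  # assumes sandbox/weights.py is on the path

numpy.random.seed(0)
nin, nout = 500, 200
x = numpy.random.randn(1000, nin)          # unit-variance, independent inputs

# scale_by=1, power=0.5 should roughly preserve the input variance ...
print(numpy.dot(x, random_weights(nin, nout, scale_by=1.0)).var())       # ~ 1.0

# ... while the default scale_by=1/sqrt(3) shrinks it by about scale_by^2 = 1/3.
print(numpy.dot(x, random_weights(nin, nout, scale_by=1./sqrt3)).var())  # ~ 0.33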
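
Similarly, for the @todo added to cost.py, a minimal sketch of what a margin hinge loss
could look like in Theano, keeping that module's convention of returning one cost per
example. The function name `hinge` and the +1/-1 target encoding are assumptions, not
part of the patch.

import theano.tensor as T

def hinge(output, target, margin=1.0):
    """
    One hinge-loss value per example: max(0, margin - target * output),
    where target is +1 or -1. As with the other costs, summing over
    examples is left to the caller.
    """
    return T.maximum(0, margin - target * output)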