# HG changeset patch
# User Joseph Turian
# Date 1225205666 14400
# Node ID 4f3c66146f170d5481cc4ce22a8006c92ea72252
# Parent  bb6bdd3b7ff386bc61b6ec3d56f61e0cc6739d2d
Moved weights.py out of sandbox

diff -r bb6bdd3b7ff3 -r 4f3c66146f17 sandbox/weights.py
--- a/sandbox/weights.py	Tue Oct 28 03:31:36 2008 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,40 +0,0 @@
-"""
-Routine to initialize weights.
-
-@note: We assume that numpy.random.seed() has already been performed.
-"""
-
-from math import pow, sqrt
-import numpy.random
-
-sqrt3 = sqrt(3.0)
-def random_weights(nin, nout, scale_by=1./sqrt3, power=0.5):
-    """
-    Generate an initial weight matrix with nin inputs (rows) and nout
-    outputs (cols).
-    Each weight is chosen uniformly at random to be in the range:
-        [-scale_by*sqrt(3)/pow(nin,power), +scale_by*sqrt(3)/pow(nin,power)]
-    @note: Play with scale_by, but reasonable values are <=1, maybe 1./sqrt3.
-           power=0.5 is strongly recommended (see below).
-
-    Suppose these weights w are used in dot products as follows:
-        output = w' input
-    If w_i ~ Uniform(-r,r), Var[input_i]=1, and the input_i's are independent, then
-        Var[w_i] = r^2/3
-        Var[output] = Var[ sum_{i=1}^d w_i input_i ] = d r^2 / 3
-    To make sure that the variance is not changed by the dot product,
-    we therefore want Var[output]=1 and r = sqrt(3)/sqrt(d). This choice
-    corresponds to scale_by=1 and power=0.5 (the default here is scale_by=1./sqrt3).
-    More generally we see that Var[output] = Var[input] * scale_by^2.
-
-    Now, if these are weights in a deep multi-layer neural network,
-    we would like the top layers to be initially more linear, so as to let
-    gradients flow back more easily (this is an explanation by Ronan Collobert).
-    To achieve this we want scale_by smaller than 1.
-    Ronan used scale_by=1/sqrt(3) (by mistake!) and got better results than scale_by=1
-    in the experiments of his ICML'2008 paper.
-    Note that in a multi-layer network, ignoring the effect of the tanh non-linearity,
-    the variance of the layer outputs goes down roughly by a factor scale_by^2 at each
-    layer (making the layers more linear as we go up towards the output).
-    """
-    return (numpy.random.rand(nin, nout) * 2.0 - 1) * scale_by * sqrt3 / pow(nin, power)
diff -r bb6bdd3b7ff3 -r 4f3c66146f17 weights.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/weights.py	Tue Oct 28 10:54:26 2008 -0400
@@ -0,0 +1,40 @@
+"""
+Routine to initialize weights.
+
+@note: We assume that numpy.random.seed() has already been performed.
+"""
+
+from math import pow, sqrt
+import numpy.random
+
+sqrt3 = sqrt(3.0)
+def random_weights(nin, nout, scale_by=1./sqrt3, power=0.5):
+    """
+    Generate an initial weight matrix with nin inputs (rows) and nout
+    outputs (cols).
+    Each weight is chosen uniformly at random to be in the range:
+        [-scale_by*sqrt(3)/pow(nin,power), +scale_by*sqrt(3)/pow(nin,power)]
+    @note: Play with scale_by, but reasonable values are <=1, maybe 1./sqrt3.
+           power=0.5 is strongly recommended (see below).
+
+    Suppose these weights w are used in dot products as follows:
+        output = w' input
+    If w_i ~ Uniform(-r,r), Var[input_i]=1, and the input_i's are independent, then
+        Var[w_i] = r^2/3
+        Var[output] = Var[ sum_{i=1}^d w_i input_i ] = d r^2 / 3
+    To make sure that the variance is not changed by the dot product,
+    we therefore want Var[output]=1 and r = sqrt(3)/sqrt(d). This choice
+    corresponds to scale_by=1 and power=0.5 (the default here is scale_by=1./sqrt3).
+    More generally we see that Var[output] = Var[input] * scale_by^2.
+
+    Now, if these are weights in a deep multi-layer neural network,
+    we would like the top layers to be initially more linear, so as to let
+    gradients flow back more easily (this is an explanation by Ronan Collobert).
+    To achieve this we want scale_by smaller than 1.
+    Ronan used scale_by=1/sqrt(3) (by mistake!) and got better results than scale_by=1
+    in the experiments of his ICML'2008 paper.
+    Note that in a multi-layer network, ignoring the effect of the tanh non-linearity,
+    the variance of the layer outputs goes down roughly by a factor scale_by^2 at each
+    layer (making the layers more linear as we go up towards the output).
+    """
+    return (numpy.random.rand(nin, nout) * 2.0 - 1) * scale_by * sqrt3 / pow(nin, power)
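
A minimal sketch, not part of the patch, of how the variance argument in the docstring can be checked empirically: it assumes the new top-level weights.py from this changeset is importable, and the array sizes and variable names below are illustrative only.

# Illustrative check (assumption: weights.py is on the Python path).
import numpy
import numpy.random
from weights import random_weights

numpy.random.seed(0)
nin, nout = 1000, 200
x = numpy.random.randn(10000, nin)        # unit-variance, independent inputs

# With scale_by=1 and power=0.5, r = sqrt(3)/sqrt(nin), so the output
# variance should stay close to the input variance of 1.
w = random_weights(nin, nout, scale_by=1.0, power=0.5)
print(numpy.dot(x, w).var())              # roughly 1.0

# With the default scale_by=1./sqrt3, the variance shrinks by about
# scale_by^2 = 1/3 per layer, the "more linear top layers" effect.
w_default = random_weights(nin, nout)
print(numpy.dot(x, w_default).var())      # roughly 0.33

Running the sketch with other values of scale_by shows the output variance scaling as scale_by^2, which is why values below 1 keep successive tanh layers closer to their linear regime.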