changeset 484:3daabc7f94ff

Added Yoshua's explanation
author Joseph Turian <turian@gmail.com>
date Tue, 28 Oct 2008 01:33:27 -0400
parents 2c4738e5e4b2
children e8c37244b54f
files cost.py embeddings/parameters.py sandbox/weights.py
diffstat 3 files changed, 34 insertions(+), 15 deletions(-) [+]
line wrap: on
line diff
--- a/cost.py	Tue Oct 28 00:23:53 2008 -0400
+++ b/cost.py	Tue Oct 28 01:33:27 2008 -0400
@@ -3,6 +3,8 @@
 
 @note: All of these functions return one cost per example. So it is your
 job to perform a tensor.sum over the individual example losses.
+
+@todo: It would be nice to implement a hinge loss, with a particular margin.
 """
 
 import theano.tensor as T
--- a/embeddings/parameters.py	Tue Oct 28 00:23:53 2008 -0400
+++ b/embeddings/parameters.py	Tue Oct 28 01:33:27 2008 -0400
@@ -1,8 +1,10 @@
 """
 Locations of the embedding data files.
 """
-WEIGHTSFILE     = "/home/fringant2/lisa/data/word_embeddings.collobert-and-weston/lm-weights.txt"
-VOCABFILE       = "/home/fringant2/lisa/data/word_embeddings.collobert-and-weston/words.asc"
+#WEIGHTSFILE     = "/home/fringant2/lisa/data/word_embeddings.collobert-and-weston/lm-weights.txt"
+#VOCABFILE       = "/home/fringant2/lisa/data/word_embeddings.collobert-and-weston/words.asc"
+WEIGHTSFILE     = "/home/joseph/data/word_embeddings.collobert-and-weston/lm-weights.txt"
+VOCABFILE       = "/home/joseph/data/word_embeddings.collobert-and-weston/words.asc"
 NUMBER_OF_WORDS = 30000
 DIMENSIONS      = 50
 UNKNOWN         = "UNKNOWN"
--- a/sandbox/weights.py	Tue Oct 28 00:23:53 2008 -0400
+++ b/sandbox/weights.py	Tue Oct 28 01:33:27 2008 -0400
@@ -4,22 +4,37 @@
 @note: We assume that numpy.random.seed() has already been performed.
 """
 
-from math import sqrt
+from math import pow, sqrt
 import numpy.random
-def random_weights(nin, nout, scale_by=sqrt(3)):
+
+sqrt3 = sqrt(3.0)
+def random_weights(nin, nout, scale_by=1./sqrt3, power=0.5):
     """
     Generate an initial weight matrix with nin inputs (rows) and nout
     outputs (cols).
     Each weight is chosen uniformly at random to be in range:
-        [-scale_by/sqrt(nin), +scale_by/sqrt(nin)]
-    @note: Play with scale_by!
-    Ronan derives scale_by=sqrt(3) because that gives variance of
-    1 to something (I forget, ask Yoshua for the derivation). However,
-    Ronan got better results by accidentally using scale_by=1. Yoshua
-    hypothesizes this is because the variance will get telescopically
-    smaller as we go up the layers [need more explanation of this
-    argument].
-    @note: Things may get even trickier if the same weights are being
-    shared in multiple places.
+        [-scale_by*sqrt(3)/pow(nin,power), +scale_by*sqrt(3)/pow(nin,power)]
+    @note: Play with scale_by, but reasonable values are <=1, maybe 1./sqrt3
+    power=0.5 is strongly recommended (see below).
+
+    Suppose these weights w are used in dot products as follows:
+       output = w' input
+    If w_i ~ Uniform(-r,r) and Var[input_i]=1 and the input_i's are independent, then
+       Var[w_i] = r^2/3
+       Var[output] = Var[ sum_{i=1}^d w_i input_i] = d r^2 / 3
+    To make sure that variance is not changed after the dot product,
+    we therefore want Var[output]=1 and r = sqrt(3)/sqrt(d).  This choice
+    corresponds to scale_by=1 and power=0.5 (note the default is scale_by=1/sqrt(3)).
+    More generally, with power=0.5, Var[output] = Var[input] * scale_by^2.
+
+    Now, if these are weights in a deep multi-layer neural network,
+    we would like the top layers to be initially more linear, so as to let
+    gradients flow back more easily (this is an explanation by Ronan Collobert).
+    To achieve this we want scale_by smaller than 1.
+    Ronan used scale_by=1/sqrt(3) (by mistake!) and got better results than scale_by=1
+    in the experiment of his ICML'2008 paper.
+    Note that if we have a multi-layer network, ignoring the effect of the tanh non-linearity,
+    the variance of the layer outputs would go down roughly by a factor 'scale_by^2' at each
+    layer (making the layers more linear as we go up towards the output).
     """
-    return (numpy.random.rand(nin, nout) * 2.0 - 1) * scale_by / sqrt(nin)
+    return (numpy.random.rand(nin, nout) * 2.0 - 1) * scale_by * sqrt3 / pow(nin,power)