# HG changeset patch
# User Joseph Turian
# Date 1225205666 14400
# Node ID 4f3c66146f170d5481cc4ce22a8006c92ea72252
# Parent  bb6bdd3b7ff386bc61b6ec3d56f61e0cc6739d2d
Moved weights.py out of sandbox

diff -r bb6bdd3b7ff3 -r 4f3c66146f17 sandbox/weights.py
--- a/sandbox/weights.py	Tue Oct 28 03:31:36 2008 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,40 +0,0 @@
-"""
-Routine to initialize weights.
-
-@note: We assume that numpy.random.seed() has already been performed.
-"""
-
-from math import pow, sqrt
-import numpy.random
-
-sqrt3 = sqrt(3.0)
-def random_weights(nin, nout, scale_by=1./sqrt3, power=0.5):
-    """
-    Generate an initial weight matrix with nin inputs (rows) and nout
-    outputs (cols).
-    Each weight is chosen uniformly at random to be in the range:
-        [-scale_by*sqrt(3)/pow(nin,power), +scale_by*sqrt(3)/pow(nin,power)]
-    @note: Play with scale_by, but reasonable values are <=1, maybe 1./sqrt3.
-           power=0.5 is strongly recommended (see below).
-
-    Suppose these weights w are used in dot products as follows:
-        output = w' input
-    If w_i ~ Uniform(-r,r), Var[input_i]=1, and the input_i's are independent, then
-        Var[w_i] = r^2/3
-        Var[output] = Var[ sum_{i=1}^d w_i input_i ] = d r^2 / 3
-    To make sure that the variance is not changed by the dot product,
-    we therefore want Var[output]=1 and r = sqrt(3)/sqrt(d). This choice
-    corresponds to scale_by=1 and power=0.5 (the default here is scale_by=1./sqrt3).
-    More generally we see that Var[output] = Var[input] * scale_by^2.
-
-    Now, if these are weights in a deep multi-layer neural network,
-    we would like the top layers to be initially more linear, so as to let
-    gradients flow back more easily (this is an explanation by Ronan Collobert).
-    To achieve this we want scale_by smaller than 1.
-    Ronan used scale_by=1/sqrt(3) (by mistake!) and got better results than scale_by=1
-    in the experiments of his ICML'2008 paper.
-    Note that in a multi-layer network, ignoring the effect of the tanh non-linearity,
-    the variance of the layer outputs goes down roughly by a factor scale_by^2 at each
-    layer (making the layers more linear as we go up towards the output).
-    """
-    return (numpy.random.rand(nin, nout) * 2.0 - 1) * scale_by * sqrt3 / pow(nin, power)
diff -r bb6bdd3b7ff3 -r 4f3c66146f17 weights.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/weights.py	Tue Oct 28 10:54:26 2008 -0400
@@ -0,0 +1,40 @@
+"""
+Routine to initialize weights.
+
+@note: We assume that numpy.random.seed() has already been performed.
+"""
+
+from math import pow, sqrt
+import numpy.random
+
+sqrt3 = sqrt(3.0)
+def random_weights(nin, nout, scale_by=1./sqrt3, power=0.5):
+    """
+    Generate an initial weight matrix with nin inputs (rows) and nout
+    outputs (cols).
+    Each weight is chosen uniformly at random to be in the range:
+        [-scale_by*sqrt(3)/pow(nin,power), +scale_by*sqrt(3)/pow(nin,power)]
+    @note: Play with scale_by, but reasonable values are <=1, maybe 1./sqrt3.
+           power=0.5 is strongly recommended (see below).
+
+    Suppose these weights w are used in dot products as follows:
+        output = w' input
+    If w_i ~ Uniform(-r,r), Var[input_i]=1, and the input_i's are independent, then
+        Var[w_i] = r^2/3
+        Var[output] = Var[ sum_{i=1}^d w_i input_i ] = d r^2 / 3
+    To make sure that the variance is not changed by the dot product,
+    we therefore want Var[output]=1 and r = sqrt(3)/sqrt(d). This choice
+    corresponds to scale_by=1 and power=0.5 (the default here is scale_by=1./sqrt3).
+    More generally we see that Var[output] = Var[input] * scale_by^2.
+
+    Now, if these are weights in a deep multi-layer neural network,
+    we would like the top layers to be initially more linear, so as to let
+    gradients flow back more easily (this is an explanation by Ronan Collobert).
+    To achieve this we want scale_by smaller than 1.
+    Ronan used scale_by=1/sqrt(3) (by mistake!) and got better results than scale_by=1
+    in the experiments of his ICML'2008 paper.
+    Note that in a multi-layer network, ignoring the effect of the tanh non-linearity,
+    the variance of the layer outputs goes down roughly by a factor scale_by^2 at each
+    layer (making the layers more linear as we go up towards the output).
+    """
+    return (numpy.random.rand(nin, nout) * 2.0 - 1) * scale_by * sqrt3 / pow(nin, power)
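
A minimal sketch, not part of the patch, of how the variance argument in the docstring can be checked empirically: it assumes the new top-level weights.py from this changeset is importable, and the array sizes and variable names below are illustrative only.

# Illustrative check (assumption: weights.py is on the Python path).
import numpy
import numpy.random
from weights import random_weights

numpy.random.seed(0)
nin, nout = 1000, 200
x = numpy.random.randn(10000, nin)        # unit-variance, independent inputs

# With scale_by=1 and power=0.5, r = sqrt(3)/sqrt(nin), so the output
# variance should stay close to the input variance of 1.
w = random_weights(nin, nout, scale_by=1.0, power=0.5)
print(numpy.dot(x, w).var())              # roughly 1.0

# With the default scale_by=1./sqrt3, the variance shrinks by about
# scale_by^2 = 1/3 per layer, the "more linear top layers" effect.
w_default = random_weights(nin, nout)
print(numpy.dot(x, w_default).var())      # roughly 0.33

Running the sketch with other values of scale_by shows the output variance scaling as scale_by^2, which is why values below 1 keep successive tanh layers closer to their linear regime.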