# HG changeset patch
# User Olivier Breuleux
# Date 1225208387 14400
# Node ID 6dfdcee64e9bda256329d8c72ec29c690f5a9c31
# Parent  180d125dc7e2c01280cc832df82c3972aab534b9
# Parent  4f3c66146f170d5481cc4ce22a8006c92ea72252
merge

diff -r 180d125dc7e2 -r 6dfdcee64e9b algorithms/daa.py
--- a/algorithms/daa.py	Tue Oct 28 11:39:27 2008 -0400
+++ b/algorithms/daa.py	Tue Oct 28 11:39:47 2008 -0400
@@ -4,14 +4,23 @@
 from theano.tensor import nnet as NN
 import numpy as N
 
+from pylearn import cost as cost
+
 class DenoisingAA(T.RModule):
 
-    def __init__(self, input = None, regularize = True, tie_weights = True):
+    def __init__(self, input = None, regularize = True, tie_weights = True,
+            activation_function=NN.sigmoid, reconstruction_cost_function=cost.cross_entropy):
+        """
+        @param reconstruction_cost_function: Should return one cost per example (row)
+        @todo: Default noise level for all daa levels
+        """
         super(DenoisingAA, self).__init__()
 
         # MODEL CONFIGURATION
         self.regularize = regularize
         self.tie_weights = tie_weights
+        self.activation_function = activation_function
+        self.reconstruction_cost_function = reconstruction_cost_function
 
         # ACQUIRE/MAKE INPUT
         if not input:
@@ -94,7 +103,7 @@
 
     def _instance_initialize(self, obj, input_size = None, hidden_size = None, seed = None, **init):
         if (input_size is None) ^ (hidden_size is None):
-            raise ValueError("Must specify hidden_size and target_size or neither.")
+            raise ValueError("Must specify input_size and hidden_size or neither.")
         super(DenoisingAA, self)._instance_initialize(obj, **init)
         if seed is not None:
             R = N.random.RandomState(seed)
@@ -114,31 +123,37 @@
         obj.__hide__ = ['params']
 
     def build_regularization(self):
+        """
+        @todo: Why do we need this function?
+        """
         return T.zero() # no regularization!
 
 
 class SigmoidXEDenoisingAA(DenoisingAA):
+    """
+    @todo: Merge this into the above.
+    @todo: Default noise level for all daa levels
+    """
 
     def build_corrupted_input(self):
         self.noise_level = theano.Member(T.scalar())
         return self.random.binomial(T.shape(self.input), 1, 1 - self.noise_level) * self.input
 
     def hid_activation_function(self, activation):
-        return NN.sigmoid(activation)
+        return self.activation_function(activation)
 
     def out_activation_function(self, activation):
-        return NN.sigmoid(activation)
+        return self.activation_function(activation)
 
     def build_reconstruction_costs(self, output):
-        reconstruction_cost_matrix = -(self.input * T.log(output) + (1 - self.input) * T.log(1 - output))
-        return T.sum(reconstruction_cost_matrix, axis=1)
+        return self.reconstruction_cost_function(self.input, output)
 
     def build_regularization(self):
         self.l2_coef = theano.Member(T.scalar())
         if self.tie_weights:
            return self.l2_coef * T.sum(self.w1 * self.w1)
        else:
-            return self.l2_coef * T.sum(self.w1 * self.w1) + T.sum(self.w2 * self.w2)
+            return self.l2_coef * (T.sum(self.w1 * self.w1) + T.sum(self.w2 * self.w2))
 
     def _instance_initialize(self, obj, input_size = None, hidden_size = None, seed = None, **init):
         init.setdefault('noise_level', 0)
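[Note on the daa.py change] The new reconstruction_cost_function hook expects a callable with the same contract as the functions in pylearn's cost module: given a target and an output, it returns one cost per example (row). A minimal sketch of a custom cost that satisfies this contract (the import path for DenoisingAA is assumed here, and the surrounding training code is not shown):

    import theano.tensor as T
    from pylearn.algorithms.daa import DenoisingAA   # hypothetical import path

    def squared_reconstruction_cost(target, output):
        # One cost per example (row), as reconstruction_cost_function requires.
        return T.sum(T.sqr(target - output), axis=1)

    # Plug a different activation and the custom cost into the generalized class.
    model = DenoisingAA(activation_function=T.tanh,
                        reconstruction_cost_function=squared_reconstruction_cost)
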
diff -r 180d125dc7e2 -r 6dfdcee64e9b algorithms/stacker.py
--- a/algorithms/stacker.py	Tue Oct 28 11:39:27 2008 -0400
+++ b/algorithms/stacker.py	Tue Oct 28 11:39:47 2008 -0400
@@ -5,6 +5,9 @@
 import numpy as N
 
 class Stacker(T.RModule):
+    """
+    @todo: Maybe compile functions on demand, rather than immediately.
+    """
 
     def __init__(self, submodules, input = None, regularize = False):
         super(Stacker, self).__init__()
diff -r 180d125dc7e2 -r 6dfdcee64e9b cost.py
--- a/cost.py	Tue Oct 28 11:39:27 2008 -0400
+++ b/cost.py	Tue Oct 28 11:39:47 2008 -0400
@@ -3,13 +3,15 @@
 
 @note: All of these functions return one cost per example. So it is your
 job to perform a tensor.sum over the individual example losses.
+
+@todo: It would be nice to implement a hinge loss, with a particular margin.
 """
 
 import theano.tensor as T
 from xlogx import xlogx
 
 def quadratic(target, output, axis=1):
-    return T.mean(T.sqr(target - output), axis)
+    return T.mean(T.sqr(target - output), axis=axis)
 
 def cross_entropy(target, output, axis=1):
     """
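[Note on the cost.py change] As the cost.py docstring says, these functions return one cost per example and leave the reduction over examples to the caller. A small illustrative sketch (the variable names target, output, per_example, and total are made up for the example):

    import theano.tensor as T
    from pylearn import cost

    target = T.matrix()   # one example per row
    output = T.matrix()

    per_example = cost.quadratic(target, output)   # vector: one cost per example
    total = T.sum(per_example)                     # caller reduces to a scalar
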
diff -r 180d125dc7e2 -r 6dfdcee64e9b embeddings/parameters.py
--- a/embeddings/parameters.py	Tue Oct 28 11:39:27 2008 -0400
+++ b/embeddings/parameters.py	Tue Oct 28 11:39:47 2008 -0400
@@ -1,8 +1,10 @@
 """
 Locations of the embedding data files.
 """
-WEIGHTSFILE = "/home/fringant2/lisa/data/word_embeddings.collobert-and-weston/lm-weights.txt"
-VOCABFILE = "/home/fringant2/lisa/data/word_embeddings.collobert-and-weston/words.asc"
+#WEIGHTSFILE = "/home/fringant2/lisa/data/word_embeddings.collobert-and-weston/lm-weights.txt"
+#VOCABFILE = "/home/fringant2/lisa/data/word_embeddings.collobert-and-weston/words.asc"
+WEIGHTSFILE = "/home/joseph/data/word_embeddings.collobert-and-weston/lm-weights.txt"
+VOCABFILE = "/home/joseph/data/word_embeddings.collobert-and-weston/words.asc"
 NUMBER_OF_WORDS = 30000
 DIMENSIONS = 50
 UNKNOWN = "UNKNOWN"
diff -r 180d125dc7e2 -r 6dfdcee64e9b sandbox/weights.py
--- a/sandbox/weights.py	Tue Oct 28 11:39:27 2008 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,25 +0,0 @@
-"""
-Routine to initialize weights.
-
-@note: We assume that numpy.random.seed() has already been performed.
-"""
-
-from math import sqrt
-import numpy.random
-def random_weights(nin, nout, scale_by=sqrt(3)):
-    """
-    Generate an initial weight matrix with nin inputs (rows) and nout
-    outputs (cols).
-    Each weight is chosen uniformly at random to be in range:
-        [-scale_by/sqrt(nin), +scale_by/sqrt(nin)]
-    @note: Play with scale_by!
-    Ronan derives scale_by=sqrt(3) because that gives variance of
-    1 to something (I forget, ask Yoshua for the derivation). However,
-    Ronan got better results by accidentally using scale_by=1. Yoshua
-    hypothesizes this is because the variance will get telescopically
-    smaller as we go up the layers [need more explanation of this
-    argument].
-    @note: Things may get even trickier if the same weights are being
-    shared in multiple places.
-    """
-    return (numpy.random.rand(nin, nout) * 2.0 - 1) * scale_by / sqrt(nin)
diff -r 180d125dc7e2 -r 6dfdcee64e9b weights.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/weights.py	Tue Oct 28 11:39:47 2008 -0400
@@ -0,0 +1,40 @@
+"""
+Routine to initialize weights.
+
+@note: We assume that numpy.random.seed() has already been performed.
+"""
+
+from math import pow, sqrt
+import numpy.random
+
+sqrt3 = sqrt(3.0)
+def random_weights(nin, nout, scale_by=1./sqrt3, power=0.5):
+    """
+    Generate an initial weight matrix with nin inputs (rows) and nout
+    outputs (cols).
+    Each weight is chosen uniformly at random to be in range:
+        [-scale_by*sqrt(3)/pow(nin,power), +scale_by*sqrt(3)/pow(nin,power)]
+    @note: Play with scale_by, but reasonable values are <= 1; maybe 1./sqrt3.
+    power=0.5 is strongly recommended (see below).
+
+    Suppose these weights w are used in dot products as follows:
+        output = w' input
+    If w ~ Uniform(-r, r), Var[input_i] = 1, and the input_i's are independent, then
+        Var[w] = r^2 / 3
+        Var[output] = Var[ sum_{i=1}^d w_i input_i ] = d r^2 / 3
+    To make sure that the variance is not changed by the dot product,
+    we therefore want Var[output] = 1 and r = sqrt(3)/sqrt(d). This choice
+    corresponds to scale_by=1 and power=0.5.
+    More generally, Var[output] = Var[input] * scale_by^2.
+
+    Now, if these are the weights of a deep multi-layer neural network,
+    we would like the top layers to be initially more linear, so as to let
+    gradients flow back more easily (this is an explanation by Ronan Collobert).
+    To achieve this we want scale_by smaller than 1.
+    Ronan used scale_by=1/sqrt(3) (by mistake!) and got better results than scale_by=1
+    in the experiment of his ICML'2008 paper.
+    Note that if we have a multi-layer network, ignoring the effect of the tanh non-linearity,
+    the variance of the layer outputs would go down roughly by a factor scale_by^2 at each
+    layer (making the layers more linear as we go up towards the output).
+    """
+    return (numpy.random.rand(nin, nout) * 2.0 - 1) * scale_by * sqrt3 / pow(nin,power)
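
[Note on the weights.py change] A quick, hypothetical sanity check of the variance argument in the new docstring, assuming the new weights.py is importable as weights: with scale_by=1 and power=0.5, the entries of the product x.w should have variance close to Var[input].

    import numpy
    from weights import random_weights   # assumes the new weights.py is on the path

    numpy.random.seed(0)
    nin, nout = 1000, 200
    w = random_weights(nin, nout, scale_by=1., power=0.5)   # r = sqrt(3)/sqrt(nin)
    x = numpy.random.randn(5000, nin)                       # unit-variance inputs
    out = numpy.dot(x, w)
    print(out.var())   # should be close to 1.0, i.e. scale_by^2 * Var[input]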