# HG changeset patch
# User Joseph Turian
# Date 1236733096 14400
# Node ID b282a5c2f76b6d4f15796203aa8ae9d9b9bdcdd6
# Parent  6c602a86e7119b8965a77979c1abebd3e5fce463
Updated comments in nlpoisson cost

diff -r 6c602a86e711 -r b282a5c2f76b pylearn/algorithms/sandbox/cost.py
--- a/pylearn/algorithms/sandbox/cost.py	Tue Mar 10 19:03:38 2009 -0400
+++ b/pylearn/algorithms/sandbox/cost.py	Tue Mar 10 20:58:16 2009 -0400
@@ -18,8 +18,8 @@
     """
     @staticmethod
     def st_impl(x):
-        if not isinstance(x, int):
-            raise TypeError('type(x) = %s, must be int' % type(x))
+        if not isinstance(x, int) and not isinstance(x, long):
+            raise TypeError('type(x) = %s, must be int or long' % type(x))
         if x == 0.0:
             return 0.0
         v = 0.0
@@ -65,12 +65,27 @@
     Output should be of the form Weight*code+bias, i.e. unsquashed.
     NB this is different than the formulation in Salakhutdinov and Hinton
-    (2007), in which the output is softmax'ed and multiplied by the
-    input document length.
+    (2007), in which the output is softmax'ed and multiplied by the input
+    document length. That is also what Welling et al. (2005) do. It would
+    be useful to try the softmax, because it is better behaved.
 
     There is a beta term that is proportional to document length. We
     are not sure what beta scale is used by the authors. We use 1 as
     the default, but this value might be inappropriate.
+    For numerical reasons, Yoshua recommends choosing beta such that
+    the lambda is expected to be around 1 for words that have a non-zero count.
+    So he would take:
+
+        beta = document_size / unique_words_per_document
+
+    I am not sure the above math is correct; I need to talk to him.
+
+    Yoshua notes that ``there is an x_i log(beta) term missing, if you
+    compare with eqn 2 (i.e., take the log). They did not include it in
+    eqn 3 because it does not depend on the parameters, so the gradient
+    wrt it would be 0. But if you really want log-likelihood it should
+    be included.'' If you want a true log-likelihood, you probably should
+    actually compute the derivative of the entire eqn 2.
 
     Axis is the axis along which we sum the target values, to
     obtain the document length.
 
@@ -80,8 +95,12 @@
     If zerothreshold is non-zero, we threshold the loss:
     If this target dimension is zero and beta * tensor.exp(output) < zerothreshold,
     let this loss be zero.
+
+    @todo: Include logfactorial term
     """
 #    from theano.printing import Print
+#    print dtype(target) # make sure dtype is int32 or int64
+#    print target.dtype
     doclen = tensor.sum(target, axis=axis)
     lambdav = poissonlambda(output, doclen, beta_scale)
     lossterms = lambdav - target*output
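
(Illustrative note, not part of the changeset.) The docstring discussion above relates the implemented loss terms to the full Poisson negative log-likelihood of eqn 2. Below is a minimal NumPy sketch of that relationship; it assumes poissonlambda computes beta_scale * doclen * exp(output), which matches the docstring's description of beta but is not confirmed by this patch, and the toy values of target and output are made up. NumPy stands in for the Theano tensor ops used in the real code.

    import numpy as np
    from scipy.special import gammaln  # gammaln(x + 1) == log(x!)

    # Toy values: one document over a vocabulary of 4 words.
    target = np.array([3.0, 0.0, 1.0, 2.0])    # word counts x_i
    output = np.array([0.2, -1.0, 0.1, 0.4])   # unsquashed Weight*code + bias
    beta_scale = 1.0

    doclen = target.sum()                      # document length
    beta = beta_scale * doclen                 # beta proportional to document length
    lambdav = beta * np.exp(output)            # assumed form of poissonlambda()

    # Terms used in the patched cost (parameter-independent terms dropped):
    lossterms = lambdav - target * output

    # Full negative log-likelihood of eqn 2 adds the x_i*log(beta) and
    # log(x_i!) terms, which do not depend on the model parameters:
    full_nll = lambdav - target * (output + np.log(beta)) + gammaln(target + 1)

    # The two differ only by a constant w.r.t. the parameters, so their
    # gradients w.r.t. output agree.
    print lossterms.sum(), full_nll.sum()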