# HG changeset patch
# User Joseph Turian
# Date 1236733096 14400
# Node ID b282a5c2f76b6d4f15796203aa8ae9d9b9bdcdd6
# Parent  6c602a86e7119b8965a77979c1abebd3e5fce463
Updated comments in nlpoisson cost

diff -r 6c602a86e711 -r b282a5c2f76b pylearn/algorithms/sandbox/cost.py
--- a/pylearn/algorithms/sandbox/cost.py	Tue Mar 10 19:03:38 2009 -0400
+++ b/pylearn/algorithms/sandbox/cost.py	Tue Mar 10 20:58:16 2009 -0400
@@ -18,8 +18,8 @@
     """
     @staticmethod
     def st_impl(x):
-        if not isinstance(x, int):
-            raise TypeError('type(x) = %s, must be int' % type(x))
+        if not isinstance(x, int) and not isinstance(x, long):
+            raise TypeError('type(x) = %s, must be int or long' % type(x))
         if x == 0.0:
             return 0.0
         v = 0.0
@@ -65,12 +65,27 @@
     Output should be of the form Weight*code+bias, i.e. unsquashed.
     NB this is different than the formulation in Salakhutdinov and Hinton
-    (2007), in which the output is softmax'ed and multiplied by the
-    input document length.
+    (2007), in which the output is softmax'ed and multiplied by the input
+    document length. That is also what Welling et al. (2005) do. It would
+    be useful to try the softmax, because it is better behaved.
 
     There is a beta term that is proportional to document length. We
     are not sure what beta scale is used by the authors. We use 1 as
     the default, but this value might be inappropriate.
+    For numerical reasons, Yoshua recommends choosing beta such that
+    the lambda is expected to be around 1 for words that have a non-zero count.
+    So he would take:
+
+        beta = document_size / unique_words_per_document
+
+    I am not sure the above math is correct; I need to talk to him.
+
+    Yoshua notes that ``there is an x_i log(beta) term missing, if you
+    compare with eqn 2 (i.e., take the log). They did not include it in
+    eqn 3 because it does not depend on the parameters, so the gradient
+    wrt it would be 0. But if you really want log-likelihood it should
+    be included.'' If you want a true log-likelihood, you probably should
+    actually compute the derivative of the entire eqn 2.
 
     Axis is the axis along which we sum the target values, to
     obtain the document length.
 
@@ -80,8 +95,12 @@
     If zerothreshold is non-zero, we threshold the loss:
     If this target dimension is zero and beta * tensor.exp(output) < zerothreshold,
     let this loss be zero.
+
+    @todo: Include logfactorial term
     """
 #    from theano.printing import Print
+#    print dtype(target) # make sure dtype is int32 or int64
+#    print target.dtype
     doclen = tensor.sum(target, axis=axis)
     lambdav = poissonlambda(output, doclen, beta_scale)
     lossterms = lambdav - target*output
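
(Illustrative note, not part of the changeset.) The docstring discussion above relates the implemented loss terms to the full Poisson negative log-likelihood of eqn 2. Below is a minimal NumPy sketch of that relationship; it assumes poissonlambda computes beta_scale * doclen * exp(output), which matches the docstring's description of beta but is not confirmed by this patch, and the toy values of target and output are made up. NumPy stands in for the Theano tensor ops used in the real code.

    import numpy as np
    from scipy.special import gammaln  # gammaln(x + 1) == log(x!)

    # Toy values: one document over a vocabulary of 4 words.
    target = np.array([3.0, 0.0, 1.0, 2.0])    # word counts x_i
    output = np.array([0.2, -1.0, 0.1, 0.4])   # unsquashed Weight*code + bias
    beta_scale = 1.0

    doclen = target.sum()                      # document length
    beta = beta_scale * doclen                 # beta proportional to document length
    lambdav = beta * np.exp(output)            # assumed form of poissonlambda()

    # Terms used in the patched cost (parameter-independent terms dropped):
    lossterms = lambdav - target * output

    # Full negative log-likelihood of eqn 2 adds the x_i*log(beta) and
    # log(x_i!) terms, which do not depend on the model parameters:
    full_nll = lambdav - target * (output + np.log(beta)) + gammaln(target + 1)

    # The two differ only by a constant w.r.t. the parameters, so their
    # gradients w.r.t. output agree.
    print lossterms.sum(), full_nll.sum()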