# pylearn/algorithms/sandbox/cost.py

"""
Cost functions.

@note: All of these functions return one cost per example. So it is your
job to perform a tensor.sum over the individual example losses.
"""

import theano as T
from theano import tensor, scalar
import numpy

class UndefinedGradient(Exception):
    """
    Raised by UndefinedGradientOp to indicate that the gradient is undefined mathematically.
    """
    pass
from theano import gof
class UndefinedGradientOp(gof.Op):
    def perform(self, x=None):
        # The gradient is mathematically undefined, so evaluating this Op
        # always raises.
        raise UndefinedGradient(x)
undefined_gradient = UndefinedGradientOp()

class LogFactorial(scalar.UnaryScalarOp):
    """
    Compute log x!.
    @todo: Rewrite so that it uses INTs not FLOATs.
    @todo: Move this to Theano.
    @todo: This function is slow, probably want to cache the values.
    """
    @staticmethod
    def st_impl(x):
        if not isinstance(x, (int, long)):
            raise TypeError('type(x) = %s, must be int or long' % type(x))
        if x == 0:
            return 0.0
        # log x! = sum_{i=1}^{x} log i
        v = 0.0
        for i in range(1, x + 1):
            v += numpy.log(i)
        return v
    def impl(self, x):
        return LogFactorial.st_impl(x)
    def grad(self, (x,), (gz,)):
        undefined_gradient(self)
#    def grad(self, (x,), (gz,)):
#        raise NotImplementedError('gradient not defined over discrete values')
#        return None
#        return [gz * (1 + scalar.log(x))]
#    def c_code(self, node, name, (x,), (z,), sub):
#        if node.inputs[0].type in [scalar.float32, scalar.float64]:
#            return """%(z)s =
#                %(x)s == 0.0
#                ? 0.0
#                : %(x)s * log(%(x)s);""" % locals()
#        raise NotImplementedError('only floatingpoint is implemented')
scalar_logfactorial = LogFactorial(scalar.upgrade_to_float, name='scalar_logfactorial')
logfactorial = tensor.Elemwise(scalar_logfactorial, name='logfactorial')
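
# A small usage sketch (not in the original file), mirroring the commented-out
# test at the bottom of this module: wrap a constant array and compile the
# elemwise op.  log 2! ~= 0.693 and log 5! = log 120 ~= 4.787.
#x = tensor.as_tensor(numpy.array([0, 1, 2, 5]))
#f = T.function([], logfactorial(x))
#print f()    # approximately [0., 0., 0.693, 4.787]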


def poissonlambda(unscaled_output, doclen, beta_scale):
    """
    A continuous parameter lambda_i, the expected number of occurrences of
    word i in the document.  Note that this must be positive, which is why
    Ranzato and Szummer (2008) use an exponential.

    Yoshua: I don't like exponentials to guarantee positivity. The softplus
    is numerically much better behaved (but you might want to try both to
    see if it makes a difference; a softplus variant is sketched after this
    function).

    @todo: Maybe there are more sensible ways to set the beta_scale.
    """
    beta = beta_scale * doclen
    return beta * tensor.exp(unscaled_output)
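
# A sketch (not in the original file) of the softplus alternative mentioned in
# the docstring above.  The softplus also keeps lambda positive, but grows only
# linearly for large pre-activations, which is numerically better behaved.
#def poissonlambda_softplus(unscaled_output, doclen, beta_scale):
#    beta = beta_scale * doclen
#    return beta * tensor.nnet.softplus(unscaled_output)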

def nlpoisson(target, output, beta_scale=1, axis=0, sumloss=True, zerothreshold=0):
    """
    The negative log Poisson regression probability.
    From Ranzato and Szummer (2008).

    Output should be of the form Weight*code+bias, i.e. unsquashed.
    NB this is different from the formulation in Salakhutdinov and Hinton
    (2007), in which the output is softmax'ed and multiplied by the input
    document length. That is also what Welling et al. (2005) do.  It would
    be useful to try the softmax, because it is better behaved.

    There is a beta term that is proportional to document length. We are
    not sure what beta_scale the authors use; we use 1 as the default, but
    this value might be inappropriate.
    For numerical reasons, Yoshua recommends choosing beta such that
    the lambda is expected to be around 1 for words that have a non-zero count.
    So he would take:

      beta = document_size / unique_words_per_document

    I am not sure the above math is correct; I need to talk to him about it.

    Yoshua notes that ``there is a x_i log(beta) term missing, if you
    compare with eqn 2 (i.e., take the log). They did not include it in
    eqn 3 because it does not depend on the parameters, so the gradient
    wrt it would be 0. But if you really want log-likelihood it should
    be included.'' If you want a true log-likelihood, you probably should
    actually compute the derivative of the entire eqn 2.
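
    Concretely, filling in the algebra: with lambda_i = beta * exp(o_i),
    the negative log Poisson likelihood of the count x_i is

        -log P(x_i | lambda_i) = lambda_i - x_i log(lambda_i) + log(x_i!)
                               = beta exp(o_i) - x_i o_i - x_i log(beta) + log(x_i!)

    The code below keeps only the first two terms; the x_i log(beta) and
    log(x_i!) terms are constant with respect to the parameters.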

    Axis is the axis along which we sum the target values, to obtain
    the document length.

    If sumloss, we sum the loss along axis.

    If zerothreshold is non-zero, we threshold the loss:
        if this target dimension is zero and beta * tensor.exp(output)
        < zerothreshold, let this loss be zero.

    @todo: Include logfactorial term
    @todo: zerothreshold is documented above but not yet used in the body below.
    """
#    from theano.printing import Print
#    print dtype(target)        # make sure dtype is int32 or int64
#    print target.dtype
    doclen = tensor.sum(target, axis=axis)
    lambdav = poissonlambda(output, doclen, beta_scale)
    lossterms = lambdav - target*output
    if sumloss:
        return tensor.sum(lossterms, axis=axis)
    else:
        return lossterms
#    return tensor.sum(beta * tensor.exp(output) - target*output + logfactorial(target), axis=axis)
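
# A minimal usage sketch (not in the original file), along the lines of the
# commented-out non-Theano test below: build the symbolic loss for a single
# document's word-count vector and compile it.  Variable names are illustrative.
#target = tensor.dvector('target')    # word counts for one document
#output = tensor.dvector('output')    # unsquashed outputs, i.e. Weight*code+bias
#loss = nlpoisson(target, output)     # scalar loss for this document
#f = T.function([target, output], loss)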


#import numpy
#def nlpoisson_nontheano(target, output, beta_scale=1, axis=0):
#    doclen = numpy.sum(target, axis=axis)
#    print "doclen", doclen
#    beta = beta_scale * doclen
#    print "beta", beta
#    print "exp", numpy.exp(output)
#    print "beta * exp", beta * numpy.exp(output)
#    print "x * y", target * output
#
#    import theano.tensor as TT
#    x = TT.as_tensor(target)
#    o = logfactorial(x)
#    f = T.function([],o)
#    logf = f()
#    print "log factorial(x)", logf
#    print "beta * exp - dot + log factorial", beta * numpy.exp(output) - target*output + f()
#    print "total loss", numpy.sum(beta * numpy.exp(output) - target*output + f(), axis=axis)
#
##    return beta * numpy.exp(output) - numpy.dot(target, output)
##            #+ logfactorial(target)
#
#import numpy
#target = numpy.array([0, 0, 1, 1, 2, 2, 100, 100])
##output = numpy.array([0., 0.5, 1., 0.5, 2., 0.5, 100., 0.5])
#output = numpy.array([0., 1, 1., 0, 1, 0, 5, 1])
#nlpoisson_nontheano(target, output)