view sandbox/rbm/model.py @ 442:b3315b252824

Finished derivative of softmax gradient.
author Pascal Lamblin <lamblinp@iro.umontreal.ca>
date Fri, 22 Aug 2008 15:53:34 -0400
parents 4f61201fa9a9
children
line wrap: on
line source

"""
The model for an autoassociator for sparse inputs, using Ronan Collobert + Jason
Weston's sampling trick (2008).
"""

import parameters

import numpy
from numpy import dot
import random

import pylearn.nnet_ops
import pylearn.sparse_instance

def sigmoid(v):
    """
    Compute the logistic function 1 / (1 + exp(-v)), elementwise.

    Only exp(-|v|) is ever evaluated, so the argument of the exponential is
    never positive and cannot overflow, whatever the magnitude of v.
    @param v: A scalar or numpy array.
    @return: The logistic function of v, in [0, 1], with the shape of v.
    @todo: Move to pylearn.more_numpy
    """
    # z = exp(-|v|) lies in [0, 1]: it may underflow to 0 but never overflows.
    z = numpy.exp(-numpy.abs(v))
    # v >= 0: 1 / (1 + exp(-v)), exactly the textbook expression.
    # v <  0: exp(v) / (1 + exp(v)), the same quantity with numerator and
    #         denominator multiplied by exp(v).
    return numpy.where(v >= 0, 1.0, z) / (1.0 + z)

def sample(v):
    """
    Draw a binary matrix whose entries are 1 with the probabilities in v.

    Entries are visited row by row and one call to random.random() (the
    standard-library generator) is made per entry, so the result is
    reproducible after random.seed().
    @param v: A 2-dimensional array of probabilities, each in [0, 1].
    @return: An array of zeros and ones with the same shape as v.
    @todo: Move to pylearn.more_numpy
    """
    assert len(v.shape) == 2
    drawn = numpy.zeros(v.shape)
    # numpy.ndindex walks the indices in row-major order: last axis fastest.
    for position in numpy.ndindex(*v.shape):
        probability = v[position]
        assert probability >= 0 and probability <= 1
        drawn[position] = 1 if random.random() < probability else 0
    return drawn

def crossentropy(output, target):
    """
    Compute the crossentropy of binary output wrt binary target.

    Each term follows the convention 0 * log(0) = 0: where a term's
    coefficient (target, or 1 - target) is zero, the argument of its log is
    replaced by 1, so a saturated but correct output costs 0 rather than nan.
    A saturated and wrong output still costs +inf.
    @param output: Predicted probabilities in [0, 1] (scalar or array).
    @param target: Target values, broadcastable against output.
    @return: -(target * log(output) + (1 - target) * log(1 - output)).
    @note: We do not sum, crossentropy is computed by component.
    @todo: Rewrite as a scalar, and then broadcast to tensor.
    @todo: Move to pylearn.more_numpy
    """
    # log(1) == 0, so substituting 1 silences a term whose weight is zero
    # without ever evaluating log(0) for it.
    log_on = numpy.log(numpy.where(target == 0, 1.0, output))
    log_off = numpy.log(numpy.where(target == 1, 1.0, 1 - output))
    return -(target * log_on + (1 - target) * log_off)


class Model:
    """
    Autoassociator with tied weights, trained from sampled up-down passes.

    Holds the training hyperparameters and a L{parameters.Parameters} object
    whose C{w}, C{b} and C{c} attributes are used here as the weight matrix,
    the hidden-layer bias and the visible-layer bias respectively.
    @todo: input dimensions should be stored here! not as a global.
    """
    def __init__(self, input_dimension, hidden_dimension, learning_rate = 0.1, momentum = 0.9, weight_decay = 0.0002, random_seed = 666):
        """
        @param input_dimension: Size of the visible (input) layer.
        @param hidden_dimension: Size of the hidden layer.
        @param learning_rate: Scale applied to each gradient estimate.
        @param momentum: Fraction of the previous step added to the new step.
        @param weight_decay: Fraction by which w is shrunk at every update.
        @param random_seed: Seed for the standard-library random module and
        for the initialization of the parameters.
        """
        self.input_dimension    = input_dimension
        self.hidden_dimension   = hidden_dimension
        self.learning_rate      = learning_rate
        self.momentum           = momentum
        self.weight_decay       = weight_decay
        self.random_seed        = random_seed

        # sample() draws from the global standard-library generator.
        random.seed(random_seed)

        self.parameters = parameters.Parameters(input_dimension=self.input_dimension, hidden_dimension=self.hidden_dimension, randomly_initialize=True, random_seed=self.random_seed)
        # Previous steps, read by the momentum term in update(); 0 means
        # "no previous step yet".
        self.prev_dw = 0
        self.prev_db = 0
        self.prev_dc = 0

    def deterministic_reconstruction(self, v0):
        """
        One up-down cycle, but a mean-field approximation (no sampling).
        @param v0: Input matrix, one instance per row.
        @return: The reconstruction probabilities, same shape as v0.
        """
        q = sigmoid(self.parameters.b + dot(v0, self.parameters.w))
        p = sigmoid(self.parameters.c + dot(q, self.parameters.w.T))
        return p

    def deterministic_reconstruction_error(self, v0):
        """
        Componentwise crossentropy between the deterministic reconstruction
        of v0 and v0 itself.
        @note: According to Yoshua, -log P(V1 = v0 | tilde(h)(v0)).
        """
        return crossentropy(self.deterministic_reconstruction(v0), v0)

    def update(self, instances):
        """
        Update the L{Model} using one minibatch of training instances.

        Prints the summed reconstruction crossentropy per instance before and
        after the parameter update.
        @param instances: A sequence of training instances, converted to a
        dense matrix by L{pylearn.sparse_instance.to_vector}; presumably each
        one is a dict from feature index to (non-zero) value.
        @todo: Should assert that nonzero_indices and zero_indices
        are correct (i.e. are truly nonzero/zero).
        @todo: Multiply L{self.weight_decay} by L{self.learning_rate}, as done in Semantic Hashing?
        @todo: Decay the biases too?
        """
        minibatch = len(instances)
        v0 = pylearn.sparse_instance.to_vector(instances, self.input_dimension)
        # Single-argument print(...) emits the same text whether print is a
        # statement (Python 2) or a function (Python 3).
        print("old XENT per instance: %s" % (numpy.sum(self.deterministic_reconstruction_error(v0))/minibatch,))

        # Up (sampled), down (sampled), up (probabilities only).
        q0 = sigmoid(self.parameters.b + dot(v0, self.parameters.w))
        h0 = sample(q0)
        p0 = sigmoid(self.parameters.c + dot(h0, self.parameters.w.T))
        v1 = sample(p0)
        q1 = sigmoid(self.parameters.b + dot(v1, self.parameters.w))

        # Step = learning_rate * (data statistics - reconstruction statistics),
        # averaged over the minibatch, plus momentum times the previous step.
        dw = self.learning_rate * (dot(v0.T, h0) - dot(v1.T, q1)) / minibatch + self.momentum * self.prev_dw
        db = self.learning_rate * numpy.sum(h0 - q1, axis=0) / minibatch + self.momentum * self.prev_db
        dc = self.learning_rate * numpy.sum(v0 - v1, axis=0) / minibatch + self.momentum * self.prev_dc

        # Weight decay shrinks w before the step is applied; biases are not
        # decayed.
        self.parameters.w *= (1 - self.weight_decay)

        self.parameters.w += dw
        self.parameters.b += db
        self.parameters.c += dc

        # Store the step under the names the momentum term reads.  Storing it
        # only as last_* left prev_* at 0 forever, which disabled momentum.
        self.prev_dw = dw
        self.prev_db = db
        self.prev_dc = dc

        # Kept so that code reading the old attribute names still works.
        self.last_dw = dw
        self.last_db = db
        self.last_dc = dc

        print("new XENT per instance: %s" % (numpy.sum(self.deterministic_reconstruction_error(v0))/minibatch,))

#        print
#        print "v[0]:", v0
#        print "Q(h[0][i] = 1 | v[0]):", q0
#        print "h[0]:", h0
#        print "P(v[1][j] = 1 | h[0]):", p0
#        print "XENT(P(v[1][j] = 1 | h[0]) | v0):", numpy.sum(crossentropy(p0, v0))
#        print "v[1]:", v1
#        print "Q(h[1][i] = 1 | v[1]):", q1
#
#        print
#        print v0.T.shape
#        print h0.shape
#        print dot(v0.T, h0).shape
#        print self.parameters.w.shape
#        self.parameters.w += self.learning_rate * (dot(v0.T, h0) - dot(v1.T, q1)) / minibatch
#        print
#        print h0.shape
#        print q1.shape
#        print self.parameters.b.shape
#        self.parameters.b += self.learning_rate * numpy.sum(h0 - q1, axis=0) / minibatch
#        print v0.shape, v1.shape
#        print
#        print self.parameters.c.shape
#        self.parameters.c += self.learning_rate * numpy.sum(v0 - v1, axis=0) / minibatch
#        print self.parameters