Mercurial > pylearn
view sandbox/rbm/model.py @ 442:b3315b252824
Finished derivative of softmax gradient.
author | Pascal Lamblin <lamblinp@iro.umontreal.ca> |
---|---|
date | Fri, 22 Aug 2008 15:53:34 -0400 |
parents | 4f61201fa9a9 |
children |
line wrap: on
line source
"""
The model for an autoassociator for sparse inputs, using Ronan Collobert +
Jason Weston's sampling trick (2008).
"""

import random

import numpy
from numpy import dot

import parameters
import pylearn.nnet_ops
import pylearn.sparse_instance


def sigmoid(v):
    """
    Elementwise logistic function, 1 / (1 + exp(-v)).

    @todo: Move to pylearn.more_numpy
    @todo: Fix to avoid floating point overflow (exp(-v) overflows for very
    negative v; clamping the output to 0 / 1 beyond |v| > 30 was the idea
    under consideration).
    """
    return 1.0 / (1.0 + numpy.exp(-v))


def sample(v):
    """
    Draw a binary matrix whose entry [j][i] is 1 with probability v[j][i].

    @param v: 2-D array of probabilities, every entry in [0, 1].
    @return: Float array with the shape of v, containing only 0s and 1s.
    @note: One draw per entry is taken from the stdlib L{random} generator,
    in row-major order, so results are reproducible through random.seed().
    @todo: Move to pylearn.more_numpy
    """
    assert len(v.shape) == 2
    probabilities = numpy.asarray(v)
    assert numpy.all((probabilities >= 0) & (probabilities <= 1))
    # Keep using random.random() (not numpy.random) so that the stream
    # seeded in Model.__init__ drives the sampling.
    draws = numpy.array(
        [random.random() for _ in range(probabilities.size)], dtype=float
    ).reshape(probabilities.shape)
    return (draws < probabilities).astype(float)


def crossentropy(output, target):
    """
    Compute the crossentropy of binary output wrt binary target.

    @param output: Predicted probabilities.
    @param target: Target values, broadcastable against output.
    @return: The per-component crossentropy (same shape as the inputs).
    @note: We do not sum, crossentropy is computed by component.
    @todo: Rewrite as a scalar, and then broadcast to tensor.
    @todo: Move to pylearn.more_numpy
    @todo: Fix to avoid floating point overflow (log(0) when output is
    exactly 0 or 1).
    """
    return -(target * numpy.log(output) + (1 - target) * numpy.log(1 - output))


class Model:
    """
    Autoassociator with a single weight matrix w shared between the upward
    (visible to hidden) and downward (hidden to visible) passes, trained by
    one up-down-up sampling cycle per minibatch (see L{update}).
    """

    def __init__(self, input_dimension, hidden_dimension, learning_rate=0.1,
                 momentum=0.9, weight_decay=0.0002, random_seed=666):
        """
        @param input_dimension: Number of visible units.
        @param hidden_dimension: Number of hidden units.
        @param learning_rate: Scale applied to the gradient estimate.
        @param momentum: Fraction of the previous step added to each step.
        @param weight_decay: Fraction by which w shrinks on every update.
        @param random_seed: Seed for the stdlib L{random} generator (seeded
        here as a side effect) and for the parameter initialization.
        """
        self.input_dimension = input_dimension
        self.hidden_dimension = hidden_dimension
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.weight_decay = weight_decay
        self.random_seed = random_seed
        random.seed(random_seed)

        self.parameters = parameters.Parameters(
            input_dimension=self.input_dimension,
            hidden_dimension=self.hidden_dimension,
            randomly_initialize=True,
            random_seed=self.random_seed)

        # Steps taken by the previous update, used for the momentum term.
        self.prev_dw = 0
        self.prev_db = 0
        self.prev_dc = 0

    def _hidden_probabilities(self, v):
        """Q(h[i] = 1 | v): sigmoid of bias b plus v projected through w."""
        return sigmoid(self.parameters.b + dot(v, self.parameters.w))

    def _visible_probabilities(self, h):
        """P(v[j] = 1 | h): sigmoid of bias c plus h projected through w.T."""
        return sigmoid(self.parameters.c + dot(h, self.parameters.w.T))

    def deterministic_reconstruction(self, v0):
        """
        One up-down cycle, but a mean-field approximation (no sampling).

        @param v0: Matrix of visible vectors, one per row.
        @return: Reconstruction probabilities, same shape as v0.
        """
        q = self._hidden_probabilities(v0)
        p = self._visible_probabilities(q)
        return p

    def deterministic_reconstruction_error(self, v0):
        """
        Per-component crossentropy between the mean-field reconstruction of
        v0 and v0 itself.

        @note: According to Yoshua, -log P(V1 = v0 | tilde(h)(v0)).
        """
        return crossentropy(self.deterministic_reconstruction(v0), v0)

    def update(self, instances):
        """
        Update the L{Model} using one minibatch of training instances.

        Prints the mean reconstruction crossentropy before and after the
        parameter step.

        @param instances: The minibatch, converted to a dense matrix by
        L{pylearn.sparse_instance.to_vector}. Presumably a sequence of dicts
        from feature index to (non-zero) value -- confirm against to_vector.
        @todo: Should assert that nonzero_indices and zero_indices
        are correct (i.e. are truly nonzero/zero).
        @todo: Multiply L{self.weight_decay} by L{self.learning_rate}, as
        done in Semantic Hashing?
        @todo: Decay the biases too?
        """
        minibatch = len(instances)
        v0 = pylearn.sparse_instance.to_vector(instances, self.input_dimension)
        print("old XENT per instance: %s"
              % (numpy.sum(self.deterministic_reconstruction_error(v0)) / minibatch,))

        # One sampled up-down-up cycle.
        q0 = self._hidden_probabilities(v0)
        h0 = sample(q0)
        p0 = self._visible_probabilities(h0)
        v1 = sample(p0)
        q1 = self._hidden_probabilities(v1)

        # Step = learning_rate * minibatch-averaged statistics difference,
        # plus momentum times the step taken by the previous update.
        dw = (self.learning_rate * (dot(v0.T, h0) - dot(v1.T, q1)) / minibatch
              + self.momentum * self.prev_dw)
        db = (self.learning_rate * numpy.sum(h0 - q1, axis=0) / minibatch
              + self.momentum * self.prev_db)
        dc = (self.learning_rate * numpy.sum(v0 - v1, axis=0) / minibatch
              + self.momentum * self.prev_dc)

        # Weight decay shrinks w (only w, not the biases) before the step.
        self.parameters.w *= (1 - self.weight_decay)
        self.parameters.w += dw
        self.parameters.b += db
        self.parameters.c += dc

        # Remember this step for the next update's momentum term. The step
        # used to be stored only under last_*, leaving prev_* at 0 so that
        # momentum never took effect; last_* is kept as an alias for any
        # outside code that reads it.
        self.prev_dw = self.last_dw = dw
        self.prev_db = self.last_db = db
        self.prev_dc = self.last_dc = dc

        print("new XENT per instance: %s"
              % (numpy.sum(self.deterministic_reconstruction_error(v0)) / minibatch,))