view pylearn/sampling/hmc.py @ 1205:5525cf3faaa2

requirements: Question about the serialization requirement
author Olivier Delalleau <delallea@iro>
date Tue, 21 Sep 2010 10:48:01 -0400
parents 492473059b37
children 34512d1d4e9c
line wrap: on
line source

"""Hybrid / Hamiltonian Monte Carlo Sampling

This algorithm is described in Radford Neal's PhD Thesis, pages 63--70.

"""
import sys
import logging
import numpy as np
from theano import function, shared
from theano import tensor as TT
import theano.sparse #installs the sparse shared var handler


#TODO: 
#  Consider how to refactor this code so that the key functions /functionality are provided as
#  theano expressions??
#  It could be limiting that the implementation requires that the position be a list of shared
#  variables, and basically uses procedural logic on top of that.

#TODO: 
# Consider a heuristic for updating the *MASS* of the particles.  We might want the mass to be
# such that the momentum is in the same range as the gradient on the energy.  Look at Radford's
# recent book chapter, maybe there are hints. (2010).

class HMC_sampler(object):
    """Batch-wise Hybrid Monte-Carlo sampler


    The `draw` function advances the markov chain and returns the current sample by calling
    `simulate` and `get_position` in sequence.


    :note:

      The 'mass' of so-called particles is taken to be 1, so that kinetic energy (K) is the sum
      of squared velocities (p).

        :math:`K = \sum_i p_i^2 / 2`

    """

    # Constants taken from Marc'Aurelio's 'train_mcRBM.py' file found in the code online for his
    # paper.
    stepsize_dec = 0.98
    stepsize_min = 0.001
    stepsize_max = 0.25
    stepsize_inc = 1.02
    avg_acceptance_slowness = 0.9  # used in geometric avg. 1.0 would be not moving at all
    n_steps=20

    def __init__(self, positions, energy_fn, 
            velocity=None,
            initial_stepsize=0.01,
            target_acceptance_rate=0.9,
            seed=12345,
            dtype=theano.config.floatX):
        """
        :param positions: list of shared ndarray variables.

        :param energy: 

            callable such that energy_fn(positions) 
            returns theano vector of energies.  
            The len of this vector is the batchsize.

            The sum of this energy vector must be differentiable (with theano.tensor.grad) with
            respect to the positions for HMC sampling to work.

        """

        self.stepsize = initial_stepsize
        batchsize = positions[0].value.shape[0]
        self.target_acceptance_rate = target_acceptance_rate
        self.avg_acceptance_rate = self.target_acceptance_rate
        self.s_rng = TT.shared_randomstreams.RandomStreams(seed)
        self.positions = positions
        if velocity is None:
            self.velocity = [shared(np.zeros_like(q.value)) for q in self.positions]
        else:
            self.velocity = velocity

        self.start_positions = [shared(np.zeros_like(q.value)) for q in self.positions]
        self.initial_hamiltonian = shared(np.zeros(batchsize).astype(dtype) + float('inf'))

        energy = energy_fn(positions)
        # assuming mass is 1
        kinetic_energy = 0.5 * sum([(p**2).sum(axis=1) for p in self.velocity])
        hamiltonian = energy + kinetic_energy

        dE_dpos = TT.grad(energy.sum(), self.positions)

        s_stepsize = TT.scalar('stepsize')

        self.velocity_step = function([s_stepsize],
                [dE_dpos[0].norm(2)],
                updates=[(p, p - s_stepsize*dE_dq) for p,dE_dq in zip(self.velocity,dE_dpos)])
        self.position_step = function([s_stepsize], [],
                updates=[(q, q + s_stepsize*p) for (q,p) in zip(self.positions, self.velocity)])

        self.save_start_positions = function([], [],
                updates=[(self.initial_hamiltonian, hamiltonian)] + \
                        [(sq, q) for sq, q in zip(self.start_positions, self.positions)])

        # sample the initial velocity from a
        # gaussian with mean 0.
        # Note: I think the fact that this distribution is symmetric about zero justifies not
        # sampling forward versus backward dynamics.
        self.randomize_velocity = function([], [],
                updates=[(p, self.s_rng.normal(size=p.value.shape)) for p in self.velocity])


        # accept-reject according to Metropolis algo

        accept = TT.exp(self.initial_hamiltonian - hamiltonian) - self.s_rng.uniform(size=(batchsize,)) >= 0

        self.accept_reject_positions = function([], accept.mean(),
                updates=[
                    (self.initial_hamiltonian, 
                        TT.switch(accept, hamiltonian, self.initial_hamiltonian))] + [
                    (q, 
                        TT.switch(accept.dimshuffle(0, *(('x',)*(sq.ndim-1))), q, sq)) 
                        for sq, q in zip(self.start_positions, self.positions)])

    def simulate(self, n_steps=None):
        if n_steps is None:
            n_steps = self.n_steps

        # updates self.velocity with new random numbers
        self.randomize_velocity()
        self.save_start_positions()

        if 0:
            # not necessary for random initial direction
            if np.random.rand() > 0.5:
                step_amount = self.stepsize
            else:
                step_amount = -self.stepsize
        else:
            step_amount = self.stepsize

        if 0:
            print "initial",
            print "kinetic E", self.prev_energy.value ,
            print (self.velocity[0].value**2).sum(axis=1),
            print (self.velocity[0].value**2).sum(axis=1) + self.prev_energy.value


        # Note on the order of leap-frog steps:
        #
        # the leap-frog algorithm can start with either position or velocity updates first,
        # but one of them has to run an extra time (because of two half-steps).
        # The position_step is cheap to evaluate, whereas the velocity_step is expensive,
        # so we evaluate the position_step the extra time.
        #
        # At the same time, we cannot drop the last half-step update of the position, because
        # the position is actually the terms we care about.

        #opening half-step in leap-frog algorithm
        self.position_step(step_amount/2.0)

        # full leap-frog steps
        for ss in range(n_steps):
            self.velocity_step(step_amount)
            if ss == n_steps-1:
                # closing half-step in the leap-frog algorithm
                self.position_step(step_amount/2.0)
            else:
                self.position_step(step_amount)

        acceptance_rate = self.accept_reject_positions()
        self.avg_acceptance_rate = self.avg_acceptance_slowness * self.avg_acceptance_rate \
                + (1.0 - self.avg_acceptance_slowness) * acceptance_rate

        if self.avg_acceptance_rate < self.target_acceptance_rate:
            self.stepsize = max(self.stepsize*self.stepsize_dec,self.stepsize_min)
        else:
            self.stepsize = min(self.stepsize*self.stepsize_inc,self.stepsize_max)

        if 0:
            print "final kinetic E", self.prev_energy.value ,
            print (self.velocity[0].value**2).sum(axis=1),
            print (self.velocity[0].value**2).sum(axis=1) + self.prev_energy.value


        # post-condition: 
        # self.positions contains a new sample from our markov chain

        # it is not returned from this function because accessing the .value of a shared
        # variable can require making a copy
        # see `draw()` or `get_position` for that behaviour.

    def get_position(self):
        return [q.value for q in self.positions]

    def draw(self, n_steps=None):
        """Return the current sample in the Markov chain as a list of numpy arrays

        The size of the arrays will match the size of the `initial_position` argument to
        __init__.
        """
        self.simulate(n_steps=n_steps)
        return self.get_position()