comparison deep/rbm/rbm.py @ 347:9685e9d94cc4

base class for an rbm
author goldfinger
date Mon, 19 Apr 2010 08:16:56 -0400
children d81284e13d77
comparison 338:fca22114bb23 vs 347:9685e9d94cc4
1 """This tutorial introduces restricted boltzmann machines (RBM) using Theano.
2
3 Boltzmann Machines (BMs) are a particular form of energy-based model which
4 contain hidden variables. Restricted Boltzmann Machines further restrict BMs
5 to those without visible-visible and hidden-hidden connections.
6 """

import os
import numpy, time, cPickle, gzip, PIL.Image

import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

from utils import tile_raster_images
from logistic_sgd import load_data

class RBM(object):
    """Restricted Boltzmann Machine (RBM)"""
    def __init__(self, input=None, n_visible=784, n_hidden=1000,
                 W=None, hbias=None, vbias=None, numpy_rng=None,
                 theano_rng=None):
        """
        RBM constructor. Defines the parameters of the model along with
        basic operations for inferring hidden from visible (and vice-versa),
        as well as for performing CD updates.

        :param input: None for standalone RBMs or symbolic variable if RBM is
        part of a larger graph.

        :param n_visible: number of visible units

        :param n_hidden: number of hidden units

        :param W: None for standalone RBMs or symbolic variable pointing to a
        shared weight matrix in case RBM is part of a DBN network; in a DBN,
        the weights are shared between RBMs and layers of an MLP

        :param hbias: None for standalone RBMs or symbolic variable pointing
        to a shared hidden units bias vector in case RBM is part of a
        different network

        :param vbias: None for standalone RBMs or a symbolic variable
        pointing to a shared visible units bias
        """

        self.n_visible = n_visible
        self.n_hidden = n_hidden

        if numpy_rng is None:
            # create a number generator
            numpy_rng = numpy.random.RandomState(1234)

        if theano_rng is None:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        if W is None:
            # W is initialized with `initial_W`, which is uniformly sampled
            # between -sqrt(6./(n_hidden+n_visible)) and sqrt(6./(n_hidden+n_visible));
            # the output of uniform is converted using asarray to dtype
            # theano.config.floatX so that the code is runnable on GPU.
            # `numpy_rng` (created above) is used so the initialization is seeded.
            initial_W = numpy.asarray(numpy_rng.uniform(
                    low=-numpy.sqrt(6. / (n_hidden + n_visible)),
                    high=numpy.sqrt(6. / (n_hidden + n_visible)),
                    size=(n_visible, n_hidden)),
                    dtype=theano.config.floatX)
            # theano shared variables for weights and biases
            W = theano.shared(value=initial_W, name='W')

        if hbias is None:
            # create shared variable for hidden units bias
            hbias = theano.shared(value=numpy.zeros(n_hidden,
                                  dtype=theano.config.floatX), name='hbias')

        if vbias is None:
            # create shared variable for visible units bias
            vbias = theano.shared(value=numpy.zeros(n_visible,
                                  dtype=theano.config.floatX), name='vbias')

        # initialize input layer for standalone RBM or layer0 of DBN;
        # `is None` is used because truth-testing a symbolic variable is unreliable
        if input is None:
            input = T.matrix('input')
        self.input = input

        self.W = W
        self.hbias = hbias
        self.vbias = vbias
        self.theano_rng = theano_rng
        # **** WARNING: It is not a good idea to put things in this list
        # other than shared variables created in this function.
        self.params = [self.W, self.hbias, self.vbias]
        # symbolic minibatch size, used to normalize the CD gradients
        self.batch_size = self.input.shape[0]

    def free_energy(self, v_sample):
        ''' Function to compute the free energy, one value per example '''
        wx_b = T.dot(v_sample, self.W) + self.hbias
        # per-example terms; summing over the batch here would incorrectly
        # couple examples in the pseudo-likelihood cost
        vbias_term = T.dot(v_sample, self.vbias)
        hidden_term = T.sum(T.log(1 + T.exp(wx_b)), axis=1)
        return -hidden_term - vbias_term

    def sample_h_given_v(self, v0_sample):
        ''' This function infers state of hidden units given visible units '''
        # compute the activation of the hidden units given a sample of the visibles
        h1_mean = T.nnet.sigmoid(T.dot(v0_sample, self.W) + self.hbias)
        # get a sample of the hiddens given their activation; dtype is set to
        # floatX because integer samples would not run on GPU
        h1_sample = self.theano_rng.binomial(size=h1_mean.shape, n=1,
                                             p=h1_mean, dtype=theano.config.floatX)
        return [h1_mean, h1_sample]

    def sample_v_given_h(self, h0_sample):
        ''' This function infers state of visible units given hidden units '''
        # compute the activation of the visible units given the hidden sample
        v1_mean = T.nnet.sigmoid(T.dot(h0_sample, self.W.T) + self.vbias)
        # get a sample of the visibles given their activation
        v1_sample = self.theano_rng.binomial(size=v1_mean.shape, n=1,
                                             p=v1_mean, dtype=theano.config.floatX)
        return [v1_mean, v1_sample]

    def gibbs_hvh(self, h0_sample):
        ''' This function implements one step of Gibbs sampling,
            starting from the hidden state '''
        v1_mean, v1_sample = self.sample_v_given_h(h0_sample)
        h1_mean, h1_sample = self.sample_h_given_v(v1_sample)
        return [v1_mean, v1_sample, h1_mean, h1_sample]

    def gibbs_vhv(self, v0_sample):
        ''' This function implements one step of Gibbs sampling,
            starting from the visible state '''
        h1_mean, h1_sample = self.sample_h_given_v(v0_sample)
        v1_mean, v1_sample = self.sample_v_given_h(h1_sample)
        return [h1_mean, h1_sample, v1_mean, v1_sample]

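    # Multiple Gibbs steps (CD-k / PCD-k) are usually chained with theano.scan
    # rather than by calling gibbs_hvh in a Python loop. A sketch, assuming a
    # step count `k` and a starting hidden sample `chain_start` (both names
    # are illustrative, not part of this file):
    #
    #   [nv_means, nv_samples, nh_means, nh_samples], scan_updates = \
    #       theano.scan(self.gibbs_hvh,
    #                   outputs_info=[None, None, None, chain_start],
    #                   n_steps=k)
    #
    # scan_updates carries the RandomStreams updates and must be merged into
    # the updates dictionary passed to theano.function.
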
    def cd(self, lr=0.1, persistent=None):
        """
        This function implements one step of CD-1 or PCD-1.

        :param lr: learning rate used to train the RBM

        :param persistent: None for CD. For PCD, shared variable containing old state
        of Gibbs chain. This must be a shared variable of size (batch size, number of
        hidden units).

        Returns a proxy cost along with the updates dictionary. The dictionary
        contains the update rules for weights and biases, but also an update of
        the shared variable used to store the persistent chain, if one is used.
        """

        # compute positive phase
        ph_mean, ph_sample = self.sample_h_given_v(self.input)

        # decide how to initialize persistent chain:
        # for CD, we use the newly generated hidden sample
        # for PCD, we initialize from the old state of the chain
        if persistent is None:
            chain_start = ph_sample
        else:
            chain_start = persistent

        # perform actual negative phase (one Gibbs step, hidden -> visible -> hidden)
        [nv_mean, nv_sample, nh_mean, nh_sample] = self.gibbs_hvh(chain_start)

        # determine gradients on RBM parameters: positive-phase statistics minus
        # negative-phase statistics, averaged over the minibatch
        g_vbias = T.sum(self.input - nv_mean, axis=0) / self.batch_size
        g_hbias = T.sum(ph_mean - nh_mean, axis=0) / self.batch_size
        g_W = T.dot(ph_mean.T, self.input) / self.batch_size - \
              T.dot(nh_mean.T, nv_mean) / self.batch_size

        gparams = [g_W.T, g_hbias, g_vbias]

        # constructs the update dictionary; the learning rate is cast to floatX
        # so the updates keep the dtype of the parameters
        updates = {}
        for gparam, param in zip(gparams, self.params):
            # gradient *ascent* on the log-likelihood, hence the `+`
            updates[param] = param + gparam * T.cast(lr, dtype=theano.config.floatX)

        if persistent is not None:
            # Note that this works only if persistent is a shared variable
            updates[persistent] = T.cast(nh_sample, dtype=theano.config.floatX)
            # pseudo-likelihood is a better proxy for PCD
            cost = self.get_pseudo_likelihood_cost(updates)
        else:
            # reconstruction cross-entropy is a better proxy for CD
            cost = self.get_reconstruction_cost(updates, nv_mean)

        return cost, updates

    def get_pseudo_likelihood_cost(self, updates):
        """Stochastic approximation to the pseudo-likelihood"""

        # index of bit i in expression p(x_i | x_{\i})
        bit_i_idx = theano.shared(value=0, name='bit_i_idx')

        # binarize the input image by rounding to nearest integer
        xi = T.round(self.input)

        # calculate free energy for the given bit configuration
        fe_xi = self.free_energy(xi)

        # flip bit x_i of matrix xi and preserve all other bits x_{\i};
        # equivalent to xi[:, bit_i_idx] = 1 - xi[:, bit_i_idx]
        xi_flip = T.set_subtensor(xi[:, bit_i_idx], 1 - xi[:, bit_i_idx])

        # calculate free energy with bit flipped
        fe_xi_flip = self.free_energy(xi_flip)

        # T.nnet.sigmoid(fe_xi_flip - fe_xi) equals
        # e^(-FE(x)) / (e^(-FE(x)) + e^(-FE(x_flip))), i.e. p(x_i | x_{\i}).
        # The log-probability of the single tracked bit is scaled by n_visible
        # because, with bits visited round-robin, it stochastically approximates
        # the sum over all n_visible bits of the pseudo-likelihood.
        cost = T.mean(self.n_visible * T.log(T.nnet.sigmoid(fe_xi_flip - fe_xi)))

        # increment bit_i_idx modulo n_visible as part of updates
        updates[bit_i_idx] = (bit_i_idx + 1) % self.n_visible

        return cost

    def get_reconstruction_cost(self, updates, nv_mean):
        """Approximation to the reconstruction error"""

        # mean over the minibatch of the per-example (negated) cross-entropy
        # between the input and its reconstruction; this quantity approaches 0
        # as reconstructions improve, so it is maximized during training
        cross_entropy = T.mean(
            T.sum(self.input * T.log(nv_mean) +
                  (1 - self.input) * T.log(1 - nv_mean), axis=1))

        return cross_entropy
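
if __name__ == '__main__':
    # A minimal usage sketch, not from the original file: train the RBM with
    # PCD-1 on synthetic random binary data. The data, batch size, layer sizes
    # and learning rate below are illustrative assumptions only.
    batch_size = 20
    rng = numpy.random.RandomState(0)
    data = numpy.asarray(rng.uniform(size=(100, 784)) < 0.5,
                         dtype=theano.config.floatX)
    train_set_x = theano.shared(data, name='train_set_x')

    x = T.matrix('x')
    rbm = RBM(input=x, n_visible=784, n_hidden=500)

    # persistent chain for PCD: one hidden state per example in the minibatch
    persistent_chain = theano.shared(
        numpy.zeros((batch_size, rbm.n_hidden), dtype=theano.config.floatX))

    cost, updates = rbm.cd(lr=0.1, persistent=persistent_chain)

    # compile a training function; `givens` substitutes the minibatch for x
    index = T.lscalar('index')
    train_rbm = theano.function([index], cost, updates=updates,
        givens={x: train_set_x[index * batch_size:(index + 1) * batch_size]})

    for batch in xrange(5):
        print 'pseudo-likelihood proxy:', train_rbm(batch)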