1 """This tutorial introduces restricted boltzmann machines (RBM) using Theano.
|
|
2
|
|
3 Boltzmann Machines (BMs) are a particular form of energy-based model which
|
|
4 contain hidden variables. Restricted Boltzmann Machines further restrict BMs
|
|
5 to those without visible-visible and hidden-hidden connections.
|
|
6 """
|
|
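
# For reference, an RBM over binary visible units v and hidden units h, with
# weights W, visible bias b (`vbias`) and hidden bias c (`hbias`), assigns
# the energy
#     E(v, h) = -b'v - c'h - v'Wh
# and models p(v, h) proportional to exp(-E(v, h)). The operations defined
# below (free energy, conditionals, CD updates) all follow from this
# definition.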

import numpy, time, cPickle, gzip, PIL.Image

import theano
import theano.tensor as T
import os

from theano.tensor.shared_randomstreams import RandomStreams

from utils import tile_raster_images
from logistic_sgd import load_data


class RBM(object):
    """Restricted Boltzmann Machine (RBM)"""
    def __init__(self, input=None, n_visible=784, n_hidden=1000,
                 W=None, hbias=None, vbias=None, numpy_rng=None,
                 theano_rng=None):
26 """
|
|
27 RBM constructor. Defines the parameters of the model along with
|
|
28 basic operations for inferring hidden from visible (and vice-versa),
|
|
29 as well as for performing CD updates.
|
|
30
|
|
31 :param input: None for standalone RBMs or symbolic variable if RBM is
|
|
32 part of a larger graph.
|
|
33
|
|
34 :param n_visible: number of visible units
|
|
35
|
|
36 :param n_hidden: number of hidden units
|
|
37
|
|
38 :param W: None for standalone RBMs or symbolic variable pointing to a
|
|
39 shared weight matrix in case RBM is part of a DBN network; in a DBN,
|
|
40 the weights are shared between RBMs and layers of a MLP
|
|
41
|
|
42 :param hbias: None for standalone RBMs or symbolic variable pointing
|
|
43 to a shared hidden units bias vector in case RBM is part of a
|
|
44 different network
|
|
45
|
|
46 :param vbias: None for standalone RBMs or a symbolic variable
|
|
47 pointing to a shared visible units bias
|
|
48 """
|
|
49
|
|
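
        # (Illustrative note, not from the original tutorial) In a DBN one
        # would construct e.g.
        #     RBM(input=sigmoid_layer.output, W=sigmoid_layer.W,
        #         hbias=sigmoid_layer.b)
        # so that the RBM and the corresponding MLP layer literally share
        # the same shared variables; `sigmoid_layer` is an assumed name.
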
        self.n_visible = n_visible
        self.n_hidden = n_hidden

        # create the random number generators first, so that the weight
        # initialization below can be seeded by `numpy_rng`
        if numpy_rng is None:
            numpy_rng = numpy.random.RandomState(1234)

        if theano_rng is None:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        if W is None:
            # W is initialized with `initial_W`, which is uniformly sampled
            # between -sqrt(6./(n_visible+n_hidden)) and
            # sqrt(6./(n_visible+n_hidden)); the output of uniform is
            # converted using asarray to dtype theano.config.floatX so that
            # the code is runnable on GPU
            initial_W = numpy.asarray(numpy_rng.uniform(
                low=-numpy.sqrt(6. / (n_hidden + n_visible)),
                high=numpy.sqrt(6. / (n_hidden + n_visible)),
                size=(n_visible, n_hidden)),
                dtype=theano.config.floatX)
            # theano shared variables for weights and biases
            W = theano.shared(value=initial_W, name='W')

        if hbias is None:
            # create shared variable for hidden units bias
            hbias = theano.shared(value=numpy.zeros(n_hidden,
                dtype=theano.config.floatX), name='hbias')

        if vbias is None:
            # create shared variable for visible units bias
            vbias = theano.shared(value=numpy.zeros(n_visible,
                dtype=theano.config.floatX), name='vbias')

        # initialize input layer for standalone RBM or layer0 of DBN
        self.input = input if input is not None else T.matrix('input')

        self.W = W
        self.hbias = hbias
        self.vbias = vbias
        self.theano_rng = theano_rng
        # **** WARNING: It is not a good idea to put things in this list
        # other than shared variables created in this function.
        self.params = [self.W, self.hbias, self.vbias]
        # symbolic number of examples in a minibatch
        self.batch_size = self.input.shape[0]

    def free_energy(self, v_sample):
        ''' Function to compute the free energy of each visible sample:
        F(v) = -vbias'v - sum_i log(1 + exp(hbias_i + (v W)_i)),
        returned as a vector with one entry per row of `v_sample` '''
        wx_b = T.dot(v_sample, self.W) + self.hbias
        vbias_term = T.dot(v_sample, self.vbias)
        hidden_term = T.sum(T.log(1 + T.exp(wx_b)), axis=1)
        return -hidden_term - vbias_term

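    # Derivation note (added illustration): the free energy is obtained by
    # marginalizing out the hiddens, F(v) = -log sum_h exp(-E(v, h)); since
    # the hidden units are conditionally independent binary variables, the
    # sum over h factorizes into the closed form used in `free_energy`.
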
    def sample_h_given_v(self, v0_sample):
        ''' This function infers state of hidden units given visible units '''
        # compute the activation of the hidden units given a sample of
        # the visibles: p(h_i = 1 | v) = sigmoid(hbias_i + (v W)_i)
        h1_mean = T.nnet.sigmoid(T.dot(v0_sample, self.W) + self.hbias)
        # get a sample of the hiddens given their activation
        h1_sample = self.theano_rng.binomial(size=h1_mean.shape, n=1,
                                             p=h1_mean,
                                             dtype=theano.config.floatX)
        return [h1_mean, h1_sample]

    def sample_v_given_h(self, h0_sample):
        ''' This function infers state of visible units given hidden units '''
        # compute the activation of the visibles given the hidden sample:
        # p(v_j = 1 | h) = sigmoid(vbias_j + (h W')_j)
        v1_mean = T.nnet.sigmoid(T.dot(h0_sample, self.W.T) + self.vbias)
        # get a sample of the visibles given their activation
        v1_sample = self.theano_rng.binomial(size=v1_mean.shape, n=1,
                                             p=v1_mean,
                                             dtype=theano.config.floatX)
        return [v1_mean, v1_sample]

    def gibbs_hvh(self, h0_sample):
        ''' This function implements one step of Gibbs sampling,
        starting from the hidden state '''
        v1_mean, v1_sample = self.sample_v_given_h(h0_sample)
        h1_mean, h1_sample = self.sample_h_given_v(v1_sample)
        return [v1_mean, v1_sample, h1_mean, h1_sample]

    def gibbs_vhv(self, v0_sample):
        ''' This function implements one step of Gibbs sampling,
        starting from the visible state '''
        h1_mean, h1_sample = self.sample_h_given_v(v0_sample)
        v1_mean, v1_sample = self.sample_v_given_h(h1_sample)
        return [h1_mean, h1_sample, v1_mean, v1_sample]

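    # Sketch (an assumption, not part of this file): to run k > 1 Gibbs
    # steps, `gibbs_hvh` could be iterated with theano.scan, recurring on
    # its last output (the hidden sample):
    #
    #     [nv_means, nv_samples, nh_means, nh_samples], scan_updates = \
    #         theano.scan(self.gibbs_hvh,
    #                     outputs_info=[None, None, None, chain_start],
    #                     n_steps=k)
    #
    # `chain_start` and `k` are assumed names; `cd` below performs a single
    # step directly instead.
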
    def cd(self, lr=0.1, persistent=None):
        """
        This function implements one step of CD-1 or PCD-1.

        :param lr: learning rate used to train the RBM

        :param persistent: None for CD. For PCD, shared variable containing
        old state of Gibbs chain. This must be a shared variable of size
        (batch size, number of hidden units).

        Returns the cost and the updates dictionary. The dictionary contains
        the update rules for weights and biases, but also an update of the
        shared variable used to store the persistent chain, if one is used.
        """

        # compute positive phase
        ph_mean, ph_sample = self.sample_h_given_v(self.input)

        # decide how to initialize persistent chain:
        # for CD, we use the newly generated hidden sample
        # for PCD, we initialize from the old state of the chain
        if persistent is None:
            chain_start = ph_sample
        else:
            chain_start = persistent

        # perform actual negative phase
        [nv_mean, nv_sample, nh_mean, nh_sample] = self.gibbs_hvh(chain_start)

        # determine gradients on RBM parameters: stochastic estimates of the
        # log-likelihood gradient <.>_data - <.>_model, with the model-side
        # statistics taken at the end of the (persistent) Gibbs chain
        g_vbias = T.sum(self.input - nv_mean, axis=0) / self.batch_size
        g_hbias = T.sum(ph_mean - nh_mean, axis=0) / self.batch_size
        g_W = T.dot(ph_mean.T, self.input) / self.batch_size - \
              T.dot(nh_mean.T, nv_mean) / self.batch_size

        gparams = [g_W.T, g_hbias, g_vbias]

        # constructs the update dictionary
        updates = {}
        for gparam, param in zip(gparams, self.params):
            # gradient ascent on the (approximate) log-likelihood
            updates[param] = param + gparam * lr

        if persistent is not None:
            # Note that this works only if persistent is a shared variable
            updates[persistent] = T.cast(nh_sample, dtype=theano.config.floatX)
            # pseudo-likelihood is a better proxy for PCD
            cost = self.get_pseudo_likelihood_cost(updates)
        else:
            # reconstruction cross-entropy is a better proxy for CD
            cost = self.get_reconstruction_cost(updates, nv_mean)

        return cost, updates

    def get_pseudo_likelihood_cost(self, updates):
        """Stochastic approximation to the pseudo-likelihood"""

        # index of bit i in expression p(x_i | x_{\i})
        bit_i_idx = theano.shared(value=0, name='bit_i_idx')

        # binarize the input image by rounding to nearest integer
        xi = T.iround(self.input)

        # calculate free energy for the given bit configuration
        fe_xi = self.free_energy(xi)

        # flip bit x_i of matrix xi and preserve all other bits x_{\i};
        # equivalent to xi[:, bit_i_idx] = 1 - xi[:, bit_i_idx]
        xi_flip = T.set_subtensor(xi[:, bit_i_idx], 1 - xi[:, bit_i_idx])

        # calculate free energy with bit flipped
        fe_xi_flip = self.free_energy(xi_flip)

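        # Derivation note (added illustration): with every bit except x_i
        # held fixed,
        #   p(x_i | x_{\i}) = e^{-FE(x)} / (e^{-FE(x)} + e^{-FE(x_flip)})
        #                   = sigmoid(FE(x_flip) - FE(x)),
        # which is the identity used for the cost below.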
        # log p(x_i | x_{\i}) = log sigmoid(FE(x_flip) - FE(x)); multiply by
        # n_visible because only one of the n_visible bits is sampled per call
        cost = T.mean(self.n_visible *
                      T.log(T.nnet.sigmoid(fe_xi_flip - fe_xi)))

        # increment bit_i_idx % number as part of updates
        updates[bit_i_idx] = (bit_i_idx + 1) % self.n_visible

        return cost

    def get_reconstruction_cost(self, updates, nv_mean):
        """Approximation to the reconstruction error. Note that this is the
        mean, over the minibatch, of the negative cross-entropy between the
        input and its mean reconstruction, so larger (less negative) values
        mean better reconstructions."""

        cross_entropy = T.mean(
            T.sum(self.input * T.log(nv_mean) +
                  (1 - self.input) * T.log(1 - nv_mean), axis=1))

        return cross_entropy
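
# ----------------------------------------------------------------------------
# Minimal usage sketch. This demo function is an illustration added to the
# tutorial code, not part of it: the names `_demo_train_rbm`, `x`,
# `persistent_chain` and `train_rbm`, and all constants, are assumptions.
def _demo_train_rbm(batch_size=20, n_hidden=500, lr=0.1):
    """Build a standalone RBM on a symbolic minibatch and compile a PCD-1
    training function (a sketch, mirroring how `cd` is meant to be used)."""
    x = T.matrix('x')
    rbm = RBM(input=x, n_visible=28 * 28, n_hidden=n_hidden)

    # shared variable holding the old state of the persistent Gibbs chain,
    # one row per example in the minibatch
    persistent_chain = theano.shared(numpy.zeros((batch_size, n_hidden),
                                     dtype=theano.config.floatX))

    cost, updates = rbm.cd(lr=lr, persistent=persistent_chain)

    # each call performs one PCD-1 update on a (batch_size, 784) binary
    # array and returns the pseudo-likelihood monitoring cost
    train_rbm = theano.function([x], cost, updates=updates)
    return rbm, train_rbm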