deep/rbm/rbm.py @ 347:9685e9d94cc4

base class for an rbm

author:   goldfinger
date:     Mon, 19 Apr 2010 08:16:56 -0400
parents:
children: d81284e13d77
1 """This tutorial introduces restricted boltzmann machines (RBM) using Theano. | |
2 | |
3 Boltzmann Machines (BMs) are a particular form of energy-based model which | |
4 contain hidden variables. Restricted Boltzmann Machines further restrict BMs | |
5 to those without visible-visible and hidden-hidden connections. | |
6 """ | |


import numpy, time, cPickle, gzip, PIL.Image

import theano
import theano.tensor as T
import os

from theano.tensor.shared_randomstreams import RandomStreams

from utils import tile_raster_images
from logistic_sgd import load_data


class RBM(object):
    """Restricted Boltzmann Machine (RBM)"""
    def __init__(self, input=None, n_visible=784, n_hidden=1000,
                 W=None, hbias=None, vbias=None, numpy_rng=None,
                 theano_rng=None):
26 """ | |
27 RBM constructor. Defines the parameters of the model along with | |
28 basic operations for inferring hidden from visible (and vice-versa), | |
29 as well as for performing CD updates. | |
30 | |
31 :param input: None for standalone RBMs or symbolic variable if RBM is | |
32 part of a larger graph. | |
33 | |
34 :param n_visible: number of visible units | |
35 | |
36 :param n_hidden: number of hidden units | |
37 | |
38 :param W: None for standalone RBMs or symbolic variable pointing to a | |
39 shared weight matrix in case RBM is part of a DBN network; in a DBN, | |
40 the weights are shared between RBMs and layers of a MLP | |
41 | |
42 :param hbias: None for standalone RBMs or symbolic variable pointing | |
43 to a shared hidden units bias vector in case RBM is part of a | |
44 different network | |
45 | |
46 :param vbias: None for standalone RBMs or a symbolic variable | |
47 pointing to a shared visible units bias | |
48 """ | |

        self.n_visible = n_visible
        self.n_hidden = n_hidden

        if numpy_rng is None:
            # create a number generator
            numpy_rng = numpy.random.RandomState(1234)

        if theano_rng is None:
            theano_rng = RandomStreams(numpy_rng.randint(2**30))

        if W is None:
            # W is initialized with `initial_W`, which is uniformly sampled
            # from -6./sqrt(n_visible+n_hidden) to 6./sqrt(n_hidden+n_visible);
            # the output of uniform is converted using asarray to dtype
            # theano.config.floatX so that the code is runnable on GPU
            initial_W = numpy.asarray(numpy_rng.uniform(
                low=-numpy.sqrt(6. / (n_hidden + n_visible)),
                high=numpy.sqrt(6. / (n_hidden + n_visible)),
                size=(n_visible, n_hidden)),
                dtype=theano.config.floatX)
            # theano shared variable for the weights
            W = theano.shared(value=initial_W, name='W')

        if hbias is None:
            # create shared variable for hidden units bias
            hbias = theano.shared(value=numpy.zeros(n_hidden,
                                  dtype=theano.config.floatX), name='hbias')

        if vbias is None:
            # create shared variable for visible units bias
            vbias = theano.shared(value=numpy.zeros(n_visible,
                                  dtype=theano.config.floatX), name='vbias')

        # initialize input layer for standalone RBM or layer0 of DBN
        self.input = input if input is not None else T.dmatrix('input')

        self.W = W
        self.hbias = hbias
        self.vbias = vbias
        self.theano_rng = theano_rng
        # **** WARNING: It is not a good idea to put things in this list
        # other than shared variables created in this function.
        self.params = [self.W, self.hbias, self.vbias]
        self.batch_size = self.input.shape[0]

    def free_energy(self, v_sample):
        ''' Function to compute the free energy '''
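        # F(v) = -vbias'v - sum_j log(1 + exp(hbias_j + (vW)_j)); note that
        # both terms below are also summed over the batch, so this returns
        # the free energy of the whole batch as a single scalar.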
        wx_b = T.dot(v_sample, self.W) + self.hbias
        vbias_term = T.sum(T.dot(v_sample, self.vbias))
        hidden_term = T.sum(T.log(1 + T.exp(wx_b)))
        return -hidden_term - vbias_term

    def sample_h_given_v(self, v0_sample):
        ''' This function infers state of hidden units given visible units '''
        # compute the activation of the hidden units given a sample of the visibles
        h1_mean = T.nnet.sigmoid(T.dot(v0_sample, self.W) + self.hbias)
        # get a sample of the hiddens given their activation
        h1_sample = self.theano_rng.binomial(size=h1_mean.shape, n=1, p=h1_mean)
        return [h1_mean, h1_sample]

    def sample_v_given_h(self, h0_sample):
        ''' This function infers state of visible units given hidden units '''
        # compute the activation of the visible units given the hidden sample
        v1_mean = T.nnet.sigmoid(T.dot(h0_sample, self.W.T) + self.vbias)
        # get a sample of the visibles given their activation
        v1_sample = self.theano_rng.binomial(size=v1_mean.shape, n=1, p=v1_mean)
        return [v1_mean, v1_sample]

    def gibbs_hvh(self, h0_sample):
        ''' This function implements one step of Gibbs sampling,
        starting from the hidden state'''
        v1_mean, v1_sample = self.sample_v_given_h(h0_sample)
        h1_mean, h1_sample = self.sample_h_given_v(v1_sample)
        return [v1_mean, v1_sample, h1_mean, h1_sample]

    def gibbs_vhv(self, v0_sample):
        ''' This function implements one step of Gibbs sampling,
        starting from the visible state'''
        h1_mean, h1_sample = self.sample_h_given_v(v0_sample)
        v1_mean, v1_sample = self.sample_v_given_h(h1_sample)
        return [h1_mean, h1_sample, v1_mean, v1_sample]
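
    # Usage sketch (illustrative, not part of the class API): one compiled
    # Gibbs step.  `data` is an assumed (batch, n_visible) numpy array.
    #
    #     v = T.dmatrix('v')
    #     rbm = RBM(input=v)
    #     gibbs_step = theano.function([v], rbm.gibbs_vhv(v))
    #     h1_mean, h1_sample, v1_mean, v1_sample = gibbs_step(data)
    #
    # The updates of the RandomStreams state are handled automatically by
    # theano.function here.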

    def cd(self, lr=0.1, persistent=None):
        """
        This function implements one step of CD-1 or PCD-1.

        :param lr: learning rate used to train the RBM
        :param persistent: None for CD. For PCD, shared variable containing old state
        of Gibbs chain. This must be a shared variable of size (batch size, number of
        hidden units).

        Returns a proxy for the cost and the updates dictionary. The dictionary
        contains the update rules for the weights and biases, but also an update
        of the shared variable used to store the persistent chain, if one is used.
        """

        # compute positive phase
        ph_mean, ph_sample = self.sample_h_given_v(self.input)

        # decide how to initialize persistent chain:
        # for CD, we use the newly generated hidden sample
        # for PCD, we initialize from the old state of the chain
        if persistent is None:
            chain_start = ph_sample
        else:
            chain_start = persistent

        # perform actual negative phase
        [nv_mean, nv_sample, nh_mean, nh_sample] = self.gibbs_hvh(chain_start)

        # determine gradients on RBM parameters
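        # The CD-1 estimate of the log-likelihood gradient, written out, is
        #   d log p(v) / d W ~ <v'h>_data - <v'h>_model,
        # with the data statistics taken at the positive phase and the model
        # statistics approximated by the single Gibbs step above; the
        # mean-field values ph_mean/nh_mean are used instead of samples to
        # reduce the variance of the estimate.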
        g_vbias = T.sum(self.input - nv_mean, axis=0) / self.batch_size
        g_hbias = T.sum(ph_mean - nh_mean, axis=0) / self.batch_size
        g_W = T.dot(ph_mean.T, self.input) / self.batch_size - \
              T.dot(nh_mean.T, nv_mean) / self.batch_size

        gparams = [g_W.T, g_hbias, g_vbias]

        # constructs the update dictionary
        updates = {}
        for gparam, param in zip(gparams, self.params):
            # gradient ascent on the log-likelihood proxy; cast lr so the
            # update keeps the dtype of the shared parameter
            updates[param] = param + gparam * T.cast(lr, dtype=theano.config.floatX)

        if persistent is not None:
            # Note that this works only if persistent is a shared variable
            updates[persistent] = T.cast(nh_sample, dtype=theano.config.floatX)
            # pseudo-likelihood is a better proxy for PCD
            cost = self.get_pseudo_likelihood_cost(updates)
        else:
            # reconstruction cross-entropy is a better proxy for CD
            cost = self.get_reconstruction_cost(updates, nv_mean)

        return cost, updates

    def get_pseudo_likelihood_cost(self, updates):
        """Stochastic approximation to the pseudo-likelihood"""

        # index of bit i in expression p(x_i | x_{\i})
        bit_i_idx = theano.shared(value=0, name='bit_i_idx')

        # binarize the input image by rounding to nearest integer
        xi = T.iround(self.input)

        # calculate free energy for the given bit configuration
        fe_xi = self.free_energy(xi)

        # flip bit x_i of matrix xi and preserve all other bits x_{\i}
        # equivalent to xi[:, bit_i_idx] = 1 - xi[:, bit_i_idx]
        xi_flip = T.set_subtensor(xi[:, bit_i_idx], 1 - xi[:, bit_i_idx])

        # calculate free energy with bit flipped
        fe_xi_flip = self.free_energy(xi_flip)

        # P(x_i | x_{\i}) = e^(-FE(x)) / (e^(-FE(x)) + e^(-FE(x_flip)))
        #                 = sigmoid(FE(x_flip) - FE(x))
        cost = self.n_visible * T.log(T.nnet.sigmoid(fe_xi_flip - fe_xi))

        # increment bit_i_idx modulo n_visible as part of the updates
        updates[bit_i_idx] = (bit_i_idx + 1) % self.n_visible

        return cost

    def get_reconstruction_cost(self, updates, nv_mean):
        """Approximation to the reconstruction error"""

        cross_entropy = T.mean(
            T.sum(self.input * T.log(nv_mean) +
                  (1 - self.input) * T.log(1 - nv_mean), axis=1))

        return cross_entropy
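

# Usage sketch (illustrative only): training with CD-1 on a binary design
# matrix.  `train_x`, `batch_size` and `n_batches` are assumed names, with
# `train_x` a (n_examples, 784) numpy array such as MNIST from load_data.
#
#     x = T.dmatrix('x')
#     rbm = RBM(input=x, n_visible=784, n_hidden=1000)
#     cost, updates = rbm.cd(lr=0.1)
#     train_rbm = theano.function([x], cost, updates=updates)
#     for epoch in xrange(15):
#         costs = [train_rbm(train_x[i * batch_size:(i + 1) * batch_size])
#                  for i in xrange(n_batches)]
#         print 'epoch %d, mean cost %f' % (epoch, numpy.mean(costs))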