# HG changeset patch
# User Dumitru Erhan <dumitru.erhan@gmail.com>
# Date 1264390413 18000
# Node ID 0fda55a7de99c9b08ce0f36acd56b9f00f2d857c
# Parent  fda5f787baa63270fa9e7cded01700db1a544397
removing files that are not ready yet

diff -r fda5f787baa6 -r 0fda55a7de99 code_tutoriel/convolutional_mlp.py
--- a/code_tutoriel/convolutional_mlp.py	Thu Jan 21 11:26:43 2010 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,230 +0,0 @@
-
-"""
-This tutorial introduces the LeNet5 neural network architecture using Theano.  LeNet5 is a
-convolutional neural network, good for classifying images. This tutorial shows how to build the
-architecture, and comes with all the hyper-parameters you need to reproduce the paper's MNIST
-results.
-
-The best results are obtained after X iterations of the main program loop, which takes ***
-minutes on my workstation (an Intel Core i7, circa July 2009), and *** minutes on my GPU (an
-NVIDIA GTX 285 graphics processor).
-
-This implementation simplifies the model in the following ways:
-
- - LeNetConvPool doesn't implement location-specific gain and bias parameters
-
- - LeNetConvPool doesn't implement pooling by average, it implements pooling by max.
-
- - Digit classification is implemented with a logistic regression rather than an RBF network
-
- - LeNet5 was not fully-connected convolutions at second layer
-
-References:
-
- - Y. LeCun, L. Bottou, Y. Bengio and P. Haffner: Gradient-Based Learning Applied to Document
-   Recognition, Proceedings of the IEEE, 86(11):2278-2324, November 1998.
-   http://yann.lecun.com/exdb/publis/pdf/lecun-98.pdf
-
-
-"""
-import numpy
-from theano.compile.sandbox import shared, pfunc
-from theano import tensor
-from pylearn.shared.layers import LogisticRegression, SigmoidalLayer
-import theano.sandbox.softsign
-import pylearn.datasets.MNIST
-
-
-try:
-    # this tells theano to use the GPU if possible
-    from theano.sandbox.cuda import use
-    use()
-except Exception, e:
-    print('Warning: Attempt to use GPU resulted in error "%s"' % str(e))
-
-class LeNetConvPool(object):
-    """WRITEME 
-
-    Math of what the layer does, and what symbolic variables are created by the class (w, b,
-    output).
-
-    """
-
-    #TODO: implement biases & scales properly. There are supposed to be more parameters.
-    #    - one bias & scale per filter
-    #    - one bias & scale per downsample feature location (a 2d bias)
-    #    - more?
-
-    def __init__(self, rng, input, n_examples, n_imgs, img_shape, n_filters, filter_shape=(5,5),
-            poolsize=(2,2)):
-        """
-        Allocate a LeNetConvPool layer with shared variable internal parameters.
-
-        :param rng: a random number generator used to initialize weights
-        
-        :param input: symbolic images.  Shape: (n_examples, n_imgs, img_shape[0], img_shape[1])
-
-        :param n_examples: input's shape[0] at runtime
-
-        :param n_imgs: input's shape[1] at runtime
-
-        :param img_shape: input's shape[2:4] at runtime
-
-        :param n_filters: the number of filters to apply to the image.
-
-        :param filter_shape: the size of the filters to apply
-        :type filter_shape: pair (rows, cols)
-
-        :param poolsize: the downsampling (pooling) factor
-        :type poolsize: pair (rows, cols)
-        """
-
-        #TODO: make a simpler convolution constructor!!
-        #    - make dx and dy optional
-        #    - why do we have to pass shapes? (Can we make them optional at least?)
-        conv_op = ConvOp((n_imgs,)+img_shape, filter_shape, n_filters, n_examples,
-                dx=1, dy=1, output_mode='valid')
-
-        # - why is poolsize an op parameter here?
-        # - can we just have a maxpool function that creates this Op internally?
-        ds_op = DownsampleFactorMax(poolsize, ignore_border=True)
-
-        # the filter tensor that we will apply is a 4D tensor
-        w_shp = (n_filters, n_imgs) + filter_shape
-
-        # the bias we add is a 1D tensor
-        b_shp = (n_filters,)
-
-        self.w = shared(
-                numpy.asarray(
-                    rng.uniform(
-                        low=-1.0 / numpy.sqrt(filter_shape[0] * filter_shape[1] * n_imgs), 
-                        high=1.0 / numpy.sqrt(filter_shape[0] * filter_shape[1] * n_imgs),
-                        size=w_shp), 
-                    dtype=input.dtype))
-        self.b = shared(
-                numpy.asarray(
-                    rng.uniform(low=-.0, high=0., size=(n_filters,)),
-                    dtype=input.dtype))
-
-        self.input = input
-        conv_out = conv_op(input, self.w)
-        self.output = tensor.tanh(ds_op(conv_out) + b.dimshuffle('x', 0, 'x', 'x'))
-        self.params = [self.w, self.b]
-
-class SigmoidalLayer(object):
-    def __init__(self, input, n_in, n_out):
-        """
-        :param input: a symbolic tensor of shape (n_examples, n_in)
-        :param w: a symbolic weight matrix of shape (n_in, n_out)
-        :param b: symbolic bias terms of shape (n_out,)
-        :param squash: an squashing function
-        """
-        self.input = input
-        self.w = shared(
-                numpy.asarray(
-                    rng.uniform(low=-2/numpy.sqrt(n_in), high=2/numpy.sqrt(n_in),
-                    size=(n_in, n_out)), dtype=input.dtype))
-        self.b = shared(numpy.asarray(numpy.zeros(n_out), dtype=input.dtype))
-        self.output = tensor.tanh(tensor.dot(input, self.w) + self.b)
-        self.params = [self.w, self.b]
-
-class LogisticRegression(object):
-    """WRITEME"""
-
-    def __init__(self, input, n_in, n_out):
-        self.w = shared(numpy.zeros((n_in, n_out), dtype=input.dtype))
-        self.b = shared(numpy.zeros((n_out,), dtype=input.dtype))
-        self.l1=abs(self.w).sum()
-        self.l2_sqr = (self.w**2).sum()
-        self.output=nnet.softmax(theano.dot(input, self.w)+self.b)
-        self.argmax=theano.tensor.argmax(self.output, axis=1)
-        self.params = [self.w, self.b]
-
-    def nll(self, target):
-        """Return the negative log-likelihood of the prediction of this model under a given
-        target distribution.  Passing symbolic integers here means 1-hot.
-        WRITEME
-        """
-        return nnet.categorical_crossentropy(self.output, target)
-
-    def errors(self, target):
-        """Return a vector of 0s and 1s, with 1s on every line that was mis-classified.
-        """
-        if target.ndim != self.argmax.ndim:
-            raise TypeError('target should have the same shape as self.argmax', ('target', target.type,
-                'argmax', self.argmax.type))
-        if target.dtype.startswith('int'):
-            return theano.tensor.neq(self.argmax, target)
-        else:
-            raise NotImplementedError()
-
-def evaluate_lenet5(batch_size=30, n_iter=1000):
-    rng = numpy.random.RandomState(23455)
-
-    mnist = pylearn.datasets.MNIST.train_valid_test()
-
-    ishape=(28,28) #this is the size of MNIST images
-
-    # allocate symbolic variables for the data
-    x = tensor.fmatrix()  # the data is presented as rasterized images
-    y = tensor.lvector()  # the labels are presented as 1D vector of [long int] labels
-
-    # construct the first convolutional pooling layer
-    layer0 = LeNetConvPool.new(rng, input=x.reshape((batch_size,1,28,28)), n_examples=batch_size, 
-            n_imgs=1, img_shape=ishape, 
-            n_filters=6, filter_shape=(5,5), 
-            poolsize=(2,2))
-
-    # construct the second convolutional pooling layer
-    layer1 = LeNetConvPool.new(rng, input=layer0.output, n_examples=batch_size, 
-            n_imgs=6, img_shape=(12,12),
-            n_filters=16, filter_shape=(5,5),
-            poolsize=(2,2))
-
-    # construct a fully-connected sigmoidal layer
-    layer2 = SigmoidalLayer.new(rng, input=layer1.output.flatten(2), n_in=16*16, n_out=128) # 128 ?
-
-    # classify the values of the fully-connected sigmoidal layer
-    layer3 = LogisticRegression.new(input=layer2.output, n_in=128, n_out=10)
-
-    # the cost we minimize during training is the NLL of the model
-    cost = layer3.nll(y).mean()
-
-    # create a function to compute the mistakes that are made by the model
-    test_model = pfunc([x,y], layer3.errors(y))
-
-    # create a list of all model parameters to be fit by gradient descent
-    params = layer3.params+ layer2.params+ layer1.params + layer0.params
-    learning_rate = numpy.asarray(0.01, dtype='float32')
-
-    # train_model is a function that updates the model parameters by SGD
-    train_model = pfunc([x, y], cost, 
-            updates=[(p, p - learning_rate*gp) for p,gp in zip(params, tensor.grad(cost, params))])
-
-    # IS IT MORE SIMPLE TO USE A MINIMIZER OR THE DIRECT CODE?
-
-    best_valid_score = float('inf')
-    for i in xrange(n_iter):
-        for j in xrange(len(mnist.train.x)/batch_size):
-            cost_ij = train_model(
-                    mnist.train.x[j*batch_size:(j+1)*batch_size],
-                    mnist.train.y[j*batch_size:(j+1)*batch_size])
-            #if 0 == j % 100:
-                #print('epoch %i:%i, training error %f' % (i, j*batch_size, cost_ij))
-        valid_score = numpy.mean([test_model(
-                    mnist.valid.x[j*batch_size:(j+1)*batch_size],
-                    mnist.valid.y[j*batch_size:(j+1)*batch_size])
-                for j in xrange(len(mnist.valid.x)/batch_size)])
-        print('epoch %i, validation error %f' % (i, valid_score))
-        if valid_score < best_valid_score:
-            best_valid_score = valid_score
-            test_score = numpy.mean([test_model(
-                        mnist.test.x[j*batch_size:(j+1)*batch_size],
-                        mnist.test.y[j*batch_size:(j+1)*batch_size])
-                    for j in xrange(len(mnist.test.x)/batch_size)])
-            print('epoch %i, test error of best model %f' % (i, test_score))
-
-if __name__ == '__main__':
-    evaluate_lenet5()
-
diff -r fda5f787baa6 -r 0fda55a7de99 code_tutoriel/dae.py
--- a/code_tutoriel/dae.py	Thu Jan 21 11:26:43 2010 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,240 +0,0 @@
-"""
- This tutorial introduces denoising auto-encoders using Theano. 
-
- Denoising autoencoders can be used as building blocks for deep networks. 
- They are based on auto-encoders as the ones used in Bengio et al. 2007.
- An autoencoder takes an input x and first maps it to a hidden representation
- y = f_{\theta}(x) = s(Wx+b), parameterized by \theta={W,b}. The resulting 
- latent representation y is then mapped back to a "reconstructed" vector 
- z \in [0,1]^d in input space z = g_{\theta'}(y) = s(W'y + b').  The weight 
- matrix W' can optionally be constrained such that W' = W^T, in which case 
- the autoencoder is said to have tied weights. The network is trained such 
- that to minimize the reconstruction error (the error between x and z).
-
- For the denosing autoencoder, during training, first x is corrupted into 
- \tilde{x}, where \tilde{x} is a partially destroyed version of x by means 
- of a stochastic mapping. Afterwards y is computed as before (using 
- \tilde{x}), y = s(W\tilde{x} + b) and z as s(W'y + b'). The reconstruction 
- error is now measured between z and the uncorrupted input x, which is 
- computed as the cross-entropy : 
-      - \sum_{k=1}^d[ x_k \log z_k + (1-x_k) \log( 1-z_k)]
- 
- For X iteration of the main program loop it takes *** minutes on an 
- Intel Core i7 and *** minutes on GPU (NVIDIA GTX 285 graphics processor).
-
-
- References :
-   - P. Vincent, H. Larochelle, Y. Bengio, P.A. Manzagol: Extracting and 
-   Composing Robust Features with Denoising Autoencoders, ICML'08, 1096-1103,
-   2008
-   - Y. Bengio, P. Lamblin, D. Popovici, H. Larochelle: Greedy Layer-Wise
-   Training of Deep Networks, Advances in Neural Information Processing 
-   Systems 19, 2007
-
-"""
-
-import numpy 
-from theano import tensor
-from theano.compile.sandbox import shared, pfunc
-from theano.compile.sandbox.shared_randomstreams import RandomStreams
-from theano.tensor import nnet
-import pylearn.datasets.MNIST
-
-
-try:
-    #this tells theano to use the GPU if possible
-    from theano.sandbox.cuda import use
-    use()
-except Exception,e:
-    print ('Warning: Attempt to use GPU resulted in error "%s"'%str(e))
-
-
-def load_mnist_batches(batch_size):
-    """
-    We should remove the dependency on pylearn.datasets.MNIST .. and maybe
-    provide a pickled version of the dataset.. 
-    """
-    mnist = pylearn.datasets.MNIST.train_valid_test()
-    train_batches = [(mnist.train.x[i:i+batch_size],mnist.train.y[i:i+batch_size])
-            for i in xrange(0, len(mnist.train.x), batch_size)]
-    valid_batches = [(mnist.valid.x[i:i+batch_size], mnist.valid.y[i:i+batch_size])
-            for i in xrange(0, len(mnist.valid.x), batch_size)]
-    test_batches = [(mnist.test.x[i:i+batch_size], mnist.test.y[i:i+batch_size])
-            for i in xrange(0, len(mnist.test.x), batch_size)]
-    return train_batches, valid_batches, test_batches
-
-
-
-
-class DAE():
-  """Denoising Auto-Encoder class 
-
-  A denoising autoencoders tried to reconstruct the input from a corrupted 
-  version of it by projecting it first in a latent space and reprojecting 
-  it in the input space. Please refer to Vincent et al.,2008 for more 
-  details. If x is the input then equation (1) computes a partially destroyed
-  version of x by means of a stochastic mapping q_D. Equation (2) computes 
-  the projection of the input into the latent space. Equation (3) computes 
-  the reconstruction of the input, while equation (4) computes the 
-  reconstruction error.
-  
-  .. latex-eqn:
-    \tilde{x} ~ q_D(\tilde{x}|x)                                         (1)
-    y = s(W \tilde{x} + b)                                               (2)
-    x = s(W' y  + b')                                                    (3)
-    L(x,z) = -sum_{k=1}^d [x_k \log z_k + (1-x_k) \log( 1-z_k)]          (4)
-
-  Tricks and thumbrules for DAE 
-     - learning rate should be used in a logarithmic scale ...
-  """
-
-  def __init__(self, n_visible= 784, n_hidden= 500, lr= 1e-1, input= None):
-    """
-    Initialize the DAE class by specifying the number of visible units (the 
-    dimension d of the input ), the number of hidden units ( the dimension 
-    d' of the latent or hidden space ), a initial value for the learning rate
-    and by giving a symbolic description of the input. Such a symbolic 
-    description is of no importance for the simple DAE and therefore can be 
-    ignored. This feature is useful when stacking DAEs, since the input of 
-    intermediate layers can be symbolically described in terms of the hidden
-    units of the previous layer. See the tutorial on SDAE for more details.
-    
-    :param n_visible: number of visible units
-    :param n_hidden:  number of hidden units
-    :param lr:        a initial value for the learning rate
-    :param input:     a symbolic description of the input or None 
-    """
-    self.n_visible = n_visible
-    self.n_hidden  = n_hidden
-    
-    # create a Theano random generator that gives symbolic random values
-    theano_rng = RandomStreams( seed = 1234 )
-    # create a numpy random generator
-    numpy_rng = numpy.random.RandomState( seed = 52432 )
-    
-     
-    # initial values for weights and biases
-    # note : W' was written as W_prime and b' as b_prime
-    initial_W       = numpy_rng.uniform(size = (n_visible, n_hidden))
-    # transform W such that all values are between -.01 and .01
-    initial_W       = (initial_W*2.0       - 1.0)*.01 
-    initial_b       = numpy.zeros(n_hidden)
-    initial_W_prime = numpy_rng.uniform(size = (n_hidden, n_visible))
-    # transform W_prime such that all values are between -.01 and .01
-    initial_W_prime = (initial_W_prime*2.0 - 1.0)*.01 
-    initial_b_prime= numpy.zeros(n_visible)
-     
-    
-    # theano shared variables for weights and biases
-    self.W       = shared(value = initial_W      , name = "W")
-    self.b       = shared(value = initial_b      , name = "b")
-    self.W_prime = shared(value = initial_W_prime, name = "W'") 
-    self.b_prime = shared(value = initial_b_prime, name = "b'")
-
-    # theano shared variable for the learning rate 
-    self.lr      = shared(value = lr             , name = "learning_rate")
-      
-    # if no input is given generate a variable representing the input
-    if input == None : 
-        # we use a matrix because we expect a minibatch of several examples,
-        # each example being a row
-        x = tensor.dmatrix(name = 'input') 
-    else:
-        x = input
-    # Equation (1)
-    # note : first argument of theano.rng.binomial is the shape(size) of 
-    #        random numbers that it should produce
-    #        second argument is the number of trials 
-    #        third argument is the probability of success of any trial
-    #
-    #        this will produce an array of 0s and 1s where 1 has a 
-    #        probability of 0.9 and 0 if 0.1
-    tilde_x  = theano_rng.binomial( x.shape,  1,  0.9) * x
-    # Equation (2)
-    # note  : y is stored as an attribute of the class so that it can be 
-    #         used later when stacking DAEs. 
-    self.y   = nnet.sigmoid(tensor.dot(tilde_x, self.W      ) + self.b)
-    # Equation (3)
-    z        = nnet.sigmoid(tensor.dot(self.y,  self.W_prime) + self.b_prime)
-    # Equation (4)
-    L = - tensor.sum( x*tensor.log(z) + (1-x)*tensor.log(1-z), axis=1 ) 
-    # note : L is now a vector, where each element is the cross-entropy cost 
-    #        of the reconstruction of the corresponding example of the 
-    #        minibatch. We need to sum all these to get the cost of the
-    #        minibatch
-    cost = tensor.sum(L)
-    # parameters with respect to whom we need to compute the gradient
-    self.params = [ self.W, self.b, self.W_prime, self.b_prime]
-    # use theano automatic differentiation to get the gradients
-    gW, gb, gW_prime, gb_prime = tensor.grad(cost, self.params)
-    # update the parameters in the direction of the gradient using the 
-    # learning rate
-    updated_W       = self.W       - gW       * self.lr
-    updated_b       = self.b       - gb       * self.lr
-    updated_W_prime = self.W_prime - gW_prime * self.lr
-    updated_b_prime = self.b_prime - gb_prime * self.lr
-
-    # defining the function that evaluate the symbolic description of 
-    # one update step 
-    self.update  = pfunc(params = [x], outputs = cost, updates = 
-                                { self.W       : updated_W, 
-                                  self.b       : updated_b,
-                                  self.W_prime : updated_W_prime,
-                                  self.b_prime : updated_b_prime } )
-    self.get_cost = pfunc(params = [x], outputs = cost)
-
-
-
-
-
-
-
-
-
-   
-
-def train_DAE_mnist():
-  """
-  Trains a DAE on the MNIST dataset (http://yann.lecun.com/exdb/mnist)
-  """
-
-  # load dataset as batches  
-  train_batches,valid_batches,test_batches=load_mnist_batches(batch_size=16)
-
-  # Create a denoising auto-encoders with 28*28 = 784 input units, and 500
-  # units in the hidden layer (latent layer); Learning rate is set to 1e-1
-  dae = DAE( n_visible = 784,  n_hidden = 500, lr = 1e-2)
-
-  # Number of iterations (epochs) to run
-  n_iter = 30
-  best_valid_score = float('inf')
-  test_score       = float('inf')
-  for i in xrange(n_iter):
-    # train once over the dataset
-    for x,y in train_batches:
-        cost = dae.update(x)
-     
-    # compute validation error
-    valid_cost = 0.
-    for x,y in valid_batches:
-        valid_cost = valid_cost + dae.get_cost(x)
-    valid_cost = valid_cost / len(valid_batches)
-    print('epoch %i, validation reconstruction error %f '%(i,valid_cost))
-
-    if valid_cost < best_valid_score :
-        best_valid_score = valid_cost
-        # compute test error !?
-        test_score = 0.
-        for x,y in test_batches:
-            test_score = test_score + dae.get_cost(x)
-        test_score = test_score / len(test_batches)
-        print('epoch %i, test error of best model %f' % (i, test_score))
-    
-  print('Optimization done. Best validation score %f, test performance %f' %
-            (best_valid_score, test_score))
-
-
-
-if __name__ == "__main__":
-    train_DAE_mnist()
-
diff -r fda5f787baa6 -r 0fda55a7de99 code_tutoriel/dbn.py
--- a/code_tutoriel/dbn.py	Thu Jan 21 11:26:43 2010 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,24 +0,0 @@
-import numpy
-import theano
-import theano.tensor as T
-
-from deeplearning import rbm
-
-class DBN():
-
-    def __init__(self, vsize=None, hsizes=[], lr=None, bsize=10, seed=123):
-        assert vsize and hsizes and lr
-
-        input = T.dmatrix('global_input')
-
-        self.layers = []
-        for hsize in hsizes:
-            r = rbm.RBM(input=input, vsize=vsize, hsize=hsize, bsize=bsize,
-                        lr=lr, seed=seed)
-            self.layers.append(r)
-
-            # configure inputs for subsequent layer
-            input = self.layers[-1].hid
-            vsize = hsize
-
-
diff -r fda5f787baa6 -r 0fda55a7de99 code_tutoriel/rbm.py
--- a/code_tutoriel/rbm.py	Thu Jan 21 11:26:43 2010 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,133 +0,0 @@
-import numpy
-import theano
-import theano.tensor as T
-
-from theano.compile.sandbox.sharedvalue import shared
-from theano.compile.sandbox.pfunc import pfunc
-from theano.compile.sandbox.shared_randomstreams import RandomStreams
-from theano.tensor.nnet import sigmoid
-
-class A():
-
-    @execute
-    def propup();
-        # do symbolic prop
-        self.hid = T.dot(
-
-class RBM():
-
-    def __init__(self, input=None, vsize=None, hsize=None, bsize=10, lr=1e-1, seed=123):
-        """ 
-        RBM constructor. Defines the parameters of the model along with
-        basic operations for inferring hidden from visible (and vice-versa), as well
-        as for performing CD updates.
-        param input: None for standalone RBMs or symbolic variable if RBM is
-                     part of a larger graph.
-        param vsize: number of visible units
-        param hsize: number of hidden units
-        param bsize: size of minibatch
-        param lr: unsupervised learning rate
-        param seed: seed for random number generator
-        """
-        assert vsize and hsize
-
-        self.vsize = vsize
-        self.hsize = hsize
-        self.lr = shared(lr, 'lr')
-        
-        # setup theano random number generator
-        self.random = RandomStreams(seed)
-       
-        #### INITIALIZATION ####
-
-        # initialize input layer for standalone RBM or layer0 of DBN
-        self.input = input if input else T.dmatrix('input')
-        # initialize biases
-        self.b = shared(numpy.zeros(vsize), 'b')
-        self.c = shared(numpy.zeros(hsize), 'c')
-        # initialize random weights
-        rngseed = numpy.random.RandomState(seed).randint(2**30)
-        rng = numpy.random.RandomState(rngseed)
-        ubound = 1./numpy.sqrt(max(self.vsize,self.hsize))
-        self.w = shared(rng.uniform(low=-ubound, high=ubound, size=(hsize,vsize)), 'w')
-      
-
-        #### POSITIVE AND NEGATIVE PHASE ####
-
-        # define graph for positive phase
-        ph, ph_s = self.def_propup(self.input)
-        # function which computes p(h|v=x) and ~ p(h|v=x)
-        self.pos_phase = pfunc([self.input], [ph, ph_s])
-
-        # define graph for negative phase
-        nv, nv_s = self.def_propdown(ph_s)
-        nh, nh_s = self.def_propup(nv_s)
-        # function which computes p(v|h=ph_s), ~ p(v|h=ph_s) and p(h|v=nv_s)
-        self.neg_phase = pfunc([ph_s], [nv, nv_s, nh, nh_s])
-        
-        # calculate CD gradients for each parameter
-        db = T.mean(self.input, axis=0) - T.mean(nv, axis=0)
-        dc = T.mean(ph, axis=0) - T.mean(nh, axis=0)
-        dwp = T.dot(ph.T, self.input)/nv.shape[0]
-        dwn = T.dot(nh.T, nv)/nv.shape[0]
-        dw = dwp - dwn
-
-        # define dictionary of stochastic gradient update equations
-        updates = {self.b: self.b - self.lr * db,
-                   self.c: self.c - self.lr * dc,
-                   self.w: self.w - self.lr * dw}
-
-        # define private function, which performs one step in direction of CD gradient
-        self.cd_step = pfunc([self.input, ph, nv, nh], [], updates=updates)
-
-
-    def def_propup(self, vis):
-        """ Symbolic definition of p(hid|vis) """
-        hid_activation = T.dot(vis, self.w.T) + self.c
-        hid = sigmoid(hid_activation)
-        hid_sample = self.random.binomial(T.shape(hid), 1, hid)*1.0
-        return hid, hid_sample
-    
-    def def_propdown(self, hid):
-        """ Symbolic definition of p(vis|hid) """
-        vis_activation = T.dot(hid, self.w) + self.b
-        vis = sigmoid(vis_activation)
-        vis_sample = self.random.binomial(T.shape(vis), 1, vis)*1.0
-        return vis, vis_sample
-
-    def cd(self, x, k=1):
-        """ Performs actual CD update """
-        ph, ph_s = self.pos_phase(x)
-        
-        nh_s = ph_s
-        for ki in range(k):
-            nv, nv_s, nh, nh_s = self.neg_phase(nh_s)
-
-        self.cd_step(x, ph, nv_s, nh)
-
-
-
-import os
-from pylearn.datasets import MNIST
-
-if __name__ == '__main__':
-
-    bsize = 10
-
-    # initialize dataset
-    dataset = MNIST.first_1k() 
-    # initialize RBM with 784 visible units and 500 hidden units
-    r = RBM(vsize=784, hsize=500, bsize=bsize, lr=0.1)
-
-    # for a fixed number of epochs ...
-    for e in range(10):
-
-        print '@epoch %i ' % e
-
-        # iterate over all training set mini-batches
-        for i in range(len(dataset.train.x)/bsize):
-
-            rng = range(i*bsize,(i+1)*bsize) # index range of subsequent mini-batch
-            x = dataset.train.x[rng]         # next mini-batch
-            r.cd(x)                          # perform cd update
-