changeset 633:13baba8a4522

merge
author Yoshua Bengio <bengioy@iro.umontreal.ca>
date Sat, 19 Mar 2011 22:51:40 -0400
parents 5541056d3fb0 (current diff) 510220effb14 (diff)
children d2d7ce0f0942
files
diffstat 27 files changed, 6117 insertions(+), 61 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_generation/mnist_resized/rescale_mnist.py	Sat Mar 19 22:51:40 2011 -0400
@@ -0,0 +1,46 @@
+import numpy,cPickle,gzip,Image,pdb,sys
+
+
+def zeropad(vect,img_size=(28,28),out_size=(32,32)):
+    """Embed a flattened image in a zero-filled canvas of size out_size.
+
+    vect     -- array with img_size[0]*img_size[1] values (reshaped to img_size)
+    img_size -- 2-tuple, shape of the input image
+    out_size -- 2-tuple, shape of the padded output
+
+    Returns the padded image flattened to 1-D.  The image is centered; when
+    the size difference is odd the extra row/column of zeros goes after the
+    image (bottom/right).
+    """
+    # Width of the leading border along each axis (floor of half the gap).
+    top = int(numpy.abs(img_size[0]-out_size[0]))//2
+    left = int(numpy.abs(img_size[1]-out_size[1]))//2
+    newvect = numpy.zeros(out_size)
+    # Explicit end indices: a `d:-d` slice is empty when d == 0 and has the
+    # wrong extent when the size difference is odd.
+    newvect[top:top+img_size[0],left:left+img_size[1]] = vect.reshape(img_size)
+    return newvect.flatten()
+
+def rescale(vect,img_size=(28,28),out_size=(32,32), filter=Image.NEAREST):
+    """Resize a flattened image with PIL and return it flattened again.
+
+    vect     -- array reshaped to img_size; values are multiplied by 255 and
+                cast to uint8 before being handed to PIL
+    img_size -- shape used to reshape vect
+    out_size -- size passed to Image.resize
+    filter   -- PIL resampling filter passed to Image.resize
+
+    Returns a flat float32 array with the resized pixels divided by 255.
+    """
+    pixels = numpy.asarray(vect.reshape(img_size)*255.,dtype='uint8')
+    resized = Image.fromarray(pixels).resize(out_size,filter)
+    scaled = numpy.asarray(resized,dtype='float32')/255.
+    return scaled.flatten()
+
+ 
+#pdb.set_trace()
+def rescale_mnist(newsize=(32,32),output_file='mnist_rescaled_32_32.pkl',mnist=None):
+    """Resize every input image of an MNIST-style dataset and pickle the result.
+
+    newsize     -- target size, forwarded to rescale() as out_size
+    output_file -- path of the pickle file to write
+    mnist       -- sequence of (inputs, targets) pairs; when None (the
+                   default) it is unpickled from 'mnist.pkl.gz' in the
+                   current directory
+
+    Input images are assumed to have rescale()'s default img_size.  Targets
+    are passed through unchanged.  The index of each processed example is
+    printed to stdout as progress.
+    """
+    if mnist is None:
+        # Load lazily: a default evaluated in the signature would unpickle
+        # the whole dataset at definition time, even when the caller
+        # supplies its own data.
+        mnist = cPickle.load(gzip.open('mnist.pkl.gz'))
+    newmnist = []
+    for subset in mnist:
+        inputs, targets = subset[0], subset[1]
+        newset=numpy.zeros((len(inputs),newsize[0]*newsize[1]))
+        for i in xrange(len(inputs)):
+            print i,
+            sys.stdout.flush()
+            newset[i] = rescale(inputs[i],out_size=newsize)
+        newmnist.append((newset,targets))
+    # protocol=-1 selects a binary pickle format, so write in binary mode.
+    out = open(output_file,'wb')
+    try:
+        cPickle.dump(newmnist,out,protocol=-1)
+    finally:
+        out.close()
+    print 'Done rescaling'
+
+
+def zeropad_mnist(newsize=(32,32),output_file='mnist_zeropadded_32_32.pkl',mnist=None):
+    """Zero-pad every input image of an MNIST-style dataset and pickle the result.
+
+    newsize     -- target size, forwarded to zeropad() as out_size
+    output_file -- path of the pickle file to write
+    mnist       -- sequence of (inputs, targets) pairs; when None (the
+                   default) it is unpickled from 'mnist.pkl.gz' in the
+                   current directory
+
+    Input images are assumed to have zeropad()'s default img_size.  Targets
+    are passed through unchanged.  The index of each processed example is
+    printed to stdout as progress.
+    """
+    if mnist is None:
+        # Load lazily: a default evaluated in the signature would unpickle
+        # the whole dataset at definition time, even when the caller
+        # supplies its own data.
+        mnist = cPickle.load(gzip.open('mnist.pkl.gz'))
+    newmnist = []
+    for subset in mnist:
+        inputs, targets = subset[0], subset[1]
+        newset=numpy.zeros((len(inputs),newsize[0]*newsize[1]))
+        for i in xrange(len(inputs)):
+            print i,
+            sys.stdout.flush()
+            newset[i] = zeropad(inputs[i],out_size=newsize)
+        newmnist.append((newset,targets))
+    # protocol=-1 selects a binary pickle format, so write in binary mode.
+    out = open(output_file,'wb')
+    try:
+        cPickle.dump(newmnist,out,protocol=-1)
+    finally:
+        out.close()
+    print 'Done padding'
+
+if __name__ =='__main__':
+    print 'Creating resized datasets'
+    # Read MNIST once and pass it explicitly to the converter; close the
+    # gzip handle as soon as the pickle has been loaded.
+    mnist_file = gzip.open('mnist.pkl.gz')
+    try:
+        mnist_ds = cPickle.load(mnist_file)
+    finally:
+        mnist_file.close()
+    # Enable the next line to also write the zero-padded variant.
+    #zeropad_mnist(mnist=mnist_ds)
+    rescale_mnist(mnist=mnist_ds)
+    print 'Finished.'
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_generation/pipeline/filter_nist.py	Sat Mar 19 22:51:40 2011 -0400
@@ -0,0 +1,62 @@
+import numpy
+from pylearn.io import filetensor as ft
+from ift6266 import datasets
+from ift6266.datasets.ftfile import FTDataSet
+
+dataset_str = 'P07_' # other prefixes: NISTP, 'P07safe_'
+
+# Training shards are read from base_path and the digit-only copies are
+# written under base_output_path.
+# (alternative input location: '/data/lisatmp/ift6266h10/data/'+dataset_str)
+base_path = '/data/lisa/data/ift6266h10/data/'+dataset_str
+base_output_path = '/data/lisatmp/ift6266h10/data/transformed_digits/'+dataset_str+'train'
+
+for fileno in range(100):
+    print "Processing file no ", fileno
+
+    shard_prefix = base_path+'train'+str(fileno)
+    output_data_file = base_output_path+str(fileno)+'_data.ft'
+    output_labels_file = base_output_path+str(fileno)+'_labels.ft'
+
+    print "Reading from ",shard_prefix+'_data.ft'
+
+    # Only ds.train() is consumed below.  No dtype conversion or input
+    # scaling is requested, so the data is kept as stored.
+    # NOTE(review): dataset_str already ends with '_', so the test/valid
+    # paths contain a double underscore -- confirm those files exist.
+    ds = FTDataSet(train_data = [shard_prefix+'_data.ft'],
+                   train_lbl = [shard_prefix+'_labels.ft'],
+                   test_data = [base_path+'_test_data.ft'],
+                   test_lbl = [base_path+'_test_labels.ft'],
+                   valid_data = [base_path+'_valid_data.ft'],
+                   valid_lbl = [base_path+'_valid_labels.ft'])
+
+    all_x = []
+    all_y = []
+    all_count = 0
+
+    # Walk the shard one example at a time, keeping labels 0..9 only.
+    for mb_x,mb_y in ds.train(1):
+        all_count += 1
+        if all_count % 100000 == 0:
+            print "Done next 100k"
+
+        if mb_y[0] <= 9:
+            all_x.append(mb_x[0])
+            all_y.append(mb_y[0])
+
+    # data is stored as uint8 on 0-255
+    merged_x = numpy.asarray(all_x, dtype=numpy.uint8)
+    merged_y = numpy.asarray(all_y, dtype=numpy.int32)
+
+    print "Kept", len(all_x), "(shape ", merged_x.shape, ") examples from", all_count
+
+    # Write the inputs first, then the labels, each to its own file.
+    for out_path, tensor in ((output_data_file, merged_x),
+                             (output_labels_file, merged_y)):
+        f = open(out_path, 'wb')
+        ft.write(f, tensor)
+        f.close()
+    
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_generation/pipeline/visualize_filtered.py	Sat Mar 19 22:51:40 2011 -0400
@@ -0,0 +1,43 @@
+import numpy
+import pylab
+from pylearn.io import filetensor as ft
+from ift6266 import datasets
+from ift6266.datasets.ftfile import FTDataSet
+
+import time
+import matplotlib.cm as cm
+
+
+dataset_str = 'P07safe_' # other prefixes: 'PNIST07_', NISTP
+
+base_path = '/data/lisatmp/ift6266h10/data/'+dataset_str
+base_output_path = '/data/lisatmp/ift6266h10/data/transformed_digits/'+dataset_str+'train'
+
+# Index of the filtered shard to look at.
+fileno = 15
+
+output_data_file = base_output_path+str(fileno)+'_data.ft'
+output_labels_file = base_output_path+str(fileno)+'_labels.ft'
+
+# The filtered shard is loaded as the training set; no dtype conversion or
+# scaling is requested, so the data is kept as stored.
+dataset_obj = lambda maxsize=None, min_file=0, max_file=100: \
+                FTDataSet(train_data = [output_data_file],
+                   train_lbl = [output_labels_file],
+                   test_data = [base_path+'_test_data.ft'],
+                   test_lbl = [base_path+'_test_labels.ft'],
+                   valid_data = [base_path+'_valid_data.ft'],
+                   valid_lbl = [base_path+'_valid_labels.ft'])
+
+dataset = dataset_obj()
+train_ds = dataset.train(1)
+
+# Range of example indices to display.
+first_shown = 2900
+last_shown = 2983
+
+pylab.ion()
+for i in range(last_shown):
+    # Always advance the iterator, including for skipped examples; skipping
+    # before calling next() would display the first examples of the file
+    # instead of the requested range.
+    ex = train_ds.next()
+    if i < first_shown:
+        continue
+    pylab.clf()
+    pylab.imshow(ex[0].reshape(32,32),cmap=cm.gray)
+    pylab.draw()
+    time.sleep(0.5)
+
--- a/datasets/ftfile.py	Sat Mar 19 22:49:33 2011 -0400
+++ b/datasets/ftfile.py	Sat Mar 19 22:51:40 2011 -0400
@@ -1,8 +1,12 @@
+from itertools import izip
+import os
+
+import numpy
 from pylearn.io.filetensor import _read_header, _prod
-import numpy, theano
+
 from dataset import DataSet
 from dsetiter import DataIterator
-from itertools import izip, imap
+
 
 class FTFile(object):
     def __init__(self, fname, scale=1, dtype=None):
@@ -10,8 +14,16 @@
         Tests:
             >>> f = FTFile('/data/lisa/data/nist/by_class/digits/digits_test_labels.ft')
         """
-        self.file = open(fname, 'rb')
-        self.magic_t, self.elsize, _, self.dim, _ = _read_header(self.file, False)
+        if os.path.exists(fname):
+            self.file = open(fname, 'rb')
+            self.magic_t, self.elsize, _, self.dim, _ = _read_header(self.file, False)
+            self.gz=False
+        else:
+            import gzip
+            self.file = gzip.open(fname+'.gz','rb')
+            self.magic_t, self.elsize, _, self.dim, _ = _read_header(self.file, False, True)
+            self.gz=True
+
         self.size = self.dim[0]
         self.scale = scale
         self.dtype = dtype
@@ -81,7 +93,11 @@
             num = self.size
         self.dim[0] = num
         self.size -= num
-        res = numpy.fromfile(self.file, dtype=self.magic_t, count=_prod(self.dim)).reshape(self.dim)
+        if self.gz:
+            d = self.file.read(_prod(self.dim)*self.elsize)
+            res = numpy.fromstring(d, dtype=self.magic_t, count=_prod(self.dim)).reshape(self.dim)
+        else:
+            res = numpy.fromfile(self.file, dtype=self.magic_t, count=_prod(self.dim)).reshape(self.dim)
         if self.dtype is not None:
             res = res.astype(self.dtype)
         if self.scale != 1:
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/deep/deep_mlp/job.py	Sat Mar 19 22:51:40 2011 -0400
@@ -0,0 +1,335 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+'''
+Launching
+
+jobman sqlschedules postgres://ift6266h10@gershwin/ift6266h10_sandbox_db/mlp_dumi mlp_jobman.experiment mlp_jobman.conf
+'n_hidden={{500,1000,2000}}'
+'n_hidden_layers={{2,3}}'
+'train_on={{NIST,NISTP,P07}}'
+'train_subset={{DIGITS_ONLY,ALL}}'
+'learning_rate_log10={{-1.,-2.,-3.}}'
+
+in mlp_jobman.conf:
+rng_seed=1234
+L1_reg=0.0
+L2_reg=0.0
+n_epochs=10
+minibatch_size=20
+'''
+
+import os, sys, copy, operator, time
+import theano
+import theano.tensor as T
+import numpy
+from mlp import MLP
+from ift6266 import datasets
+from pylearn.io.seriestables import *
+import tables
+from jobman.tools import DD
+
+# Number of input units handed to the MLP (n_in).
+N_INPUTS = 32*32
+# reduce_every value used for the accumulated "training_error" series.
+REDUCE_EVERY = 250
+
+# Set to True by run_test() to shorten the training loop.
+TEST_RUN = False
+
+# Hyper-parameters used by run_test().
+TEST_HP = DD(dict(n_hidden=200,
+                  n_hidden_layers=2,
+                  train_on='NIST',
+                  train_subset='ALL',
+                  learning_rate_log10=-2,
+                  rng_seed=1234,
+                  L1_reg=0.0,
+                  L2_reg=0.0,
+                  n_epochs=2,
+                  minibatch_size=20))
+
+###########################################
+# digits datasets
+# nist_digits is already in NIST_PATH and in ift6266.datasets
+# NOTE: for these datasets the test and valid sets are wrong
+#   (don't correspond to the training set... they're just placeholders)
+
+from ift6266.datasets.defs import NIST_PATH, DATA_PATH
+# FTDataSet builds the digit-only datasets defined below.
+from ift6266.datasets.ftfile import FTDataSet
+TRANSFORMED_DIGITS_PATH = '/data/lisatmp/ift6266h10/data/transformed_digits'
+
+# Digit-only P07 training shards, with the regular P07 test/valid files.
+# NOTE(review): the training paths carry a 'data/' component under
+# TRANSFORMED_DIGITS_PATH while the PNIST07 paths below do not -- confirm
+# which layout matches the files on disk.
+P07_digits = FTDataSet(\
+                     train_data = [os.path.join(TRANSFORMED_DIGITS_PATH,\
+                                     'data/P07_train'+str(i)+'_data.ft')\
+                                        for i in range(0, 100)],
+                     train_lbl = [os.path.join(TRANSFORMED_DIGITS_PATH,\
+                                     'data/P07_train'+str(i)+'_labels.ft')\
+                                        for i in range(0,100)],
+                     test_data = [os.path.join(DATA_PATH,'data/P07_test_data.ft')],
+                     test_lbl = [os.path.join(DATA_PATH,'data/P07_test_labels.ft')],
+                     valid_data = [os.path.join(DATA_PATH,'data/P07_valid_data.ft')],
+                     valid_lbl = [os.path.join(DATA_PATH,'data/P07_valid_labels.ft')],
+                     indtype=theano.config.floatX, inscale=255., maxsize=None)
+
+# Digit-only PNIST07 training shards, with the regular PNIST07 test/valid files.
+PNIST07_digits = FTDataSet(train_data = [os.path.join(TRANSFORMED_DIGITS_PATH,\
+                                            'PNIST07_train'+str(i)+'_data.ft')\
+                                                for i in range(0,100)],
+                     train_lbl = [os.path.join(TRANSFORMED_DIGITS_PATH,\
+                                            'PNIST07_train'+str(i)+'_labels.ft')\
+                                                for i in range(0,100)],
+                     test_data = [os.path.join(DATA_PATH,'data/PNIST07_test_data.ft')],
+                     test_lbl = [os.path.join(DATA_PATH,'data/PNIST07_test_labels.ft')],
+                     valid_data = [os.path.join(DATA_PATH,'data/PNIST07_valid_data.ft')],
+                     valid_lbl = [os.path.join(DATA_PATH,'data/PNIST07_valid_labels.ft')],
+                     indtype=theano.config.floatX, inscale=255., maxsize=None)
+
+
+# building valid_test_datasets
+# - we want a dataset object for each of the 3 datasets
+#       - so simply build FTDataset(train=anything, test, valid=pNIST etc.)
+# - the returned list must hold references to the test / valid functions
+#        themselves, i.e. the bound method without parentheses, not the
+#        dataset object
+def build_test_valid_sets():
+    """Return (functions, names) for the test and valid splits of the 3 datasets.
+
+    The first list holds the bound test/valid methods, the second the
+    matching names, in the same order.
+    """
+    sources = [(datasets.nist_all(), "nist_all__test", "nist_all__valid"),
+               (datasets.PNIST07(), "NISTP__test", "NISTP__valid"),
+               (datasets.nist_P07(), "P07__test", "P07__valid")]
+
+    test_valid_fns = []
+    test_valid_names = []
+    for ds, test_name, valid_name in sources:
+        test_valid_fns.extend([ds.test, ds.valid])
+        test_valid_names.extend([test_name, valid_name])
+
+    return test_valid_fns, test_valid_names
+
+def add_error_series(series, error_name, hdf5_file,
+                    index_names=('minibatch_idx',), use_accumulator=False,
+                    reduce_every=250):
+    """Create an ErrorSeries named error_name and store it in series.
+
+    series          -- dict-like container; the new series is stored under
+                       the key error_name
+    error_name      -- used both as the error name and as the table name
+    hdf5_file       -- passed to ErrorSeries as hdf5_file
+    index_names     -- passed to ErrorSeries as index_names
+    use_accumulator -- when true, wrap the series in an
+                       AccumulatorSeriesWrapper
+    reduce_every    -- reduce_every argument of the wrapper (only used when
+                       use_accumulator is true)
+    """
+    new_series = ErrorSeries(error_name=error_name,
+                    table_name=error_name,
+                    hdf5_file=hdf5_file,
+                    index_names=index_names)
+
+    if use_accumulator:
+        new_series = AccumulatorSeriesWrapper(base_series=new_series,
+                        reduce_every=reduce_every)
+
+    series[error_name] = new_series
+
+# Cache for the evaluation datasets, filled on the first call below.
+TEST_VALID_FNS,TEST_VALID_NAMES = None, None
+def compute_and_save_errors(state, mlp, series, hdf5_file, minibatch_idx):
+    """Evaluate mlp on every test/valid set and record the error rates.
+
+    state         -- dict-like job state; for each error the best value and
+                     the minibatch index where it was reached are stored
+                     under 'min_<name>' and 'minpos_<name>'
+    mlp           -- model exposing .input and .logRegressionLayer.y_pred
+    series        -- dict of error series; missing ones are created
+    hdf5_file     -- file handed to add_error_series for new series
+    minibatch_idx -- current minibatch index, used as the series index
+
+    For each dataset four error rates are computed: all examples, digits,
+    lower and upper.
+    """
+    global TEST_VALID_FNS,TEST_VALID_NAMES
+
+    # Build the evaluation datasets once and reuse them on later calls.
+    if TEST_VALID_FNS is None:
+        TEST_VALID_FNS,TEST_VALID_NAMES = build_test_valid_sets()
+
+    # if the training is on digits only, then there'll be a 100%
+    # error on non-digits in the valid/test set... just ignore them
+
+    test_fn = theano.function([mlp.input], mlp.logRegressionLayer.y_pred)
+
+    test_batch_size = 100
+    for test_ds_fn,test_ds_name in zip(TEST_VALID_FNS,TEST_VALID_NAMES):
+        # reset error counts for every test/valid set
+        # note: float
+        total_errors = total_digit_errors = \
+                total_uppercase_errors = total_lowercase_errors = 0.
+
+        total_all = total_lowercase = total_uppercase = total_digit = 0
+
+        for mb_x,mb_y in test_ds_fn(test_batch_size):
+            digit_mask = mb_y < 10
+            # NOTE(review): labels >= 36 are counted as uppercase here and
+            # 10..35 as lowercase -- confirm against the label encoding.
+            uppercase_mask = mb_y >= 36
+            # whatever is neither digit nor uppercase
+            lowercase_mask = numpy.ones((len(mb_x),)) \
+                                    - digit_mask - uppercase_mask
+
+            total_all += len(mb_x)
+            total_digit += numpy.sum(digit_mask)
+            total_uppercase += numpy.sum(uppercase_mask)
+            total_lowercase += numpy.sum(lowercase_mask)
+
+            predictions = test_fn(mb_x)
+
+            all_errors = (mb_y != predictions)
+            total_errors += numpy.sum(all_errors)
+
+            if len(all_errors) != len(digit_mask):
+                print "size all", all_errors.shape, " digit", digit_mask.shape
+            total_digit_errors += numpy.sum(numpy.multiply(all_errors, digit_mask))
+            total_uppercase_errors += numpy.sum(numpy.multiply(all_errors, uppercase_mask))
+            total_lowercase_errors += numpy.sum(numpy.multiply(all_errors, lowercase_mask))
+
+        four_errors = [float(total_errors) / total_all,
+                        float(total_digit_errors) / total_digit, 
+                        float(total_lowercase_errors) / total_lowercase, 
+                        float(total_uppercase_errors) / total_uppercase]
+
+        four_errors_names = ["all", "digits", "lower", "upper"]
+
+        # record stats per set
+        print "Errors on", test_ds_name, ",".join(four_errors_names),\
+                ":", ",".join([str(e) for e in four_errors])
+
+        # now in the state
+        for err, errname in zip(four_errors, four_errors_names):
+            error_full_name = 'error__'+test_ds_name+'_'+errname
+            min_name = 'min_'+error_full_name
+            minpos_name = 'minpos_'+error_full_name
+
+            # Create the series on first use, independently of what the
+            # state already contains.
+            if error_full_name not in series:
+                add_error_series(series, error_full_name, hdf5_file,
+                            index_names=('minibatch_idx',))
+
+            # Update the running minimum and remember where it was reached;
+            # the position only changes when a new minimum is found.
+            if (not state.has_key(min_name)) or state[min_name] > err:
+                state[min_name] = err
+                state[minpos_name] = minibatch_idx
+
+            series[error_full_name].append((minibatch_idx,), err)
+
+def jobman_entrypoint(state, channel):
+    """Train an MLP with plain SGD using the hyper-parameters in state.
+
+    state   -- job state; the fields read here are minibatch_size,
+               n_epochs, train_on, train_subset, rng_seed, n_hidden_layers,
+               n_hidden, L1_reg, L2_reg and learning_rate_log10.
+               compute_and_save_errors() writes the best errors back into it.
+    channel -- job channel; channel.save() is called after every pass over
+               the training set.
+
+    Training costs and periodic test/valid errors are logged to 'series.h5'
+    in the current working directory.  Raises NotImplementedError for an
+    unsupported train_on / train_subset combination.
+    """
+    global TEST_RUN
+    minibatch_size = state.minibatch_size
+
+    print_every = 100000
+    COMPUTE_ERROR_EVERY = 10**7 / minibatch_size # compute error every 10 million examples
+    if TEST_RUN:
+        print_every = 100
+        COMPUTE_ERROR_EVERY = 1000 / minibatch_size
+
+    print "entrypoint, state is"
+    print state
+
+    ######################
+    # select dataset and dataset subset, plus adjust epoch num to make number
+    # of examples seen independent of dataset
+    # example: for the DIGITS_ONLY case the number of epochs must be changed,
+    # and for plain NIST (no transformations) it must be multiplied by 100
+    # up front because the variations are not available
+
+    # compute this in terms of the P07 dataset size (=80M)
+    # NOTE(review): the formula uses 8*10**6 examples per epoch, not 80M --
+    # confirm which is intended.
+    MINIBATCHES_TO_SEE = state.n_epochs * 8 * (10**6) / minibatch_size
+
+    if state.train_on == 'NIST' and state.train_subset == 'ALL':
+        dataset_obj = datasets.nist_all()
+    elif state.train_on == 'NIST' and state.train_subset == 'DIGITS_ONLY':
+        dataset_obj = datasets.nist_digits()
+    elif state.train_on == 'NISTP' and state.train_subset == 'ALL':
+        dataset_obj = datasets.PNIST07()
+    elif state.train_on == 'NISTP' and state.train_subset == 'DIGITS_ONLY':
+        dataset_obj = PNIST07_digits
+    elif state.train_on == 'P07' and state.train_subset == 'ALL':
+        dataset_obj = datasets.nist_P07()
+    elif state.train_on == 'P07' and state.train_subset == 'DIGITS_ONLY':
+        # digit-only P07 is defined at module level, like PNIST07_digits
+        dataset_obj = P07_digits
+    else:
+        raise NotImplementedError("unsupported train_on/train_subset: %s/%s"
+                                  % (state.train_on, state.train_subset))
+
+    dataset = dataset_obj
+
+    if state.train_subset == 'ALL':
+        n_classes = 62
+    elif state.train_subset == 'DIGITS_ONLY':
+        n_classes = 10
+    else:
+        raise NotImplementedError()
+
+    ###############################
+    # construct model
+
+    print "constructing model..."
+    x     = T.matrix('x')
+    y     = T.ivector('y')
+
+    rng = numpy.random.RandomState(state.rng_seed)
+
+    # construct the MLP class
+    model = MLP(rng = rng, input=x, n_in=N_INPUTS,
+                        n_hidden_layers = state.n_hidden_layers,
+                        n_hidden = state.n_hidden, n_out=n_classes)
+
+
+    # cost and training fn
+    cost = T.mean(model.negative_log_likelihood(y)) \
+                 + state.L1_reg * model.L1 \
+                 + state.L2_reg * model.L2_sqr 
+
+    print "L1, L2: ", state.L1_reg, state.L2_reg
+
+    gradient_nll_wrt_params = [T.grad(cost, param) for param in model.params]
+
+    learning_rate = 10**float(state.learning_rate_log10)
+    print "Learning rate", learning_rate
+
+    # plain SGD step for every parameter
+    train_updates = {}
+    for param, gparam in zip(model.params, gradient_nll_wrt_params):
+        train_updates[param] = param - learning_rate * gparam
+
+    train_fn = theano.function([x,y], cost, updates=train_updates)
+
+    #######################
+    # create series
+    basedir = os.getcwd()
+
+    h5f = tables.openFile(os.path.join(basedir, "series.h5"), "w")
+
+    series = {}
+    add_error_series(series, "training_error", h5f,
+                    index_names=('minibatch_idx',), use_accumulator=True,
+                    reduce_every=REDUCE_EVERY)
+
+    ##########################
+    # training loop
+
+    start_time = time.clock()
+
+    print "begin training..."
+    print "will train for", MINIBATCHES_TO_SEE, "minibatches"
+
+    mb_idx = 0
+    stop_training = False
+
+    # Each iteration of the outer loop is one pass over the training set;
+    # training ends once MINIBATCHES_TO_SEE minibatches have been processed.
+    while mb_idx < MINIBATCHES_TO_SEE and not stop_training:
+
+        last_costs = []
+        mb_idx_at_start = mb_idx
+
+        for mb_x, mb_y in dataset.train(minibatch_size):
+            if mb_idx >= MINIBATCHES_TO_SEE:
+                stop_training = True
+                break
+            if TEST_RUN and mb_idx > 1000:
+                # a test run stops for good, not just for this pass
+                stop_training = True
+                break
+
+            last_cost = train_fn(mb_x, mb_y)
+            series["training_error"].append((mb_idx,), last_cost)
+
+            last_costs.append(last_cost)
+            if len(last_costs) >= print_every:
+                print "Mean over last", print_every, "minibatches: ", numpy.mean(last_costs)
+                last_costs = []
+
+            if (mb_idx+1) % COMPUTE_ERROR_EVERY == 0:
+                # compute errors
+                print "computing errors on all datasets..."
+                print "Time since training began: ", (time.clock()-start_time)/60., "minutes"
+                compute_and_save_errors(state, model, series, h5f, mb_idx)
+
+            mb_idx += 1
+
+        if mb_idx == mb_idx_at_start:
+            # the training set produced no minibatch: avoid looping forever
+            stop_training = True
+
+        channel.save()
+
+        sys.stdout.flush()
+
+    end_time = time.clock()
+
+    # make sure everything logged so far reaches the disk
+    h5f.close()
+
+    print "-"*80
+    print "Finished. Training took", (end_time-start_time)/60., "minutes"
+    print state
+
+def run_test():
+    """Run jobman_entrypoint on the TEST_HP settings with a mock channel.
+
+    Sets the module-level TEST_RUN flag before starting.
+    """
+    global TEST_RUN
+    from fsml.job_management import mock_channel as test_channel
+    TEST_RUN = True
+    jobman_entrypoint(state=TEST_HP, channel=test_channel)
+
+# Running the file directly starts the short test run.
+if __name__ == '__main__':
+    run_test()
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/deep/deep_mlp/logistic_sgd.py	Sat Mar 19 22:51:40 2011 -0400
@@ -0,0 +1,223 @@
+import numpy, time, cPickle, gzip, sys, os
+
+import theano
+import theano.tensor as T
+
+class LogisticRegression(object):
+    """Multi-class logistic regression (softmax) layer.
+
+    W is an (n_in, n_out) weight matrix and b an (n_out,) bias vector, both
+    initialised to zeros and stored as Theano shared variables.
+    """
+    def __init__(self, input, n_in, n_out):
+        """Build the symbolic outputs of the layer.
+
+        input -- symbolic input; it is multiplied with W via T.dot
+        n_in  -- number of input units (rows of W)
+        n_out -- number of output classes (columns of W, length of b)
+        """
+        self.W = theano.shared(value=numpy.zeros((n_in,n_out),
+                                dtype = theano.config.floatX),
+                                name='W')
+        self.b = theano.shared(value=numpy.zeros((n_out,),
+                                dtype = theano.config.floatX),
+                               name='b')
+
+        # class-membership probabilities, one row per example
+        self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W)+self.b)
+
+        # predicted class = index of the largest probability
+        self.y_pred=T.argmax(self.p_y_given_x, axis=1)
+
+        self.params = [self.W, self.b]
+
+    def negative_log_likelihood(self, y):
+        """Return the mean over the examples of -log p(y_i | x_i).
+
+        y -- symbolic vector of labels, used as column indices into
+             p_y_given_x
+        """
+        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y])
+
+
+    def errors(self, y):
+        """Return the fraction of examples whose prediction differs from y.
+
+        Raises TypeError when y does not have the same number of dimensions
+        as y_pred, and NotImplementedError when y is not an integer type.
+        """
+        if y.ndim != self.y_pred.ndim:
+            raise TypeError('y should have the same shape as self.y_pred', 
+                ('y', y.type, 'y_pred', self.y_pred.type))
+
+        if y.dtype.startswith('int'):
+            return T.mean(T.neq(self.y_pred, y))
+        else:
+            raise NotImplementedError()
+
+
+def load_data(dataset):
+    ''' Load a pickled dataset and wrap each split in shared variables
+
+    :type dataset: string
+    :param dataset: path to the gzipped pickle (here MNIST) holding the
+                    (train, valid, test) splits
+
+    Returns a list of three (inputs, labels) pairs in the order train,
+    valid, test.
+    '''
+
+    print '... loading data'
+
+    # The pickle holds the three splits in this order.
+    f = gzip.open(dataset,'rb')
+    train_set, valid_set, test_set = cPickle.load(f)
+    f.close()
+
+    def shared_dataset(data_xy):
+        """ Put one (inputs, labels) split into shared variables
+
+        Keeping the data in shared variables lets Theano copy it into GPU
+        memory once (when the code runs on a GPU) instead of transferring
+        every minibatch separately, which would be much slower.
+        """
+        data_x, data_y = data_xy
+        features = numpy.asarray(data_x, dtype=theano.config.floatX)
+        labels = numpy.asarray(data_y, dtype=theano.config.floatX)
+        shared_x = theano.shared(features)
+        shared_y = theano.shared(labels)
+        # Data stored on the GPU has to be floating point, so the labels
+        # are stored as ``floatX`` too.  They are used as indices during
+        # the computations, which requires integers, so a cast to int32 is
+        # returned instead of ``shared_y`` itself.
+        return shared_x, T.cast(shared_y, 'int32')
+
+    test_pair = shared_dataset(test_set)
+    valid_pair = shared_dataset(valid_set)
+    train_pair = shared_dataset(train_set)
+
+    return [train_pair, valid_pair, test_pair]
+
+
+
+
+def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000, dataset='../data/mnist.pkl.gz',
+        batch_size = 600):
+    """Train a logistic regression on MNIST with minibatch SGD and early stopping.
+
+    learning_rate -- step size of the gradient updates
+    n_epochs      -- maximal number of passes over the training set
+    dataset       -- path of the pickled dataset, passed to load_data
+    batch_size    -- number of examples per minibatch
+
+    Validation error is printed whenever it is measured, together with the
+    test error each time a new best validation error is found.
+    """
+    datasets = load_data(dataset)
+
+    train_set_x, train_set_y = datasets[0]
+    valid_set_x, valid_set_y = datasets[1]
+    test_set_x , test_set_y  = datasets[2]
+
+    # compute number of minibatches for training, validation and testing
+    n_train_batches = train_set_x.value.shape[0] / batch_size
+    n_valid_batches = valid_set_x.value.shape[0] / batch_size
+    n_test_batches  = test_set_x.value.shape[0]  / batch_size
+
+
+    ######################
+    # BUILD ACTUAL MODEL #
+    ######################
+    print '... building the model'
+
+
+    # allocate symbolic variables for the data
+    index = T.lscalar()    # index to a [mini]batch 
+    x     = T.matrix('x')  # the data is presented as rasterized images
+    y     = T.ivector('y') # the labels are presented as 1D vector of 
+                           # [int] labels
+
+    # construct the logistic regression class
+    # Each MNIST image has size 28*28
+    classifier = LogisticRegression( input=x, n_in=28*28, n_out=10)
+
+    # the cost we minimize during training is the negative log likelihood of 
+    # the model in symbolic format
+    cost = classifier.negative_log_likelihood(y) 
+
+    # compiling a Theano function that computes the mistakes that are made by 
+    # the model on a minibatch
+    test_model = theano.function(inputs = [index], 
+            outputs = classifier.errors(y),
+            givens={
+                x:test_set_x[index*batch_size:(index+1)*batch_size],
+                y:test_set_y[index*batch_size:(index+1)*batch_size]})
+
+    validate_model = theano.function( inputs = [index], 
+            outputs = classifier.errors(y),
+            givens={
+                x:valid_set_x[index*batch_size:(index+1)*batch_size],
+                y:valid_set_y[index*batch_size:(index+1)*batch_size]})
+
+    # compute the gradient of cost with respect to theta = (W,b) 
+    g_W = T.grad(cost = cost, wrt = classifier.W)
+    g_b = T.grad(cost = cost, wrt = classifier.b)
+
+    # specify how to update the parameters of the model as a dictionary
+    updates ={classifier.W: classifier.W - learning_rate*g_W,\
+              classifier.b: classifier.b - learning_rate*g_b}
+
+    # compiling a Theano function `train_model` that returns the cost and, at 
+    # the same time, updates the parameters of the model based on the rules 
+    # defined in `updates`
+    train_model = theano.function(inputs = [index], 
+            outputs = cost, 
+            updates = updates,
+            givens={
+                x:train_set_x[index*batch_size:(index+1)*batch_size],
+                y:train_set_y[index*batch_size:(index+1)*batch_size]})
+
+    ###############
+    # TRAIN MODEL #
+    ###############
+    print '... training the model'
+    # early-stopping parameters
+    patience              = 5000  # run at least this many iterations
+    patience_increase     = 2     # wait this much longer when a new best is 
+                                  # found
+    improvement_threshold = 0.995 # a relative improvement of this much is 
+                                  # considered significant
+    validation_frequency  = min(n_train_batches, patience/2)  
+                                  # go through this many 
+                                  # minibatches before checking the network 
+                                  # on the validation set; in this case we 
+                                  # check every epoch 
+
+    best_validation_loss = float('inf')
+    test_score           = 0.
+    start_time = time.clock()
+
+    done_looping = False 
+    epoch = 0  
+    while (epoch < n_epochs) and (not done_looping):
+        epoch = epoch + 1
+        for minibatch_index in xrange(n_train_batches):
+
+            minibatch_avg_cost = train_model(minibatch_index)
+            # number of minibatches processed before this one; epoch was
+            # already incremented above, hence the -1
+            iter_num = (epoch - 1) * n_train_batches + minibatch_index
+
+            if (iter_num+1) % validation_frequency == 0: 
+                # compute zero-one loss on validation set 
+                validation_losses = [validate_model(i) for i in xrange(n_valid_batches)]
+                this_validation_loss = numpy.mean(validation_losses)
+
+                print('epoch %i, minibatch %i/%i, validation error %f %%' % \
+                    (epoch, minibatch_index+1,n_train_batches, \
+                    this_validation_loss*100.))
+
+
+                # if we got the best validation score until now
+                if this_validation_loss < best_validation_loss:
+                    #improve patience if loss improvement is good enough
+                    if this_validation_loss < best_validation_loss *  \
+                       improvement_threshold :
+                        patience = max(patience, iter_num * patience_increase)
+
+                    best_validation_loss = this_validation_loss
+                    # test it on the test set
+
+                    test_losses = [test_model(i) for i in xrange(n_test_batches)]
+                    test_score  = numpy.mean(test_losses)
+
+                    print(('     epoch %i, minibatch %i/%i, test error of best ' 
+                       'model %f %%') % \
+                        (epoch, minibatch_index+1, n_train_batches,test_score*100.))
+
+            # stop once the patience budget is used up
+            if patience <= iter_num :
+                done_looping = True
+                break
+
+    end_time = time.clock()
+    print(('Optimization complete with best validation score of %f %%,'
+           'with test performance %f %%') %  
+                 (best_validation_loss * 100., test_score*100.))
+    print 'The code run for %d epochs, with %f epochs/sec'%(epoch,1.*epoch/(end_time-start_time))
+    print >> sys.stderr, ('The code for file '+os.path.split(__file__)[1]+' ran for %.1fs' % ((end_time-start_time)))
+
+# Script entry point: run the MNIST training with its default arguments.
+if __name__ == '__main__':
+    sgd_optimization_mnist()
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/deep/deep_mlp/mlp.py	Sat Mar 19 22:51:40 2011 -0400
@@ -0,0 +1,210 @@
+__docformat__ = 'restructedtext en'
+
+import numpy, time, cPickle, gzip, sys, os
+
+import theano
+import theano.tensor as T
+
+from logistic_sgd import LogisticRegression, load_data
+
+class HiddenLayer(object):
+    def __init__(self, rng, input, n_in, n_out, activation = T.tanh):
+        print "Creating HiddenLayer with params"
+        print locals()
+
+        self.input = input
+
+        W_values = numpy.asarray( rng.uniform(
+                low  = - numpy.sqrt(6./(n_in+n_out)),
+                high = numpy.sqrt(6./(n_in+n_out)),
+                size = (n_in, n_out)), dtype = theano.config.floatX)
+        if activation == theano.tensor.nnet.sigmoid:
+            W_values *= 4
+
+        self.W = theano.shared(value = W_values, name ='W')
+
+        b_values = numpy.zeros((n_out,), dtype= theano.config.floatX)
+        self.b = theano.shared(value= b_values, name ='b')
+
+        self.output = activation(T.dot(input, self.W) + self.b)
+
+        self.params = [self.W, self.b]
+
+
+class MLP(object):
+    def __init__(self, rng, input, n_in, n_hidden_layers, n_hidden, n_out):
+        print "Creating MLP with params"
+        print locals()
+
+        self.input = input
+
+        self.hiddenLayers = []
+
+        last_input = input
+        last_n_out = n_in
+        for i in range(n_hidden_layers):
+            self.hiddenLayers.append(\
+                    HiddenLayer(rng = rng, input = last_input, 
+                                             n_in = last_n_out,
+                                             n_out = n_hidden,
+                                             activation = T.tanh))
+            last_input = self.hiddenLayers[-1].output
+            last_n_out = n_hidden
+
+        self.logRegressionLayer = LogisticRegression( 
+                                    input = self.hiddenLayers[-1].output,
+                                    n_in  = n_hidden,
+                                    n_out = n_out)
+
+        self.L1 = abs(self.logRegressionLayer.W).sum()
+        for h in self.hiddenLayers:
+            self.L1 += abs(h.W).sum()
+
+        self.L2_sqr = (self.logRegressionLayer.W**2).sum()
+        for h in self.hiddenLayers:
+            self.L2_sqr += (h.W**2).sum()
+
+        self.negative_log_likelihood = self.logRegressionLayer.negative_log_likelihood
+
+        self.errors = self.logRegressionLayer.errors
+
+        self.params = []
+        for hl in self.hiddenLayers:
+            self.params += hl.params
+        self.params += self.logRegressionLayer.params
+
+
+def test_mlp( learning_rate=0.01, L1_reg = 0.00, L2_reg = 0.0001, n_epochs=1000,
+            dataset = '../data/mnist.pkl.gz', batch_size = 20):
+    datasets = load_data(dataset)
+
+    train_set_x, train_set_y = datasets[0]
+    valid_set_x, valid_set_y = datasets[1]
+    test_set_x , test_set_y  = datasets[2]
+
+    n_train_batches = train_set_x.value.shape[0] / batch_size
+    n_valid_batches = valid_set_x.value.shape[0] / batch_size
+    n_test_batches  = test_set_x.value.shape[0]  / batch_size
+
+    ######################
+    # BUILD ACTUAL MODEL #
+    ###################### 
+    print '... building the model'
+
+    # allocate symbolic variables for the data
+    index = T.lscalar()    # index to a [mini]batch 
+    x     = T.matrix('x')  # the data is presented as rasterized images
+    y     = T.ivector('y') # the labels are presented as 1D vector of 
+                           # [int] labels
+
+    rng = numpy.random.RandomState(1234)
+
+    # construct the MLP class
+    classifier = MLP( rng = rng, input=x, n_in=28*28, n_hidden = 500, n_out=10)
+
+    # the cost we minimize during training is the negative log likelihood of 
+    # the model plus the regularization terms (L1 and L2); cost is expressed
+    # here symbolically
+    cost = classifier.negative_log_likelihood(y) \
+         + L1_reg * classifier.L1 \
+         + L2_reg * classifier.L2_sqr 
+
+    # compiling a Theano function that computes the mistakes that are made
+    # by the model on a minibatch
+    test_model = theano.function(inputs = [index], 
+            outputs = classifier.errors(y),
+            givens={
+                x:test_set_x[index*batch_size:(index+1)*batch_size],
+                y:test_set_y[index*batch_size:(index+1)*batch_size]})
+
+    validate_model = theano.function(inputs = [index], 
+            outputs = classifier.errors(y),
+            givens={
+                x:valid_set_x[index*batch_size:(index+1)*batch_size],
+                y:valid_set_y[index*batch_size:(index+1)*batch_size]})
+
+    # compute the gradient of cost with respect to theta (sotred in params)
+    # the resulting gradients will be stored in a list gparams
+    gparams = []
+    for param in classifier.params:
+        gparam  = T.grad(cost, param)
+        gparams.append(gparam)
+
+
+    # specify how to update the parameters of the model as a dictionary
+    updates = {}
+    # given two list the zip A = [ a1,a2,a3,a4] and B = [b1,b2,b3,b4] of 
+    # same length, zip generates a list C of same size, where each element
+    # is a pair formed from the two lists : 
+    #    C = [ (a1,b1), (a2,b2), (a3,b3) , (a4,b4) ] 
+    for param, gparam in zip(classifier.params, gparams):
+        updates[param] = param - learning_rate*gparam
+
+    # compiling a Theano function `train_model` that returns the cost, but  
+    # in the same time updates the parameter of the model based on the rules 
+    # defined in `updates`
+    train_model =theano.function( inputs = [index], outputs = cost, 
+            updates = updates,
+            givens={
+                x:train_set_x[index*batch_size:(index+1)*batch_size],
+                y:train_set_y[index*batch_size:(index+1)*batch_size]})
+
+    ###############
+    # TRAIN MODEL #
+    ###############
+    print '... training'
+
+    # early-stopping parameters
+    patience              = 10000 # look as this many examples regardless
+    patience_increase     = 2     # wait this much longer when a new best is 
+                                  # found
+    improvement_threshold = 0.995 # a relative improvement of this much is 
+                                  # considered significant
+    validation_frequency  = min(n_train_batches,patience/2)  
+                                  # go through this many 
+                                  # minibatche before checking the network 
+                                  # on the validation set; in this case we 
+                                  # check every epoch 
+
+
+    best_params          = None
+    best_validation_loss = float('inf')
+    best_iter            = 0
+    test_score           = 0.
+    start_time = time.clock()
+
+    epoch = 0
+    done_looping = False
+
+    while (epoch < n_epochs) and (not done_looping):
+      epoch = epoch + 1
+      for minibatch_index in xrange(n_train_batches):
+
+        minibatch_avg_cost = train_model(minibatch_index)
+        # iteration number
+        iter = epoch * n_train_batches + minibatch_index
+
+        if (iter+1) % validation_frequency == 0: 
+            # compute zero-one loss on validation set 
+            validation_losses = [validate_model(i) for i in xrange(n_valid_batches)]
+            this_validation_loss = numpy.mean(validation_losses)
+
+            print('epoch %i, minibatch %i/%i, validation error %f %%' % \
+                 (epoch, minibatch_index+1,n_train_batches, \
+                  this_validation_loss*100.))
+
+
+            # if we got the best validation score until now
+            if this_validation_loss < best_validation_loss:
+                #improve patience if loss improvement is good enough
+                if this_validation_loss < best_validation_loss *  \
+                       improvement_threshold :
+                    patience = max(patience, iter * patience_increase)
+
+                best_validation_loss = this_validation_loss
+                # test it on the test set
+
+                test_losses = [test_model(i) for i in xrange(n_test_batches)]
+                test_score = numpy.mean(test_losses)
+
+          
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/deep/stacked_dae/aistats_review/m_mlp_ift.py	Sat Mar 19 22:51:40 2011 -0400
@@ -0,0 +1,121 @@
+import pdb,bricks.costs,datetime,os,theano,sys
+from bricks.experiments import *
+from bricks.networks import *
+from bricks import *
+from datasets import *
+from bricks.optimizer import *
+from monitor.exp_monitoring import *
+
+#from monitor.series import *
+import numpy
+#import jobman,jobman.sql,pylearn.version
+#from jobman import DD
+#from utils.JobmanHandling import *
+
+class MnistTSdaeExperiment(ExperimentObject):
+    # Todo : Write down the interface
+
+    def _init_dataset(self):
+        self.dataset_list = [ PNIST07(), nist_all() ]
+        self.dataset = self.dataset_list[0]
+
+
+    def _init_outputs(self):
+        self.ds_output = { 'Pnist_Train' : self.dataset_list[0].train,
+                           'Pnist_Valid' : self.dataset_list[0].valid,
+                           'Pnist_Test' : self.dataset_list[0].test,
+                           'nist_Train' : self.dataset_list[1].train,
+                           'nist_Valid' : self.dataset_list[1].valid,
+                           'nist_Test' : self.dataset_list[1].test}
+
+        self.outputs = { 'CC' : costs.classification_error(self.network.layers[-1][0].out_dict['argmax_softmax_output'],self.network.in_dict['pred']) }
+                         #'L1' : costs.L1(self.network.layers[0][0].out_dict['sigmoid_output']) }
+                         #'LL' : costs.negative_ll(self.network.layers[-1][0].out_dict['softmax_output'],self.network.in_dict['pred']) }
+
+
+
+    def _init_network(self):
+        """ Choose wich network to initialize """
+        #x,y = self.dataset.train(1).next()
+        n_i = 1024
+        n_o = 62
+        numpy.random.seed(self.hp['seed'])
+        self.network = MLPNetwork(n_i,n_o,size=self.hp['size'])
+        default.load_pickled_network(self.network,'best_params/1/')
+
+    def _init_costs_params(self):
+        #finetuning
+        self.costs  = [ [costs.negative_ll(self.network.layers[-1][0].out_dict['softmax_output'],self.network.in_dict['pred'])] ]
+        self.params = [ [self.network.get_all_params(),self.network.get_all_params()] ]
+
+
+    def _init_monitor(self):
+        self.monitor = monitor(self.outputs,self.ds_output,self.network,self.sub_paths,save_criterion='Pnist_Valid')
+
+    def startexp(self):
+        print self.info()
+        for j,optimizer in enumerate(self.optimizers):
+            print 'Optim', '#'+str(j+1)
+            sys.stdout.flush()
+            for i in range(self.hp['ft_ep']):
+                optimizer.tune(self.dataset.train,self.hp['bs'])
+                print repr(i).rjust(3),self.monitor.get_str_output()
+                sys.stdout.flush()
+
+
+    def run(self):
+        self.startexp()
+        self.monitor.dump()
+        return True
+
+def jobman_entrypoint(state, channel):
+    import jobman,jobman.sql,pylearn.version
+    from jobman import DD
+    from utils.JobmanHandling import JobHandling,jobman_insert,cartesian_product_jobs
+    exp = MnistTSdaeExperiment(state,channel)
+    return exp.jobhandler.start(state,channel)
+
+def standalone(state):
+    exp = MnistTSdaeExperiment(state)
+    exp.run()
+
+
+if __name__ == '__main__':
+    HP = { 'lr':[ [ .1] ],
+           'ft_ep':[100],
+           'bs':[100],
+           'size':[ [300],[4000],[5000],[6000],[7000] ],
+           'seed':[0]}
+
+    job_db_path = 'postgres://mullerx:b9f6ed1ee4@gershwin/mullerx_db/m_mlp_ift'
+    exp_path = "m_mlp_ift.jobman_entrypoint"
+
+    args = sys.argv[1:]
+
+    if len(args) > 0 and args[0] == 'jobman_insert':
+        jobman_insert(HP,job_db_path,exp_path)
+
+    elif len(args) > 0 and args[0] == 'jobman_test':
+        chanmock = DD({'COMPLETE':0,'save':(lambda:None)})
+        dd_hp = cartesian_product_jobs(HP)
+        print dd_hp[0]
+        jobman_entrypoint(dd_hp[0], chanmock)
+
+    elif len(args) > 0 and args[0] == 'standalone':
+        hp = { 'lr':[ .1],
+           'ft_ep':100,  
+           'bs':100,
+           'size':[ 3000 ],
+           'seed':0}
+        standalone(hp)
+        
+        
+    else:
+        print "Bad arguments"
+
+
+#jobman sqlview  postgres://mullerx:b9f6ed1ee4@gershwin/mullerx_db/m_mlp_ift m_mlp_ift_view
+#psql -h gershwin -U mullerx -d mullerx_db
+#b9f6ed1ee4
+
+#jobdispatch --condor  --env=THEANO_FLAGS=floatX=float32 --repeat_jobs=5 jobman sql -n0 'postgres://mullerx:b9f6ed1ee4@gershwin/mullerx_db/m_mlp_ift' .
--- a/writeup/Makefile	Sat Mar 19 22:49:33 2011 -0400
+++ b/writeup/Makefile	Sat Mar 19 22:51:40 2011 -0400
@@ -1,8 +1,12 @@
-all: nips2010_submission.pdf
+all: aistats2011_cameraready.pdf
+#all: nips2010_submission.pdf
 
-nips2010_submission.pdf: nips2010_submission.tex images/*
-	pdflatex nips2010_submission.tex
-	bibtex -min-crossrefs=999 nips2010_submission
-	pdflatex nips2010_submission.tex
-	pdflatex nips2010_submission.tex
+%.pdf: %.tex images/*
+	pdflatex $<
+	bibtex -min-crossrefs=999 ${<:.tex=}
+	pdflatex $<
+	pdflatex $<
+	pdflatex $<
 
+clean:
+	rm -f *.aux *.bbl *.blg *.log
\ No newline at end of file
Binary file writeup/NIPS2010_workshop_spotlight.pdf has changed
Binary file writeup/NIPS2010_workshop_spotlight.ppt has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/writeup/ReviewsAISTATS.html	Sat Mar 19 22:51:40 2011 -0400
@@ -0,0 +1,297 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<!-- saved from url=(0096)https://cmt.research.microsoft.com/AIS2011/Protected/Author/ViewReviewsForPaper.aspx?paperId=126 -->
+<html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"><title>
+	Reviews For Paper
+</title>
+<style>
+#header
+{
+    width: 100%;
+    font-size: small;
+    background-color:#F7F7F7;
+}
+.printThemeText
+{
+    font-size:small;
+}
+.printThemeTable td
+{
+    vertical-align:top;
+}
+.printThemeGrid th
+{
+    color:white;
+    background:#5D7B9D;
+    font-weight:bold;
+}
+.printThemeGrid
+{
+    border-collapse:collapse;
+}
+.printThemeGrid td, .printThemeGrid th
+{
+    border:solid 1px #D6D3CE;
+    padding:4px 4px 4px 4px;
+}
+.printThemeGrid .row
+{ 
+    background-color:#F7F6F3;
+    color:#333333;
+    vertical-align:top;
+}
+.printThemeGrid .altrow
+{ 
+    background-color:White;
+    color:#284775;
+    vertical-align:top;
+}
+.cellprompt
+{
+	font-weight:bold;
+	white-space:nowrap;
+    width:100px;	
+}
+.paperHeader
+{
+    background-color:#dee3e7;
+    margin:5px 5px 15px 0px;
+    width:99%;
+    font-family:Verdana;
+    font-size:medium;
+    font-weight:bold;
+}
+.sectionHeader
+{
+    background-color:#dee3e7;
+    padding:5px 5px 5px 0px;
+    width:99%;
+    text-decoration:underline;
+    font-family:Verdana;
+    font-size:small;
+    font-weight:bold;
+}
+.underlineheader
+{
+    text-decoration:underline;
+    font-weight:bold;
+    padding:5px 0px;
+}
+.response
+{
+    padding:5px 0px;
+}
+.reviewerlabel
+{
+    padding-right:20px;
+}
+.pageTitle
+{
+    background-color:#dee3e7;
+    padding:5px 5px 5px 5px;
+    margin-top:10px;
+    width:99%;
+    font-family:Verdana;
+    font-size:medium;
+    font-weight:bold;
+}
+.submissionDetailsView
+{
+}
+.submissionDetailsView tr
+{
+    vertical-align:top;
+}
+.submissionDetailsView td.prompt
+{
+    font-weight:bold;
+}
+.submissionDetailsView tr.sectionSeparator
+{
+
+}
+.submissionDetailsView tr.sectionSeparator td
+{
+    background-color:#dee3e7;
+    padding:5px 5px 5px 5px;
+    font-family:Verdana;
+    font-size:small;
+    font-weight:bold;
+    color:Navy;
+}
+</style>
+</head>
+<body>
+<form name="aspnetForm" method="post" action="./ReviewsAISTATS_files/ReviewsAISTATS.html" id="aspnetForm">
+<div>
+<input type="hidden" name="__VIEWSTATE" id="__VIEWSTATE" value="/wEPDwUKMTAxNDM4ODU3Ng9kFgJmD2QWAgIDD2QWAmYPZBYCAgUPDxYCHgdWaXNpYmxlZ2QWBgIBD2QWAmYPZBYEAgMPDxYCHgRUZXh0BQMxMjZkZAIFDw8WAh8BBTxEZWVwIExlYXJuZXJzIEJlbmVmaXQgTW9yZSBmcm9tIE91dC1vZi1EaXN0cmlidXRpb24gRXhhbXBsZXNkZAIDDw8WAh8AaGRkAgUPFgIeC18hSXRlbUNvdW50AgIWBGYPZBYGAgMPZBYCZg8VARNBc3NpZ25lZF9SZXZpZXdlcl8yZAIHDzwrAA0BAA8WBB4LXyFEYXRhQm91bmRnHwICB2QWAmYPZBYQAgEPZBYEZg8PFgIfAQWBAk92ZXJhbGwgcmF0aW5nOiBwbGVhc2Ugc3ludGhlc2l6ZSB5b3VyIGFuc3dlcnMgdG8gb3RoZXIgcXVlc3Rpb25zIGludG8gYW4gb3ZlcmFsbCByZWNvbW1lbmRhdGlvbi4gIFBsZWFzZSB0YWtlIGludG8gYWNjb3VudCB0cmFkZW9mZnMgKGFuIGluY3JlYXNlIGluIG9uZSBtZWFzdXJlIG1heSBjb21wZW5zYXRlIGZvciBhIGRlY3JlYXNlIGluIGFub3RoZXIpLCBhbmQgZGVzY3JpYmUgdGhlIHRyYWRlb2ZmcyBpbiB0aGUgZGV0YWlsZWQgY29tbWVudHMuZGQCAQ9kFgJmDxUBFEdvb2Q6IHN1Z2dlc3QgYWNjZXB0ZAICD2QWBGYPDxYCHwEFTFRlY2huaWNhbCBxdWFsaXR5OiBpcyBhbGwgaW5jbHVkZWQgbWF0ZXJpYWwgcHJlc2VudGVkIGNsZWFybHkgYW5kIGNvcnJlY3RseT9kZAIBD2QWAmYPFQEER29vZGQCAw9kFgRmDw8WAh8BBWdPcmlnaW5hbGl0eTogaG93IG11Y2ggbmV3IHdvcmsgaXMgcmVwcmVzZW50ZWQgaW4gdGhpcyBwYXBlciwgYmV5b25kIHByZXZpb3VzIGNvbmZlcmVuY2Uvam91cm5hbCBwYXBlcnM/ZGQCAQ9kFgJmDxUBGFN1YnN0YW50aWFsIG5ldyBtYXRlcmlhbGQCBA9kFgRmDw8WAh8BBYMBSW50ZXJlc3QgYW5kIHNpZ25pZmljYW5jZTogd291bGQgdGhlIHBhcGVyJ3MgZ29hbCwgaWYgY29tcGxldGVseSBzb2x2ZWQsIHJlcHJlc2VudCBhIHN1YnN0YW50aWFsIGFkdmFuY2UgZm9yIHRoZSBBSVNUQVRTIGNvbW11bml0eT9kZAIBD2QWAmYPFQELU2lnbmlmaWNhbnRkAgUPZBYEZg8PFgIfAQV1VGhvcm91Z2huZXNzOiB0byB3aGF0IGRlZ3JlZSBkb2VzIHRoZSBwYXBlciBzdXBwb3J0IGl0cyBjb25jbHVzaW9ucyB0aHJvdWdoIGV4cGVyaW1lbnRhbCBjb21wYXJpc29ucywgdGhlb3JlbXMsIGV0Yy4/ZGQCAQ9kFgJmDxUBCFRob3JvdWdoZAIGD2QWBGYPDxYCHwEFfUNyZWF0aXZpdHk6IHRvIHdoYXQgZGVncmVlIGRvZXMgdGhlIHBhcGVyIHJlcHJlc2VudCBhIG5vdmVsIHdheSBvZiBzZXR0aW5nIHVwIGEgcHJvYmxlbSBvciBhbiB1bnVzdWFsIGFwcHJvYWNoIHRvIHNvbHZpbmcgaXQ/ZGQCAQ9kFgJmDxUBMk1vc3QgY29udGVudCByZXByZXNlbnRzIGFwcGxpY2F0aW9uIG9mIGtub3duIGlkZWFzZAIHD2QWBGYPDxYCHwEFEURldGFpbGVkIENvbW1lbnRzZGQCAQ9kFgJmDxUBighUaGlzIHBhcGVyIHNob3dzIHRoYXQgZGVlcCBuZXR3b3JrcyBiZW5
lZml0IG1vcmUgZnJvbSBvdXQtb2YtZGlzdHJpYnV0aW9uIGV4YW1wbGVzIHRoYW4gc2hhbGxvd2VyIGFyY2hpdGVjdHVyZXMgb24gYSBsYXJnZSBzY2FsZSBjaGFyYWN0ZXIgcmVjb2duaXRpb24gZXhwZXJpbWVudC4gQSB0aG9yb3VnaCBlbXBpcmljYWwgdmFsaWRhdGlvbiBzaG93cyB0aGF0IGRlZXAgbmV0cyBwcm9kdWNlIGJldHRlciBkaXNjcmltaW5hdGlvbiAodGhhbiBzaGFsbG93ZXIgbmV0cykgd2hlbiB0cmFpbmVkIHdpdGggZGlzdG9ydGVkIGNoYXJhY3RlcnMgYW5kIHdoZW4gdHJhaW5lZCBvbiBtdWx0aXBsZSB0YXNrcy4gDTxiciAvPkFsdGhvdWdoIHRoZSBtZXRob2RzIHVzZWQgYXJlIGFscmVhZHkgd2VsbCBlc3RhYmxpc2hlZCBpbiB0aGUgY29tbXVuaXR5LCB0aGVzZSByZXN1bHRzIGFyZSBzaWduaWZpY2FudCBhbmQgcHJvdmlkZSBuZXcgaW5zaWdodHMgb24gdGhlIHJlcHJlc2VudGF0aW9uYWwgcG93ZXIgb2YgdGhpcyBjbGFzcyBvZiBtZXRob2RzLg08YnIgLz4NPGJyIC8+U3VnZ2VzdGlvbnM6DTxiciAvPi0gaXQgd291bGQgYmUgaW50ZXJlc3RpbmcgdG8gY29tcGFyZSB0aGUgZGVlcCBhcmNoaXRlY3R1cmUgYW5kIHRoZSBzaGFsbG93IGFyY2hpdGVjdHVyZSBmb3IgYSBnaXZlbiBjYXBhY2l0eSBvZiB0aGUgbW9kZWwgKGkuZS4gdXNlIHdpZGVyIHNoYWxsb3cgbmV0KQ08YnIgLz4tIHNpbmNlIHRoZSBhdXRob3JzIHVzZSBkZW5vaXNpbmcgYXV0b2VuY29kZXJzIHRvIHByZS10cmFpbiBkZWVwIG5ldHdvcmtzLCB0aGV5IGNvdWxkIGNvbnNpZGVyIHRvIHVzZSBkaXN0b3J0ZWQgY2hhcmFjdGVycyBhcyBub2lzeSBpbnB1dHMgaW5zdGVhZCBvZiBhcnRpZmljaWFsbHkgc2V0IHRvIDAgc29tZSBpbnB1dHMuIFRoaXMgbWlnaHQgaGVscCBsZWFybmluZyBtb3JlIHJlcHJlc2VudGF0aW9ucyB0aGF0IGFyZSBtb3JlIHJvYnVzdCB0byBkaXN0b3J0aW9ucyB0aGF0IGFyZSBhY3R1YWxseSB1c2VmdWwgZm9yIGRpc2NyaW1pbmF0aW9uLmQCCA8PFgIfAGhkZAIIDxUBAGQCAQ9kFgYCAw9kFgJmDxUBE0Fzc2lnbmVkX1Jldmlld2VyXzNkAgcPPCsADQEADxYEHwNnHwICB2QWAmYPZBYQAgEPZBYEZg8PFgIfAQWBAk92ZXJhbGwgcmF0aW5nOiBwbGVhc2Ugc3ludGhlc2l6ZSB5b3VyIGFuc3dlcnMgdG8gb3RoZXIgcXVlc3Rpb25zIGludG8gYW4gb3ZlcmFsbCByZWNvbW1lbmRhdGlvbi4gIFBsZWFzZSB0YWtlIGludG8gYWNjb3VudCB0cmFkZW9mZnMgKGFuIGluY3JlYXNlIGluIG9uZSBtZWFzdXJlIG1heSBjb21wZW5zYXRlIGZvciBhIGRlY3JlYXNlIGluIGFub3RoZXIpLCBhbmQgZGVzY3JpYmUgdGhlIHRyYWRlb2ZmcyBpbiB0aGUgZGV0YWlsZWQgY29tbWVudHMuZGQCAQ9kFgJmDxUBGVZlcnkgZ29vZDogc3VnZ2VzdCBhY2NlcHRkAgIPZBYEZg8PFgIfAQVMVGVjaG5pY2FsIHF1YWxpdHk6IGlzIGFsbCBpbmNsdWRlZCBtYXRlcmlhbCBwcmVzZW50ZWQgY2xlYXJseSBhbmQgY29ycmVjdGx5P2RkAgEPZBYCZg8VAQlWZXJ5IGd
vb2RkAgMPZBYEZg8PFgIfAQVnT3JpZ2luYWxpdHk6IGhvdyBtdWNoIG5ldyB3b3JrIGlzIHJlcHJlc2VudGVkIGluIHRoaXMgcGFwZXIsIGJleW9uZCBwcmV2aW91cyBjb25mZXJlbmNlL2pvdXJuYWwgcGFwZXJzP2RkAgEPZBYCZg8VARhTdWJzdGFudGlhbCBuZXcgbWF0ZXJpYWxkAgQPZBYEZg8PFgIfAQWDAUludGVyZXN0IGFuZCBzaWduaWZpY2FuY2U6IHdvdWxkIHRoZSBwYXBlcidzIGdvYWwsIGlmIGNvbXBsZXRlbHkgc29sdmVkLCByZXByZXNlbnQgYSBzdWJzdGFudGlhbCBhZHZhbmNlIGZvciB0aGUgQUlTVEFUUyBjb21tdW5pdHk/ZGQCAQ9kFgJmDxUBC1NpZ25pZmljYW50ZAIFD2QWBGYPDxYCHwEFdVRob3JvdWdobmVzczogdG8gd2hhdCBkZWdyZWUgZG9lcyB0aGUgcGFwZXIgc3VwcG9ydCBpdHMgY29uY2x1c2lvbnMgdGhyb3VnaCBleHBlcmltZW50YWwgY29tcGFyaXNvbnMsIHRoZW9yZW1zLCBldGMuP2RkAgEPZBYCZg8VAQhUaG9yb3VnaGQCBg9kFgRmDw8WAh8BBX1DcmVhdGl2aXR5OiB0byB3aGF0IGRlZ3JlZSBkb2VzIHRoZSBwYXBlciByZXByZXNlbnQgYSBub3ZlbCB3YXkgb2Ygc2V0dGluZyB1cCBhIHByb2JsZW0gb3IgYW4gdW51c3VhbCBhcHByb2FjaCB0byBzb2x2aW5nIGl0P2RkAgEPZBYCZg8VAShNb3N0IGNvbnRlbnQgcmVwcmVzZW50cyBub3ZlbCBhcHByb2FjaGVzZAIHD2QWBGYPDxYCHwEFEURldGFpbGVkIENvbW1lbnRzZGQCAQ9kFgJmDxUB3xVUaGlzIHBhcGVyIGNsYWltcyB0aGF0IHVzaW5nIG91dC1vZi1kaXN0cmlidXRpb24gZXhhbXBsZXMgY2FuIGJlIG1vcmUgaGVscGZ1bCBpbiB0cmFpbmluZyBkZWVwIGFyY2hpdGVjdHVyZXMgdGhhbiBzaGFsbG93IGFyY2hpdGVjdHVyZXMuIEluIG9yZGVyIHRvIHRlc3QgdGhpcyBoeXBvdGhlc2lzLCB0aGUgcGFwZXIgZGV2ZWxvcHMgZXh0ZW5zaXZlIHRyYW5zZm9ybWF0aW9ucyBmb3IgaW1hZ2UgcGF0Y2hlcyAoaS5lLiwgaW1hZ2VzIG9mIGhhbmR3cml0dGVuIGNoYXJhY3RlcnMpIHRvIGdlbmVyYXRlIGEgbGFyZ2Utc2NhbGUgZGF0YXNldCBvZiBwZXJ0dXJiZWQgaW1hZ2VzLiBUaGVzZSBvdXQtb2YtZGlzdHJpYnV0aW9uIGV4YW1wbGVzIGFyZSB0cmFpbmVkIHVzaW5nIE1MUHMgYW5kIHN0YWNrZWQgZGVub2lzaW5nIGF1dG8tZW5jb2RlcnMgKFNEQXMpLiBJbiB0aGUgZXhwZXJpbWVudHMsIHRoZSBwYXBlciBzaG93cyB0aGF0IFNEQXMgb3V0cGVyZm9ybSBNTFBzLCBhY2hpZXZpbmcgaHVtYW4tbGV2ZWwgcGVyZm9ybWFuY2UgZm9yIE5JU1QgZGF0YXNldC4gVGhlIHBhcGVyIGFsc28gcHJvdmlkZXMgdHdvIGludGVyZXN0aW5nIGV4cGVyaW1lbnRzIHNob3dpbmcgdGhhdDogKDEpIFNEQXMgY2FuIGJlbmVmaXQgZnJvbSB0cmFpbmluZyBwZXJ0dXJiZWQgZGF0YSwgZXZlbiB3aGVuIHRlc3Rpbmcgb24gY2xlYW4gZGF0YTsgKDIpIFNEQXMgY2FuIHNpZ25pZmljYW50bHkgYmVuZWZpdCBmcm9tIG11bHRpLXRhc2sgbGVhcm5pbmcuDTxiciAvPg08YnIgLz4NPGJ
yIC8+UXVlc3Rpb25zLCBjb21tZW50cywgYW5kIHN1Z2dlc3Rpb25zOg08YnIgLz4xLiBSZWdhcmRpbmcgdGhlIGh1bWFuIGxhYmVsaW5nLCBJIGhhdmUgc29tZSBjb25jZXJucyBhYm91dCBsYWJlbGluZyBub2lzZS9iaWFzZXMgZHVlIHRvIEFNVC4gSG93IHdlcmUgdGhlIGFub21hbGllcyBpbiBsYWJlbGluZyBvciBvdXRsaWVycyBjb250cm9sbGVkPyBXYXMgdGhlcmUgYW55IHByb2NlZHVyZSB0byBtaW5pbWl6ZSBsYWJlbGluZyBub2lzZS9iaWFzZXMgb3IgdG8gZW5zdXJlIHRoYXQgaHVtYW4gbGFiZWxlcnMgdHJpZWQgdGhlaXIgYmVzdCAoZS5nLiwgZmlsdGVyaW5nIG91dCByYW5kb20gZ3Vlc3NlcyBvciBlbmNvdXJhZ2luZyB0aGUgbGFiZWxlcnMgdG8gY29uc2lkZXIgYWxsIHBvc3NpYmlsaXRpZXMgY2FyZWZ1bGx5IGJlZm9yZSBwcm92aWRpbmcgcHJlbWF0dXJlIGd1ZXNzZXMpPyBGb3IgZXhhbXBsZSwgbXVsdGktc3RhZ2UgcXVlc3Rpb25uYWlyZXMgKGUuZy4sIGFza2luZyAiY2hhcmFjdGVycy9kaWdpdHMiLCAidXBwZXJjYXNlL2xvd2VyY2FzZSIsIHRoZW4gY2hvb3Npbmcgb25lIG91dCBvZiAxMCBkaWdpdHMsIG9yIDI2IGNoYXJhY3RlcnMpIG1pZ2h0IHNpZ25pZmljYW50bHkgcmVkdWNlIGxhYmVsaW5nIG5vaXNlL2JpYXNlcywgcmF0aGVyIHRoYW4gc2hvd2luZyA2MiBjYW5kaWRhdGUgYW5zd2VycyBzaW11bHRhbmVvdXNseS4NPGJyIC8+DTxiciAvPjIuIEl0IHNlZW1zIHRoYXQgdGhlIHBhcGVyIGZpeGVkIHRoZSBudW1iZXIgb2YgaGlkZGVuIGxheWVycyBhcyB0aHJlZS4gRGVzcGl0ZSBnb29kIHBlcmZvcm1hbmNlIG9mIHRoZSBwcm9wb3NlZCBhcmNoaXRlY3R1cmUsIGl0IGlzIHNvbWV3aGF0IHVuY2xlYXIgd2hldGhlciB0aGUgYmVuZWZpdCBjb21lcyBtYWlubHkgZnJvbSBkZWVwIGFyY2hpdGVjdHVyZSBvciB0aGUgdXNlIG9mIGRlbm9pc2luZyBhdXRvLWVuY29kZXJzLg08YnIgLz4NPGJyIC8+VGhlcmVmb3JlLCBpdCB3aWxsIGJlIG1vcmUgaW50ZXJlc3RpbmcgdG8gc2VlIHRoZSBlZmZlY3Qgb2YgdGhlIG51bWJlciBvZiBsYXllcnMgYW5kIG90aGVyIHByZS10cmFpbmluZyBtZXRob2RzIChlLmcuLCBSQk1zIG9yIGF1dG8tZW5jb2RlcnMpLiBUaGlzIGV4cGVyaW1lbnQgd2lsbCBjbGFyaWZ5IHdoZXJlIHRoZSBiZW5lZml0IGNvbWVzIGZyb20gKGkuZS4sIGRlZXAgYXJjaGl0ZWN0dXJlIHZzLiBwcmUtdHJhaW5pbmcgbW9kdWxlcykgYW5kIHByb3ZpZGUgbW9yZSBpbnNpZ2h0cyBhYm91dCB0aGUgcmVzdWx0cy4NPGJyIC8+DTxiciAvPjMuIFRoZSBwYXBlciBicmllZmx5IG1lbnRpb25lZCBhYm91dCB0aGUgdXNlIG9mIGxpYlNWTSwgYnV0IGl0IHdpbGwgYmUgdXNlZnVsIHRvIGNvbXBhcmUgYWdhaW5zdCB0aGUgcmVzdWx0cyB1c2luZyBvbmxpbmUgU1ZNIChlLmcuLCBQRUdBU09TKS4NPGJyIC8+DTxiciAvPjQuIFRoZSBwYXBlciBhbHNvIHRhbGtzIGFib3V0IHRoZSBlZmZlY3Qgb2YgbGFyZ2UgbGFiZWx
lZCBkYXRhIGluIHNlbGYtdGF1Z2h0IGxlYXJuaW5nIHNldHRpbmcuIFRvIHN0cmVuZ3RoZW4gdGhlIGNsYWltLCBpdCB3aWxsIGJlIGhlbHBmdWwgdG8gc2hvdyB0aGUgdGVzdCBhY2N1cmFjeSBhcyBhIGZ1bmN0aW9uIG9mIG51bWJlciBvZiBsYWJlbGVkIGV4YW1wbGVzLg08YnIgLz4NPGJyIC8+T3ZlcmFsbCwgdGhlIHBhcGVyIGlzIGNsZWFybHkgd3JpdHRlbiwgYW5kIGl0IHByb3ZpZGVzIGludGVyZXN0aW5nIGV4cGVyaW1lbnRzIG9uIGxhcmdlIHNjYWxlIGRhdGFzZXRzLCBhZGRyZXNzaW5nIGEgbnVtYmVyIG9mIGludGVyZXN0aW5nIHF1ZXN0aW9ucyByZWxhdGVkIHRvIGRlZXAgbGVhcm5pbmcgYW5kIG11bHRpLXRhc2sgbGVhcm5pbmcuIEZ1cnRoZXJtb3JlLCB0aGlzIHdvcmsgY2FuIHByb3ZpZGUgYSBuZXcgbGFyZ2Ugc2NhbGUgYmVuY2htYXJrIGRhdGFzZXQgKGJleW9uZCBNTklTVCkgZm9yIGRlZXAgbGVhcm5pbmcgYW5kIG1hY2hpbmUgbGVhcm5pbmcgcmVzZWFyY2guDTxiciAvPmQCCA8PFgIfAGhkZAIIDxUBAGQYAgUfY3RsMDAkY3BoJGd2UmV2aWV3cyRjdGwwMSRjdGwwMA88KwAKAQgCAWQFH2N0bDAwJGNwaCRndlJldmlld3MkY3RsMDAkY3RsMDAPPCsACgEIAgFkh0ly6l5rRpe9mdRnffXYAZKa1+8=">
+</div>
+
+<table id="header">
+<tbody><tr>
+<td><img src="./ReviewsAISTATS_files/conferencelogo.gif"></td>
+<td width="100%"><a href="http://www.aistats.org/">AI &amp; Statistics 2011 </a><br><b>Fourteenth International Conference on Artificial Intelligence and Statistics </b><br>April 11-13, 2011<br>Ft. Lauderdale, FL<br>USA</td>
+</tr>
+</tbody></table>
+<table id="content"><tbody><tr><td class="contentBorder">&nbsp;</td><td class="contentContainer">
+<span id="ctl00_cph_Label4" style="font-size:Small;font-weight:bold;">Reviews For Paper</span>
+<span id="ctl00_cph_lblErrorMessage" class="error" style="font-size:Small;"></span>
+<div id="ctl00_cph_pnlReviews">
+	
+    <span style="font-size:Small;">
+<table class="nicetable2" style="text-align:left; width: 100%;">
+    
+    <tbody><tr>
+        <td width="100px"><b>Paper ID</b></td>
+        <td><span id="ctl00_cph_infoSubmission_lblPaperId" style="font-size:Small;">126</span></td>
+    </tr>
+    <tr>
+        <td><b>Title</b></td>
+        <td><span id="ctl00_cph_infoSubmission_lblPaperTitle" style="font-size:Small;">Deep Learners Benefit More from Out-of-Distribution Examples</span></td>
+    </tr>
+    
+    
+    
+    
+    
+</tbody></table></span>
+    
+    
+            <hr>
+            <table>
+                <tbody><tr>
+                    <td>
+                        <span id="ctl00_cph_gvReviews_ctl00_Label2" style="font-size:Small;font-weight:bold;">Masked Reviewer ID:</span>
+                    </td>
+                    <td>
+                        <span id="ctl00_cph_gvReviews_ctl00_Label1" style="font-size:Small;">Assigned_Reviewer_2</span>
+                    </td>
+                </tr>
+                <tr>
+                    <td>
+                        <span id="ctl00_cph_gvReviews_ctl00_Label3" style="font-size:Small;font-weight:bold;">Review:</span>
+                    </td>
+                    <td>
+                    </td>
+                </tr>
+            </tbody></table>
+            <div>
+		<table cellspacing="0" cellpadding="4" rules="all" border="1" style="color:#333333;border-width:1px;border-style:None;font-family:Verdana;font-size:Small;border-collapse:collapse;">
+			<tbody><tr style="color:White;background-color:#5D7B9D;font-weight:bold;">
+				<th scope="col">Question</th><th scope="col">&nbsp;</th>
+			</tr><tr style="color:#333333;background-color:#F7F6F3;">
+				<td style="width:20%;">Overall rating: please synthesize your answers to other questions into an overall recommendation.  Please take into account tradeoffs (an increase in one measure may compensate for a decrease in another), and describe the tradeoffs in the detailed comments.</td><td style="width:80%;">
+                            Good: suggest accept
+                        </td>
+			</tr><tr style="color:#284775;background-color:White;">
+				<td style="width:20%;">Technical quality: is all included material presented clearly and correctly?</td><td style="width:80%;">
+                            Good
+                        </td>
+			</tr><tr style="color:#333333;background-color:#F7F6F3;">
+				<td style="width:20%;">Originality: how much new work is represented in this paper, beyond previous conference/journal papers?</td><td style="width:80%;">
+                            Substantial new material
+                        </td>
+			</tr><tr style="color:#284775;background-color:White;">
+				<td style="width:20%;">Interest and significance: would the paper's goal, if completely solved, represent a substantial advance for the AISTATS community?</td><td style="width:80%;">
+                            Significant
+                        </td>
+			</tr><tr style="color:#333333;background-color:#F7F6F3;">
+				<td style="width:20%;">Thoroughness: to what degree does the paper support its conclusions through experimental comparisons, theorems, etc.?</td><td style="width:80%;">
+                            Thorough
+                        </td>
+			</tr><tr style="color:#284775;background-color:White;">
+				<td style="width:20%;">Creativity: to what degree does the paper represent a novel way of setting up a problem or an unusual approach to solving it?</td><td style="width:80%;">
+                            Most content represents application of known ideas
+                        </td>
+			</tr><tr style="color:#333333;background-color:#F7F6F3;">
+				<td style="width:20%;">Detailed Comments</td><td style="width:80%;">
+                            This paper shows that deep networks benefit more from out-of-distribution examples than shallower architectures on a large scale character recognition experiment. A thorough empirical validation shows that deep nets produce better discrimination (than shallower nets) when trained with distorted characters and when trained on multiple tasks. 
+<br>Although the methods used are already well established in the community, these results are significant and provide new insights on the representational power of this class of methods.
+<br>
+<br>Suggestions:
+<br>- it would be interesting to compare the deep architecture and the shallow architecture for a given capacity of the model (i.e. use wider shallow net)
+<br>- since the authors use denoising autoencoders to pre-train deep networks, they could consider to use distorted characters as noisy inputs instead of artificially set to 0 some inputs. This might help learning more representations that are more robust to distortions that are actually useful for discrimination.
+                        </td>
+			</tr>
+		</tbody></table>
+	</div>
+            
+        
+            <hr>
+            <table>
+                <tbody><tr>
+                    <td>
+                        <span id="ctl00_cph_gvReviews_ctl01_Label2" style="font-size:Small;font-weight:bold;">Masked Reviewer ID:</span>
+                    </td>
+                    <td>
+                        <span id="ctl00_cph_gvReviews_ctl01_Label1" style="font-size:Small;">Assigned_Reviewer_3</span>
+                    </td>
+                </tr>
+                <tr>
+                    <td>
+                        <span id="ctl00_cph_gvReviews_ctl01_Label3" style="font-size:Small;font-weight:bold;">Review:</span>
+                    </td>
+                    <td>
+                    </td>
+                </tr>
+            </tbody></table>
+            <div>
+		<table cellspacing="0" cellpadding="4" rules="all" border="1" style="color:#333333;border-width:1px;border-style:None;font-family:Verdana;font-size:Small;border-collapse:collapse;">
+			<tbody><tr style="color:White;background-color:#5D7B9D;font-weight:bold;">
+				<th scope="col">Question</th><th scope="col">&nbsp;</th>
+			</tr><tr style="color:#333333;background-color:#F7F6F3;">
+				<td style="width:20%;">Overall rating: please synthesize your answers to other questions into an overall recommendation.  Please take into account tradeoffs (an increase in one measure may compensate for a decrease in another), and describe the tradeoffs in the detailed comments.</td><td style="width:80%;">
+                            Very good: suggest accept
+                        </td>
+			</tr><tr style="color:#284775;background-color:White;">
+				<td style="width:20%;">Technical quality: is all included material presented clearly and correctly?</td><td style="width:80%;">
+                            Very good
+                        </td>
+			</tr><tr style="color:#333333;background-color:#F7F6F3;">
+				<td style="width:20%;">Originality: how much new work is represented in this paper, beyond previous conference/journal papers?</td><td style="width:80%;">
+                            Substantial new material
+                        </td>
+			</tr><tr style="color:#284775;background-color:White;">
+				<td style="width:20%;">Interest and significance: would the paper's goal, if completely solved, represent a substantial advance for the AISTATS community?</td><td style="width:80%;">
+                            Significant
+                        </td>
+			</tr><tr style="color:#333333;background-color:#F7F6F3;">
+				<td style="width:20%;">Thoroughness: to what degree does the paper support its conclusions through experimental comparisons, theorems, etc.?</td><td style="width:80%;">
+                            Thorough
+                        </td>
+			</tr><tr style="color:#284775;background-color:White;">
+				<td style="width:20%;">Creativity: to what degree does the paper represent a novel way of setting up a problem or an unusual approach to solving it?</td><td style="width:80%;">
+                            Most content represents novel approaches
+                        </td>
+			</tr><tr style="color:#333333;background-color:#F7F6F3;">
+				<td style="width:20%;">Detailed Comments</td><td style="width:80%;">
+                            This paper claims that using out-of-distribution examples can be more helpful in training deep architectures than shallow architectures. In order to test this hypothesis, the paper develops extensive transformations for image patches (i.e., images of handwritten characters) to generate a large-scale dataset of perturbed images. These out-of-distribution examples are trained using MLPs and stacked denoising auto-encoders (SDAs). In the experiments, the paper shows that SDAs outperform MLPs, achieving human-level performance for NIST dataset. The paper also provides two interesting experiments showing that: (1) SDAs can benefit from training perturbed data, even when testing on clean data; (2) SDAs can significantly benefit from multi-task learning.
+<br>
+<br>
+<br>Questions, comments, and suggestions:
+<br>1. Regarding the human labeling, I have some concerns about labeling noise/biases due to AMT. How were the anomalies in labeling or outliers controlled? Was there any procedure to minimize labeling noise/biases or to ensure that human labelers tried their best (e.g., filtering out random guesses or encouraging the labelers to consider all possibilities carefully before providing premature guesses)? For example, multi-stage questionnaires (e.g., asking "characters/digits", "uppercase/lowercase", then choosing one out of 10 digits, or 26 characters) might significantly reduce labeling noise/biases, rather than showing 62 candidate answers simultaneously.
+<br>
+<br>2. It seems that the paper fixed the number of hidden layers as three. Despite good performance of the proposed architecture, it is somewhat unclear whether the benefit comes mainly from deep architecture or the use of denoising auto-encoders.
+<br>
+<br>Therefore, it will be more interesting to see the effect of the number of layers and other pre-training methods (e.g., RBMs or auto-encoders). This experiment will clarify where the benefit comes from (i.e., deep architecture vs. pre-training modules) and provide more insights about the results.
+<br>
+<br>3. The paper briefly mentions the use of libSVM, but it would be useful to compare against results obtained with an online SVM (e.g., PEGASOS).
+<br>
+<br>4. The paper also talks about the effect of large labeled data in self-taught learning setting. To strengthen the claim, it will be helpful to show the test accuracy as a function of number of labeled examples.
+<br>
+<br>Overall, the paper is clearly written, and it provides interesting experiments on large scale datasets, addressing a number of interesting questions related to deep learning and multi-task learning. Furthermore, this work can provide a new large scale benchmark dataset (beyond MNIST) for deep learning and machine learning research.
+<br>
+                        </td>
+			</tr>
+		</tbody></table>
+	</div>
+            
+        
+    <br>
+    <br>
+
+</div>
+</td><td class="contentBorder">&nbsp;</td></tr></tbody></table>
+</form>
+
+
+</body></html>
\ No newline at end of file
--- a/writeup/aigaion-shorter.bib	Sat Mar 19 22:49:33 2011 -0400
+++ b/writeup/aigaion-shorter.bib	Sat Mar 19 22:51:40 2011 -0400
@@ -1,5 +1,99 @@
 %Aigaion2 BibTeX export from LISA - Publications
-%Tuesday 01 June 2010 10:46:52 AM
+%Tuesday 02 November 2010 04:10:50 PM
+@MASTERSTHESIS{Breuleux-Msc-2010,
+    author = {Breuleux, Olivier},
+     title = {{\'{E}}chantillonnage dynamique de champs markoviens},
+      year = {2010},
+    school = {Universit{\'{e}} de Montr{\'{e}}al}
+}
+
+@PHDTHESIS{Rivest-PhD-2009,
+    author = {Rivest, Fran{\c c}ois},
+     title = {Mod{\`{e}}le informatique du coapprentissage des ganglions de la base et du cortex : L'apprentissage par renforcement et le d{\'{e}}veloppement de repr{\'{e}}sentations},
+      year = {2009},
+    school = {Universit{\'{e}} de Montr{\'{e}}al, D{\'{e}}partement d'informatique et de recherche op{\'{e}}rationnelle},
+  abstract = {English follows:
+
+Tout au long de la vie, le cerveau d{\'{e}}veloppe des repr{\'{e}}sentations de son
+environnement permettant {\`{a}} l’individu d’en tirer meilleur profit. Comment ces
+repr{\'{e}}sentations se d{\'{e}}veloppent-elles pendant la qu{\^{e}}te de r{\'{e}}compenses demeure un
+myst{\`{e}}re. Il est raisonnable de penser que le cortex est le si{\`{e}}ge de ces repr{\'{e}}sentations
+et que les ganglions de la base jouent un r{\^{o}}le important dans la maximisation des
+r{\'{e}}compenses. En particulier, les neurones dopaminergiques semblent coder un signal
+d’erreur de pr{\'{e}}diction de r{\'{e}}compense. Cette th{\`{e}}se {\'{e}}tudie le probl{\`{e}}me en construisant,
+{\`{a}} l’aide de l’apprentissage machine, un mod{\`{e}}le informatique int{\'{e}}grant de nombreuses
+{\'{e}}vidences neurologiques.
+        Apr{\`{e}}s une introduction au cadre math{\'{e}}matique et {\`{a}} quelques algorithmes de
+l’apprentissage machine, un survol de l’apprentissage en psychologie et en
+neuroscience et une revue des mod{\`{e}}les de l’apprentissage dans les ganglions de la
+base, la th{\`{e}}se comporte trois articles. Le premier montre qu’il est possible
+d’apprendre {\`{a}} maximiser ses r{\'{e}}compenses tout en d{\'{e}}veloppant de meilleures
+repr{\'{e}}sentations des entr{\'{e}}es. Le second article porte sur l'important probl{\`{e}}me toujours
+non r{\'{e}}solu de la repr{\'{e}}sentation du temps. Il d{\'{e}}montre qu’une repr{\'{e}}sentation du temps
+peut {\^{e}}tre acquise automatiquement dans un r{\'{e}}seau de neurones artificiels faisant
+office de m{\'{e}}moire de travail. La repr{\'{e}}sentation d{\'{e}}velopp{\'{e}}e par le mod{\`{e}}le ressemble
+beaucoup {\`{a}} l’activit{\'{e}} de neurones corticaux dans des t{\^{a}}ches similaires. De plus, le
+mod{\`{e}}le montre que l’utilisation du signal d’erreur de r{\'{e}}compense peut acc{\'{e}}l{\'{e}}rer la
+construction de ces repr{\'{e}}sentations temporelles. Finalement, il montre qu’une telle
+repr{\'{e}}sentation acquise automatiquement dans le cortex peut fournir l’information
+n{\'{e}}cessaire aux ganglions de la base pour expliquer le signal dopaminergique. Enfin,
+le troisi{\`{e}}me article {\'{e}}value le pouvoir explicatif et pr{\'{e}}dictif du mod{\`{e}}le sur diff{\'{e}}rentes
+situations comme la pr{\'{e}}sence ou l’absence d’un stimulus (conditionnement classique
+ou de trace) pendant l’attente de la r{\'{e}}compense. En plus de faire des pr{\'{e}}dictions tr{\`{e}}s
+int{\'{e}}ressantes en lien avec la litt{\'{e}}rature sur les intervalles de temps, l’article r{\'{e}}v{\`{e}}le
+certaines lacunes du mod{\`{e}}le qui devront {\^{e}}tre am{\'{e}}lior{\'{e}}es.
+       Bref, cette th{\`{e}}se {\'{e}}tend les mod{\`{e}}les actuels de l’apprentissage des ganglions de
+la base et du syst{\`{e}}me dopaminergique au d{\'{e}}veloppement concurrent de
+repr{\'{e}}sentations temporelles dans le cortex et aux interactions de ces deux structures.
+
+        Throughout lifetime, the brain develops abstract representations of its
+environment that allow the individual to maximize his benefits. How these
+representations are developed while trying to acquire rewards remains a mystery. It is
+reasonable to assume that these representations arise in the cortex and that the basal
+ganglia are playing an important role in reward maximization. In particular,
+dopaminergic neurons appear to code a reward prediction error signal. This thesis
+studies the problem by constructing, using machine learning tools, a computational
+model that incorporates a number of relevant neurophysiological findings.
+        After an introduction to the machine learning framework and to some of its
+algorithms, an overview of learning in psychology and neuroscience, and a review of
+models of learning in the basal ganglia, the thesis comprises three papers. The first
+article shows that it is possible to learn a better representation of the inputs while
+learning to maximize reward. The second paper addresses the important and still
+unresolved problem of the representation of time in the brain. The paper shows that a
+time representation can be acquired automatically in an artificial neural network
+acting like a working memory. The representation learned by the model closely
+resembles the activity of cortical neurons in similar tasks. Moreover, the model shows
+that the reward prediction error signal could accelerate the development of the
+temporal representation. Finally, it shows that if such a learned representation exists
+in the cortex, it could provide the necessary information to the basal ganglia to
+explain the dopaminergic signal. The third article evaluates the explanatory and
+predictive power of the model on the effects of differences in task conditions such as
+the presence or absence of a stimulus (classical versus trace conditioning) while
+waiting for the reward. Beyond making interesting predictions relevant to the timing
+literature, the paper reveals some shortcomings of the model that will need to be
+resolved.
+       In summary, this thesis extends current models of reinforcement learning of
+the basal ganglia and the dopaminergic system to the concurrent development of
+representation in the cortex and to the interactions between these two regions.}
+}
+
+@MASTERSTHESIS{Wood-Msc-2010,
+    author = {Wood, Sean},
+     title = {Non-negative matrix decomposition approaches to frequency domain analysis of music audio signals},
+      year = {2010},
+    school = {Universit{\'{e}} de Montr{\'{e}}al}
+}
+
+@TECHREPORT{ARXIV-2010,
+       author = {Bastien, Fr{\'{e}}d{\'{e}}ric and Bengio, Yoshua and Bergeron, Arnaud and Boulanger-Lewandowski, Nicolas and Breuel, Thomas and Chherawala, Youssouf and Cisse, Moustapha and C{\^{o}}t{\'{e}}, Myriam and Erhan, Dumitru and Eustache, Jeremy and Glorot, Xavier and Muller, Xavier and Pannetier Lebeuf, Sylvain and Pascanu, Razvan and Rifai, Salah and Savard, Fran{\c c}ois and Sicard, Guillaume},
+     keywords = {Computer Vision and Pattern Recognition, Learning, Neural and Evolutionary Computing},
+        title = {Deep Self-Taught Learning for Handwritten Character Recognition},
+       number = {1353},
+         year = {2010},
+  institution = {University of Montr{\'{e}}al},
+     abstract = {Recent theoretical and empirical work in statistical machine learning has demonstrated the importance of learning algorithms for deep architectures, i.e., function classes obtained by composing multiple non-linear transformations. Self-taught learning (exploiting unlabeled examples or examples from other distributions) has already been applied to deep learners, but mostly to show the advantage of unlabeled examples. Here we explore the advantage brought by {\em out-of-distribution examples}. For this purpose we developed a powerful generator of stochastic variations and noise processes for character images, including not only affine transformations but also slant, local elastic deformations, changes in thickness, background images, grey level changes, contrast, occlusion, and various types of noise. The out-of-distribution examples are obtained from these highly distorted images or by including examples of object classes different from those in the target test set. We show that {\em deep learners benefit more from out-of-distribution examples than a corresponding shallow learner}, at least in the area of handwritten character recognition. In fact, we show that they beat previously published results and reach human-level performance on both handwritten digit classification and 62-class handwritten character recognition.}
+}
+
 @INPROCEEDINGS{Attardi+al-2009,
      author = {Attardi, Giuseppe and Dell'Orletta, Felice and Simi, Maria and Turian, Joseph},
    keywords = {classifier, dependency parsing, natural language, parser, perceptron},
@@ -409,7 +503,7 @@
     volume = {5},
       year = {2004},
      pages = {1089--1105},
-  journal = {Journal of Machine Learning Research},
+  crossref = {JMLR-shorter},
   abstract = {Most machine learning researchers perform quantitative experiments to estimate generalization error and compare the performance of different algorithms (in particular, their proposed algorithm). In order to be able to draw statistically convincing conclusions, it is important to estimate the uncertainty of such estimates. This paper studies the very commonly used K-fold cross-validation estimator of generalization performance. The main theorem shows that there exists no universal (valid under all distributions) unbiased estimator of the variance of K-fold cross-validation. The analysis that accompanies this result is based on the eigen-decomposition of the covariance matrix of errors, which has only three different eigenvalues corresponding to three degrees of freedom of the matrix and three components of the total variance. This analysis helps to better understand the nature of the problem and how it can make naive estimators (that don’t take into account the error correlations due to the overlap between training and test sets) grossly underestimate variance. This is confirmed by numerical experiments in which the three components of the variance are compared when the difficulty of the learning problem and the number of folds are varied.},
 topics={Comparative},cat={J},
 }
@@ -1089,7 +1183,7 @@
     volume = {3},
       year = {2003},
      pages = {1137--1155},
-  journal = {Journal of Machine Learning Research},
+  crossref = {JMLR-shorter},
   abstract = {A goal of statistical language modeling is to learn the joint probability function of sequences of words in a language. This is intrinsically difficult because of the curse of dimensionality: a word sequence on which the model will be tested is likely to be different from all the word sequences seen during training. Traditional but very successful approaches based on n-grams obtain generalization by concatenating very short overlapping sequences seen in the training set. We propose to fight the curse of dimensionality by learning a distributed representation for words which allows each training sentence to inform the model about an exponential number of semantically neighboring sentences. The model learns simultaneously (1) a distributed representation for each word along with (2) the probability function for word sequences, expressed in terms of these representations. Generalization is obtained because a sequence of words that has never been seen before gets high probability if it is made of words that are similar (in the sense of having a nearby representation) to words forming an already seen sentence. Training such large models (with millions of parameters) within a reasonable time is itself a significant challenge. We report on experiments using neural networks for the probability function, showing on two text corpora that the proposed approach significantly improves on state-of-the-art n-gram models, and that the proposed approach allows to take advantage of longer contexts.},
 topics={Markov,Unsupervised,Language},cat={J},
 }
@@ -1232,12 +1326,20 @@
     The advantage of quadratic units was strongest in conjunction with sparse and convolutional hidden units.}
 }
 
-@MISC{bergstra+al:2010-scipy,
-        author = {Bergstra, James},
-         title = {Optimized Symbolic Expressions and {GPU} Metaprogramming with Theano},
-          year = {2010},
-  howpublished = {{SciPy}},
-          note = {Oral}
+@ARTICLE{Bergstra+al-2010,
+    author = {Bergstra, James and Bengio, Yoshua and Louradour, Jerome},
+     title = {Suitability of V1 Energy Models for Object Classification},
+   journal = {Neural Computation},
+      year = {2010},
+      note = {to appear}
+}
+
+@INPROCEEDINGS{bergstra+al:2010-scipy,
+     author = {Bergstra, James and Breuleux, Olivier and Bastien, Fr{\'{e}}d{\'{e}}ric and Lamblin, Pascal and Pascanu, Razvan and Desjardins, Guillaume and Turian, Joseph and Bengio, Yoshua},
+      title = {Theano: a {CPU} and {GPU} Math Expression Compiler},
+  booktitle = {Proceedings of the Python for Scientific Computing Conference ({SciPy})},
+       year = {2010},
+       note = {Oral}
 }
 
 @MISC{bergstra+al:2010-sharcnet,
@@ -1257,10 +1359,13 @@
 }
 
 @INPROCEEDINGS{Bergstra+Bengio-2009,
-    author = {Bergstra, James and Bengio, Yoshua},
-     title = {Slow, Decorrelated Features for Pretraining Complex Cell-like Networks},
-      year = {2009},
-  crossref = {NIPS22}
+     author = {Bergstra, James and Bengio, Yoshua},
+      title = {Slow, Decorrelated Features for Pretraining Complex Cell-like Networks},
+       year = {2009},
+      pages = {99--107},
+  publisher = {MIT Press},
+        url = {http://books.nips.cc/papers/files/nips22/NIPS2009_0933.pdf},
+   crossref = {NIPS22}
 }
 
 @ARTICLE{bergstra+casagrande+erhan+eck+kegl:2006,
@@ -1279,8 +1384,10 @@
 @INPROCEEDINGS{bergstra+lacoste+eck:2006,
      author = {Bergstra, James and Lacoste, Alexandre and Eck, Douglas},
       title = {Predicting Genre Labels for Artists using FreeDB},
-  booktitle = {Proc. 7th International Conference on Music Information Retrieval (ISMIR)},
+  booktitle = {Proc. 7th International Conference on Music Information Retrieval ({ISMIR})},
        year = {2006},
+      pages = {85--88},
+  publisher = {University of Victoria},
 SOURCE = {OwnPublication},
   PDF = {papers/2006_ismir_freedb.pdf},
 }
@@ -1290,7 +1397,7 @@
       title = {Scalable Genre and Tag Prediction with Spectral Covariance},
   booktitle = {{ISMIR}},
        year = {2010},
-       note = {accepted}
+      pages = {507--512},
 }
 
 @MASTERSTHESIS{Bergstra-Msc-2006,
@@ -1391,7 +1498,7 @@
        year = {1997},
       pages = {490--494},
   publisher = {IEEE},
-        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bottou-lecun-bengio-97.ps.gz},
+        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bottou-lecun-bengio-97.pdf},
 topics={PriorKnowledge,Speech},cat={C},
 }
 
@@ -1431,6 +1538,15 @@
     school = {Universit{\'{e}} de Montr{\'{e}}al, D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnel}
 }
 
+@TECHREPORT{Breuleux+al-TR-2010,
+       author = {Breuleux, Olivier and Bengio, Yoshua and Vincent, Pascal},
+        title = {Unlearning for Better Mixing},
+       number = {1349},
+         year = {2010},
+  institution = {Universit{\'{e}} de Montr{\'{e}}al/DIRO},
+     abstract = {Two learning algorithms were recently proposed – Herding and Fast Persistent Contrastive Divergence (FPCD) – which share the following interesting characteristic: they exploit changes in the model parameters while sampling in order to escape modes and mix better, during the sampling process that is part of the learning algorithm. We first justify such approaches as ways to escape modes while approximately keeping the same asymptotic distribution of the {Markov} chain. We then extend FPCD using an idea borrowed from Herding in order to obtain a pure sampling algorithm and show empirically that this FPCD-sampler yields substantially better samples than Gibbs sampling. Because these algorithms entangle the model and the sampling algorithm and we want to evaluate both (but particularly how well the sampling schemes mix), it is not always easy to evaluate them, so we propose a “black-box” approach based on how well and how quickly the samples generated by a model “cover” the test set examples. We empirically study these algorithms and variations with this perspective and these new evaluation tools in order to better understand their strengths and limitations.}
+}
+
 @INPROCEEDINGS{Carreau+Bengio-2007,
      author = {Carreau, Julie and Bengio, Yoshua},
       title = {A Hybrid {Pareto} Model for Conditional Density Estimation of Asymmetric Fat-Tail Data},
@@ -1444,7 +1560,7 @@
 
 @ARTICLE{Carreau+Bengio-2009,
     author = {Carreau, Julie and Bengio, Yoshua},
-     title = {A Hybrid {Pareto} Mixture for Conditional Asymmetric Fat-Tailed Distributio\ n},
+     title = {A Hybrid {Pareto} Mixture for Conditional Asymmetric Fat-Tailed Distribution},
    journal = {IEEE Transactions on Neural Networks},
     volume = {20},
     number = {7},
@@ -1569,7 +1685,7 @@
     author = {Bengio, Yoshua and Chapados, Nicolas},
      title = {Extensions to Metric-Based Model Selection},
       year = {2003},
-  journal = {Journal of Machine Learning Research},
+  crossref = {JMLR-shorter},
   abstract = {Metric-based methods have recently been introduced for model selection and regularization, often yielding very significant improvements over the alternatives tried (including cross-validation). All these methods require unlabeled data over which to compare functions and detect gross differences in behavior away from the training points. We introduce three new extensions of the metric model selection methods and apply them to feature selection. The first extension takes advantage of the particular case of time-series data in which the task involves prediction with a horizon h. The idea is to use at t the h unlabeled examples that precede t for model selection. The second extension takes advantage of the different error distributions of cross-validation and the metric methods: cross-validation tends to have a larger variance and is unbiased. A hybrid combining the two model selection methods is rarely beaten by any of the two methods. The third extension deals with the case when unlabeled data is not available at all, using an estimated input density. Experiments are described to study these extensions in the context of capacity control and feature subset selection.},
 topics={ModelSelection,Finance},cat={J},
 }
@@ -1789,10 +1905,10 @@
 @INPROCEEDINGS{Desjardins+al-2010,
      author = {Desjardins, Guillaume and Courville, Aaron and Bengio, Yoshua},
       title = {Tempered {Markov} Chain Monte Carlo for training of Restricted {Boltzmann} Machine},
-  booktitle = {Proceedings of AISTATS 2010},
+  booktitle = {JMLR W\&CP: Proceedings of the Thirteenth International Conference on Artificial Intelligence and Statistics (AISTATS 2010)},
      volume = {9},
        year = {2010},
-      pages = {145-152},
+      pages = {145--152},
    abstract = {Alternating Gibbs sampling is the most common scheme used for sampling from Restricted {Boltzmann} Machines (RBM), a crucial component in deep architectures such as Deep Belief Networks. However, we find that it often does a very poor job of rendering the diversity of modes captured by the trained model. We suspect that this hinders the advantage that could in principle be brought by training algorithms relying on Gibbs sampling for uncovering spurious modes, such as the Persistent Contrastive Divergence algorithm. To alleviate this problem, we explore the use of tempered {Markov} Chain Monte-Carlo for sampling in RBMs. We find both through visualization of samples and measures of likelihood on a toy dataset that it helps both sampling and learning.}
 }
 
@@ -2255,7 +2371,7 @@
     volume = {11},
       year = {2010},
      pages = {625--660},
-  journal = {Journal of Machine Learning Research},
+  crossref = {JMLR-shorter},
   abstract = {Much recent research has been devoted to learning algorithms for deep architectures such as Deep Belief Networks and stacks of auto-encoder variants, with impressive results obtained in several areas, mostly on vision and language datasets. The best results obtained on supervised learning tasks involve an unsupervised learning component, usually in an unsupervised pre-training phase. Even though these new algorithms have enabled training deep models, many questions remain as to the nature of this difficult learning problem. The main question investigated here is the following: why does unsupervised pre-training work and why does it work so well? Answering these questions is important if learning in deep architectures is to be further improved. We propose several explanatory hypotheses and test them through extensive simulations. We empirically show the influence of pre-training with respect to architecture depth, model capacity, and number of training examples. The experiments confirm and clarify the advantage of unsupervised pre-training. The results suggest that unsupervised pre-training guides the learning towards basins of attraction of minima that are better in terms of the underlying data distribution; the evidence from these results supports a regularization explanation for the effect of pre-training.}
 }
 
@@ -2298,6 +2414,29 @@
 pharmaceutiques dans leur d{\'{e}}couverte de nouveaux m{\'{e}}dicaments.}
 }
 
+@TECHREPORT{Erhan-vis-techreport-2010,
+       author = {Erhan, Dumitru and Courville, Aaron and Bengio, Yoshua},
+        title = {Understanding Representations Learned in Deep Architectures},
+       number = {1355},
+         year = {2010},
+  institution = {Universit{\'{e}} de Montr{\'{e}}al/DIRO},
+     abstract = {Deep architectures have demonstrated state-of-the-art performance in a variety of
+settings, especially with vision datasets. Deep learning algorithms are based on learning
+several levels of representation of the input. Beyond test-set performance, there
+is a need for qualitative comparisons of the solutions learned by various deep architectures,
+focused on those learned representations. One of the goals of our research
+is to improve tools for finding good qualitative interpretations of high level features
+learned by such models. We also seek to gain insight into the invariances learned by
+deep networks. To this end, we contrast and compare several techniques for finding
+such interpretations. We applied our techniques on Stacked Denoising Auto-Encoders
+and Deep Belief Networks, trained on several vision datasets. We show that consistent
+filter-like interpretation is possible and simple to accomplish at the unit level. The tools
+developed make it possible to analyze deep models in more depth and accomplish the
+tracing of invariance manifolds for each of the hidden units. We hope that such techniques
+will allow researchers in deep architectures to understand more of how and why
+deep architectures work.}
+}
+
 @INPROCEEDINGS{Erhan2009,
     author = {Erhan, Dumitru and Manzagol, Pierre-Antoine and Bengio, Yoshua and Bengio, Samy and Vincent, Pascal},
   keywords = {Deep Networks},
@@ -2754,6 +2893,11 @@
            url = {http://snowbird.djvuzone.org/2007/abstracts/161.pdf}
 }
 
+@ARTICLE{JMLR-short,
+   journal = {JMLR},
+      year = {-1}
+}
+
 
 @INPROCEEDINGS{Kegl+Bertin+Eck-2008,
      author = {K{\'{e}}gl, Bal{\'{a}}zs and Bertin-Mahieux, Thierry and Eck, Douglas},
@@ -2833,8 +2977,10 @@
     author = {Larochelle, Hugo and Bengio, Yoshua and Turian, Joseph},
      title = {Tractable Multivariate Binary Density Estimation and the Restricted {Boltzmann} Forest},
    journal = {Neural Computation},
+    volume = {22},
+    number = {9},
       year = {2010},
-      note = {To appear}
+     pages = {2285--2307}
 }
 
 @INPROCEEDINGS{Larochelle+Bengio-2008,
@@ -2865,7 +3011,7 @@
     volume = {10},
       year = {2009},
      pages = {1--40},
-  journal = {Journal of Machine Learning Research},
+  crossref = {JMLR-shorter},
   abstract = {Deep multi-layer neural networks have many levels of non-linearities allowing them to compactly represent highly non-linear and highly-varying functions. However, until recently it was not clear how to train such deep networks, since gradient-based optimization starting from random initialization often appears to get stuck in poor solutions. Hinton et al. recently proposed a greedy layer-wise unsupervised learning procedure relying on the training algorithm of restricted {Boltzmann} machines (RBM) to initialize the parameters of a deep belief network (DBN), a generative model with many layers of hidden causal variables. This was followed by the proposal of another greedy layer-wise procedure, relying on the usage of autoassociator networks. In the context of the above optimization problem, we study these algorithms empirically to better understand their success. Our experiments confirm the hypothesis that the greedy layer-wise unsupervised training strategy helps the optimization by initializing weights in a region near a good local minimum, but also implicitly acts as a sort of regularization that brings better generalization and encourages internal distributed representations that are high-level abstractions of the input. We also present a series of experiments aimed at evaluating the link between the performance of deep neural networks and practical aspects of their topology, for example, demonstrating cases where the addition of more depth helps. Finally, we empirically explore simple variants of these training algorithms, such as the use of different RBM input unit distributions, a simple way of combining gradient estimators to improve performance, as well as on-line versions of those algorithms.}
 }
 
@@ -3029,8 +3175,12 @@
     author = {Le Roux, Nicolas and Bengio, Yoshua},
      title = {Deep Belief Networks are Compact Universal Approximators},
    journal = {Neural Computation},
+    volume = {22},
+    number = {8},
       year = {2010},
-      note = {To appear}
+     pages = {2192--2207},
+      issn = {0899-7667},
+  abstract = {Deep Belief Networks (DBN) are generative models with many layers of hidden causal variables, recently introduced by Hinton et al. (2006), along with a greedy layer-wise unsupervised learning algorithm. Building on Le Roux and Bengio (2008) and Sutskever and Hinton (2008), we show that deep but narrow generative networks do not require more parameters than shallow ones to achieve universal approximation. Exploiting the proof technique, we prove that deep but narrow feed-forward neural networks with sigmoidal units can represent any Boolean expression.}
 }
 
 @TECHREPORT{LeRoux-Bengio-2007-TR,
@@ -3986,7 +4136,7 @@
      title = {The Need for Open Source Software in Machine Learning.},
       year = {2007},
       note = {institution: Fraunhofer Publica [http://publica.fraunhofer.de/oai.har] (Germany)},
-  journal = {Journal of Machine Learning Research},
+  crossref = {JMLR-shorter},
   abstract = {all authors: Sonnenburg, S. and Braun, M.L. and Ong, C.S. and Bengio, S. and Bottou, L. and Holmes, G. and {LeCun}, Y. and M{\"{u}}ller, K.-R. and Pereira, F. and Rasmussen, C.E. and R{\"{a}}tsch, G. and Sch{\"{o}}lkopf, B. and Smola, A. and Vincent, P. and Weston, J. and Williamson, R.C.
 
 Open source tools have recently reached a level of maturity which makes them suitable for building large-scale real-world systems. At the same time, the field of machine learning has developed a large body of powerful learning algorithms for diverse applications. However, the true potential of these methods is not used, since existing implementations are not openly shared, resulting in software with low usability, and weak interoperability. We argue that this situation can be significantly improved by increasing incentives for researchers to publish their software under an open source model. Additionally, we outline the problems authors are faced with when trying to publish algorithmic implementations of machine learning methods. We believe that a resource of peer reviewed software accompanied by short articles would be highly valuable to both the machine learning and the general scientific community.}
@@ -4004,6 +4154,16 @@
 topics={Mining},cat={J},
 }
 
+@PHDTHESIS{ThesisChapados2010,
+    author = {Chapados, Nicolas},
+     title = {Sequential Machine learning Approaches for Portfolio Management},
+      year = {2010},
+    school = {Universit{\'{e}} de Montr{\'{e}}al},
+  abstract = {[English follow]
+Cette th{\`{e}}se envisage un ensemble de m{\'{e}}thodes permettant aux algorithmes d'apprentissage statistique de mieux traiter la nature s{\'{e}}quentielle des probl{\`{e}}mes de gestion de portefeuilles financiers. Nous d{\'{e}}butons par une consid{\'{e}}ration du probl{\`{e}}me g{\'{e}}n{\'{e}}ral de la composition d'algorithmes d'apprentissage devant g{\'{e}}rer des t{\^{a}}ches s{\'{e}}quentielles, en particulier celui de la mise-{\`{a}}-jour efficace des ensembles d'apprentissage dans un cadre de validation s{\'{e}}quentielle. Nous {\'{e}}num{\'{e}}rons les desiderata que des primitives de composition doivent satisfaire, et faisons ressortir la difficult{\'{e}} de les atteindre de fa{\c c}on rigoureuse et efficace. Nous poursuivons en pr{\'{e}}sentant un ensemble d'algorithmes qui atteignent ces objectifs et pr{\'{e}}sentons une {\'{e}}tude de cas d'un syst{\`{e}}me complexe de prise de d{\'{e}}cision financi{\`{e}}re utilisant ces techniques. Nous d{\'{e}}crivons ensuite une m{\'{e}}thode g{\'{e}}n{\'{e}}rale permettant de transformer un probl{\`{e}}me de d{\'{e}}cision s{\'{e}}quentielle non-Markovien en un probl{\`{e}}me d'apprentissage supervis{\'{e}} en employant un algorithme de recherche bas{\'{e}} sur les K meilleurs chemins. Nous traitons d'une application en gestion de portefeuille o{\`{u}} nous entra{\^{\i}}nons un algorithme d'apprentissage {\`{a}} optimiser directement un ratio de Sharpe (ou autre crit{\`{e}}re non-additif incorporant une aversion au risque). Nous illustrons l'approche par une {\'{e}}tude exp{\'{e}}rimentale approfondie, proposant une architecture de r{\'{e}}seaux de neurones sp{\'{e}}cialis{\'{e}}e {\`{a}} la gestion de portefeuille et la comparant {\`{a}} plusieurs alternatives. 
Finalement, nous introduisons une repr{\'{e}}sentation fonctionnelle de s{\'{e}}ries chronologiques permettant {\`{a}} des pr{\'{e}}visions d'{\^{e}}tre effectu{\'{e}}es sur un horizon variable, tout en utilisant un ensemble informationnel r{\'{e}}v{\'{e}}l{\'{e}} de mani{\`{e}}re progressive. L'approche est bas{\'{e}}e sur l'utilisation des processus Gaussiens, lesquels fournissent une matrice de covariance compl{\`{e}}te entre tous les points pour lesquels une pr{\'{e}}vision est demand{\'{e}}e. Cette information est utilis{\'{e}}e {\`{a}} bon escient par un algorithme qui transige activement des {\'{e}}carts de cours (price spreads) entre des contrats {\`{a}} terme sur commodit{\'{e}}s. L'approche propos{\'{e}}e produit, hors {\'{e}}chantillon, un rendement ajust{\'{e}} pour le risque significatif, apr{\`{e}}s frais de transactions, sur un portefeuille de 30 actifs.
+This thesis considers a number of approaches to make machine learning algorithms better suited to the sequential nature of financial portfolio management tasks. We start by considering the problem of the general composition of learning algorithms that must handle temporal learning tasks, in particular that of creating and efficiently updating the training sets in a sequential simulation framework. We enumerate the desiderata that composition primitives should satisfy, and underscore the difficulty of rigorously and efficiently reaching them. We follow by introducing a set of algorithms that accomplish the desired objectives, presenting a case-study of a real-world complex learning system for financial decision-making that uses those techniques. We then describe a general method to transform a non-Markovian sequential decision problem into a supervised learning problem using a K-best paths search algorithm. We consider an application in financial portfolio management where we train a learning algorithm to directly optimize a Sharpe Ratio (or other risk-averse non-additive) utility function. We illustrate the approach by demonstrating extensive experimental results using a neural network architecture specialized for portfolio management and compare against well-known alternatives. Finally, we introduce a functional representation of time series which allows forecasts to be performed over an unspecified horizon with progressively-revealed information sets. By virtue of using Gaussian processes, a complete covariance matrix between forecasts at several time-steps is available. This information is put to use in an application to actively trade price spreads between commodity futures contracts. The approach delivers impressive out-of-sample risk-adjusted returns after transaction costs on a portfolio of 30 spreads.}
+}
+
 @ARTICLE{Thierry+al-2008,
     author = {Bertin-Mahieux, Thierry and Eck, Douglas and Maillet, Fran{\c c}ois and Lamere, Paul},
      title = {Autotagger: A Model For Predicting Social Tags from Acoustic Features on Large Music Databases},
@@ -4324,6 +4484,9 @@
       title = {Quadratic Features and Deep Architectures for Chunking},
   booktitle = {North American Chapter of the Association for Computational Linguistics - Human Language Technologies (NAACL HLT)},
        year = {2009},
+      pages = {245--248},
+  publisher = {Association for Computational Linguistics},
+        url = {http://www.aclweb.org/anthology/N/N09/N09-2062},
    abstract = {We experiment with several chunking models. Deeper architectures achieve better generalization. Quadratic filters, a simplification of theoretical model of V1 complex cells, reliably increase accuracy. In fact, logistic regression with quadratic filters outperforms a standard single hidden layer neural network. Adding quadratic filters to logistic regression is almost as effective as feature engineering. Despite predicting each output label independently, our model is competitive with ones that use previous decisions.}
 }
 
@@ -4339,8 +4502,10 @@
 @INPROCEEDINGS{Turian+Ratinov+Bengio-2010,
      author = {Turian, Joseph and Ratinov, Lev and Bengio, Yoshua},
       title = {Word representations: A simple and general method for semi-supervised learning},
-  booktitle = {Association for Computational Linguistics(ACL2010)},
-       year = {2010}
+  booktitle = {Proceedings of the 48th Annual Meeting of the Association for Computational Linguistics(ACL2010)},
+       year = {2010},
+      pages = {384--394},
+  publisher = {Association for Computational Linguistics},
 }
 
 @INPROCEEDINGS{Vincent-Bengio-2003,
@@ -4353,6 +4518,14 @@
 topics={HighDimensional,Kernel,Unsupervised},cat={C},
 }
 
+@ARTICLE{Vincent-JMLR-2010,
+    author = {Vincent, Pascal and Larochelle, Hugo and Lajoie, Isabelle and Bengio, Yoshua and Manzagol, Pierre-Antoine},
+     title = {Stacked Denoising Autoencoders: learning useful representations in a deep network with a local denoising criterion},
+   journal = {JMLR},
+      year = {2010},
+      note = {to appear}
+}
+
 @TECHREPORT{Vincent-TR1316,
        author = {Vincent, Pascal and Larochelle, Hugo and Bengio, Yoshua and Manzagol, Pierre-Antoine},
         title = {Extracting and Composing Robust Features with Denoising Autoencoders},
@@ -4461,6 +4634,11 @@
   publisher = {MIT Press}
 }
 
+@ARTICLE{JMLR,
+   journal = {Journal of Machine Learning Research},
+      year = {-1}
+}
+
 @INPROCEEDINGS{NIPS19,
      editor = {{Sch{\"{o}}lkopf}, Bernhard and Platt, John and Hoffman, Thomas},
       title = {Advances in Neural Information Processing Systems 19 (NIPS'06)},
@@ -4552,19 +4730,19 @@
        year = {-1}
 }
 
-@INPROCEEDINGS{ICML08,
+@PROCEEDINGS{ICML08,
      editor = {Cohen, William W. and McCallum, Andrew and Roweis, Sam T.},
       title = {Proceedings of the Twenty-fifth International Conference on Machine Learning (ICML'08)},
   booktitle = {Proceedings of the Twenty-fifth International Conference on Machine Learning (ICML'08)},
-       year = {-1},
+       year = {2008},
   publisher = {ACM}
 }
 
-@INPROCEEDINGS{ICML07,
+@PROCEEDINGS{ICML07,
      editor = {Ghahramani, Zoubin},
       title = {Proceedings of the 24th International Conference on Machine Learning (ICML'07)},
   booktitle = {Proceedings of the 24th International Conference on Machine Learning (ICML'07)},
-       year = {-1},
+       year = {2007},
   publisher = {ACM}
 }
 
@@ -4692,6 +4870,10 @@
        year = {-1},
   publisher = {Morgan Kaufmann}
 }
+@ARTICLE{JMLR-shorter,
+   journal = {JMLR},
+      year = {-1}
+}
 @INPROCEEDINGS{NIPS1-shorter,
       title = {NIPS'88},
   booktitle = {NIPS 1},
@@ -4826,5 +5008,3 @@
   booktitle = {AISTATS'2009},
        year = {-1}
 }
-
-
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/writeup/aistats2011_cameraready.tex	Sat Mar 19 22:51:40 2011 -0400
@@ -0,0 +1,898 @@
+%\documentclass[twoside,11pt]{article} % For LaTeX2e
+\documentclass{article} % For LaTeX2e
+\usepackage[accepted]{aistats2e_2011}
+%\usepackage{times}
+\usepackage{wrapfig}
+\usepackage{amsthm}
+\usepackage{amsmath}
+\usepackage{bbm}
+\usepackage[utf8]{inputenc}
+\usepackage[psamsfonts]{amssymb}
+%\usepackage{algorithm,algorithmic} % not used after all
+\usepackage{graphicx,subfigure}
+\usepackage{natbib}
+
+\addtolength{\textwidth}{10mm}
+\addtolength{\evensidemargin}{-5mm}
+\addtolength{\oddsidemargin}{-5mm}
+
+%\setlength\parindent{0mm}
+
+\begin{document}
+
+\twocolumn[
+\aistatstitle{Deep Learners Benefit More from Out-of-Distribution Examples}
+\runningtitle{Deep Learners for Out-of-Distribution Examples}
+\runningauthor{Bengio et al.}
+\aistatsauthor{
+Yoshua  Bengio \and
+Frédéric  Bastien \and
+\bf Arnaud  Bergeron \and
+Nicolas  Boulanger-Lewandowski \and \\
+\bf Thomas  Breuel \and
+Youssouf  Chherawala \and
+\bf Moustapha  Cisse \and 
+Myriam  Côté \and  \\
+\bf Dumitru  Erhan \and
+Jeremy  Eustache \and
+\bf Xavier  Glorot \and 
+Xavier  Muller \and \\
+\bf Sylvain  Pannetier Lebeuf \and
+Razvan  Pascanu \and
+\bf Salah  Rifai \and 
+Francois  Savard \and  \\
+\bf Guillaume  Sicard \\
+\vspace*{1mm}}
+
+%I can't use aistatsaddress in a single side paragraph.
+%The document is 2 columns, but this section spans the 2 columns, so there is only 1 left
+\center{Dept. IRO, U. Montreal, P.O. Box 6128, Centre-Ville branch, H3C 3J7, Montreal (Qc), Canada}
+\vspace*{5mm}
+]
+%\aistatsaddress{Dept. IRO, U. Montreal, P.O. Box 6128, Centre-Ville branch, H3C 3J7, Montreal (Qc), Canada}
+
+
+%\vspace*{5mm}}
+%\date{{\tt bengioy@iro.umontreal.ca}, Dept. IRO, U. Montreal, P.O. Box 6128, Centre-Ville branch, H3C 3J7, Montreal (Qc), Canada}
+%\jmlrheading{}{2010}{}{10/2010}{XX/2011}{Yoshua Bengio et al}
+%\editor{}
+
+%\makeanontitle
+%\maketitle
+
+%{\bf Running title: Deep Self-Taught Learning}
+
+\vspace*{5mm}
+\begin{abstract}
+  Recent theoretical and empirical work in statistical machine learning has demonstrated the potential of learning algorithms for deep architectures, i.e., function classes obtained by composing multiple levels of representation. The hypothesis evaluated here is that intermediate levels of representation, because they can be shared across tasks and examples from different but related distributions, can yield even more benefits. Comparative experiments were performed on a large-scale handwritten character recognition setting with 62 classes (upper case, lower case, digits), using both a multi-task setting and perturbed examples in order to obtain out-of-distribution examples. The results agree with the hypothesis, and show that a deep learner did {\em beat previously published results and reached human-level performance}.
+\end{abstract}
+%\vspace*{-3mm}
+
+%\begin{keywords}  
+%Deep learning, self-taught learning, out-of-distribution examples, handwritten character recognition, multi-task learning
+%\end{keywords}
+%\keywords{self-taught learning \and multi-task learning \and out-of-distribution examples \and deep learning \and handwriting recognition}
+
+
+
+\section{Introduction}
+%\vspace*{-1mm}
+
+{\bf Deep Learning} has emerged as a promising new area of research in
+statistical machine learning~\citep{Hinton06,ranzato-07-small,Bengio-nips-2006,VincentPLarochelleH2008-very-small,ranzato-08,TaylorHintonICML2009,Larochelle-jmlr-2009,Salakhutdinov+Hinton-2009,HonglakL2009,HonglakLNIPS2009,Jarrett-ICCV2009,Taylor-cvpr-2010}. See \citet{Bengio-2009} for a review.
+Learning algorithms for deep architectures are centered on the learning
+of useful representations of data, which are better suited to the task at hand,
+and are organized in a hierarchy with multiple levels.
+This is in part inspired by observations of the mammalian visual cortex, 
+which consists of a chain of processing elements, each of which is associated with a
+different representation of the raw visual input. In fact,
+it was found recently that the features learnt in deep architectures resemble
+those observed in the first two of these stages (in areas V1 and V2
+of visual cortex) \citep{HonglakL2008}, and that they become more and
+more invariant to factors of variation (such as camera movement) in
+higher layers~\citep{Goodfellow2009}.
+It has been hypothesized that learning a hierarchy of features increases the
+ease and practicality of developing representations that are at once
+tailored to specific tasks, yet are able to borrow statistical strength
+from other related tasks (e.g., modeling different kinds of objects). Finally, learning the
+feature representation can lead to higher-level (more abstract, more
+general) features that are more robust to unanticipated sources of
+variance extant in real data.
+
+Whereas a deep architecture can in principle be more powerful than a
+shallow one in terms of representation, depth appears to render the
+training problem more difficult in terms of optimization and local minima.
+It is also only recently that successful algorithms were proposed to
+overcome some of these difficulties.  All are based on unsupervised
+learning, often in a greedy layer-wise ``unsupervised pre-training''
+stage~\citep{Bengio-2009}.  
+The principle is that each layer starting from
+the bottom is trained to represent its input (the output of the previous
+layer). After this
+unsupervised initialization, the stack of layers can be
+converted into a deep supervised feedforward neural network and fine-tuned by
+stochastic gradient descent.
+One of these layer initialization techniques,
+applied here, is the Denoising
+Auto-encoder~(DA)~\citep{VincentPLarochelleH2008-very-small} (see
+Figure~\ref{fig:da}), which performed similarly or 
+better~\citep{VincentPLarochelleH2008-very-small} than previously
+proposed Restricted Boltzmann Machines (RBM)~\citep{Hinton06} 
+in terms of unsupervised extraction
+of a hierarchy of features useful for classification. Each layer is trained
+to denoise its input, creating a layer of features that can be used as
+input for the next layer, forming a Stacked Denoising Auto-encoder (SDA).
+Note that training a Denoising Auto-encoder
+can actually be seen as training a particular RBM by an inductive
+principle different from maximum likelihood~\citep{Vincent-SM-2010}, 
+namely by Score Matching~\citep{Hyvarinen-2005,HyvarinenA2008}. 
+
+Previous comparative experimental results with stacking of RBMs and DAs
+to build deep supervised predictors had shown that they could outperform
+shallow architectures in a variety of settings, especially
+when the data involves complex interactions between many factors of 
+variation~\citep{LarochelleH2007,Bengio-2009}. Other experiments have suggested
+that the unsupervised layer-wise pre-training acted as a useful
+prior~\citep{Erhan+al-2010} that allows one to initialize a deep
+neural network in a relatively much smaller region of parameter space, 
+corresponding to better generalization.
+
+To further the understanding of the reasons for the good performance
+observed with deep learners, we focus here on the following {\em hypothesis}:
+intermediate levels of representation, especially when there are
+more such levels, can be exploited to {\bf share
+statistical strength across different but related types of examples},
+such as examples coming from other tasks than the task of interest
+(the multi-task setting), or examples coming from an overlapping
+but different distribution (images with different kinds of perturbations
+and noises, here). This is consistent with the hypotheses discussed
+in~\citet{Bengio-2009} regarding the potential advantage
+of deep learning and the idea that more levels of representation can
+give rise to more abstract, more general features of the raw input.
+
+This hypothesis is related to a learning setting called
+{\bf self-taught learning}~\citep{RainaR2007}, which combines principles
+of semi-supervised and multi-task learning: the learner can exploit examples
+that are unlabeled and possibly come from a distribution different from the target
+distribution, e.g., from other classes than those of interest. 
+It has already been shown that deep learners can clearly take advantage of
+unsupervised learning and unlabeled examples~\citep{Bengio-2009,WestonJ2008-small},
+but more needed to be done to explore the impact
+of {\em out-of-distribution} examples and of the {\em multi-task} setting
+(one exception is~\citet{CollobertR2008}, which shares and uses unsupervised
+pre-training only with the first layer). In particular the {\em relative
+advantage of deep learning} for these settings has not been evaluated.
+
+
+%
+The {\bf main claim} of this paper is that deep learners (with several levels of representation) can
+{\bf benefit more from out-of-distribution examples than shallow learners} (with a single
+level), both in the context of the multi-task setting and from
+ perturbed examples. Because we are able to improve on state-of-the-art
+performance and reach human-level performance
+on a large-scale task, we consider that this paper is also a contribution
+to advance the application of machine learning to handwritten character recognition.
+More precisely, we ask and answer the following questions:
+
+%\begin{enumerate}
+$\bullet$ %\item 
+Do the good results previously obtained with deep architectures on the
+MNIST digit images generalize to the setting of a similar but much larger and richer
+dataset, the NIST special database 19, with 62 classes and around 800k examples?
+
+$\bullet$ %\item 
+To what extent does the perturbation of input images (e.g. adding
+noise, affine transformations, background images) make the resulting
+classifiers better not only on similarly perturbed images but also on
+the {\em original clean examples}? We study this question in the
+context of the 62-class and 10-class tasks of the NIST special database 19.
+
+$\bullet$ %\item 
+Do deep architectures {\em benefit {\bf more} from such out-of-distribution}
+examples, in particular do they benefit more from 
+examples that are perturbed versions of the examples from the task of interest?
+
+$\bullet$ %\item 
+Similarly, does the feature learning step in deep learning algorithms benefit {\bf more}
+from training with moderately {\em different classes} (i.e. a multi-task learning scenario) than
+a corresponding shallow and purely supervised architecture?
+We train on 62 classes and test on 10 (digits) or 26 (upper case or lower case)
+to answer this question.
+%\end{enumerate}
+
+Our experimental results provide positive evidence towards all of these questions,
+as well as {\bf classifiers that reach human-level performance on 62-class isolated character
+recognition and beat previously published results on the NIST dataset (special database 19)}.
+To achieve these results, we introduce in the next section a sophisticated system
+for stochastically transforming character images and then explain the methodology,
+which is based on training with or without these transformed images and testing on 
+clean ones. 
+Code for generating these transformations as well as for the deep learning 
+algorithms is made available at {\tt http://anonymous.url.net}.%{\tt http://hg.assembla.com/ift6266}.
+
+%\vspace*{-3mm}
+%\newpage
+\section{Perturbed and Transformed Character Images}
+\label{s:perturbations}
+%\vspace*{-2mm}
+
+Figure~\ref{fig:transform} shows the different transformations we used to stochastically
+transform $32 \times 32$ source images (such as the one in Fig.~\ref{fig:torig})
+in order to obtain data from a larger distribution which
+covers a domain substantially larger than the clean characters distribution from
+which we start.
+Although character transformations have been used before to
+improve character recognizers, this effort is on a large scale both
+in number of classes and in the complexity of the transformations, hence
+in the complexity of the learning task.
+The code for these transformations (mostly Python) is available at 
+{\tt http://anonymous.url.net}. All the modules in the pipeline (Figure~\ref{fig:transform}) share
+a global control parameter ($0 \le complexity \le 1$) that allows one to modulate the
+amount of deformation or noise introduced. 
+There are two main parts in the pipeline. The first one,
+from thickness to pinch, performs transformations. The second
+part, from blur to contrast, adds different kinds of noise.
+More details can be found in~\citep{ift6266-tr-anonymous}.
+
+\begin{figure*}[ht]
+\centering
+\subfigure[Original]{\includegraphics[scale=0.6]{images/Original.png}\label{fig:torig}}
+\subfigure[Thickness]{\includegraphics[scale=0.6]{images/Thick_only.png}}
+\subfigure[Slant]{\includegraphics[scale=0.6]{images/Slant_only.png}}
+\subfigure[Affine Transformation]{\includegraphics[scale=0.6]{images/Affine_only.png}}
+\subfigure[Local Elastic Deformation]{\includegraphics[scale=0.6]{images/Localelasticdistorsions_only.png}}
+\subfigure[Pinch]{\includegraphics[scale=0.6]{images/Pinch_only.png}}
+%Noise
+\subfigure[Motion Blur]{\includegraphics[scale=0.6]{images/Motionblur_only.png}}
+\subfigure[Occlusion]{\includegraphics[scale=0.6]{images/occlusion_only.png}}
+\subfigure[Gaussian Smoothing]{\includegraphics[scale=0.6]{images/Bruitgauss_only.png}}
+\subfigure[Pixels Permutation]{\includegraphics[scale=0.6]{images/Permutpixel_only.png}}
+\subfigure[Gaussian Noise]{\includegraphics[scale=0.6]{images/Distorsiongauss_only.png}}
+\subfigure[Background Image Addition]{\includegraphics[scale=0.6]{images/background_other_only.png}}
+\subfigure[Salt \& Pepper]{\includegraphics[scale=0.6]{images/Poivresel_only.png}}
+\subfigure[Scratches]{\includegraphics[scale=0.6]{images/Rature_only.png}}
+\subfigure[Grey Level \& Contrast]{\includegraphics[scale=0.6]{images/Contrast_only.png}}
+\caption{Top left (a): example original image. Others (b-o): examples of the effect
+of each transformation module taken separately. Actual perturbed examples are obtained by
+a pipeline of these, with random choices about which module to apply and how much perturbation
+to apply.}
+\label{fig:transform}
+%\vspace*{-2mm}
+\end{figure*}
+
+%\vspace*{-3mm}
+\section{Experimental Setup}
+%\vspace*{-1mm}
+
+Much previous work on deep learning had been performed on
+the MNIST digits task~\citep{Hinton06,ranzato-07-small,Bengio-nips-2006,Salakhutdinov+Hinton-2009},
+with 60,000 examples, and variants involving 10,000
+examples~\citep{Larochelle-jmlr-2009,VincentPLarochelleH2008-very-small}.
+The focus here is on much larger training sets, from 10 times to 
+1000 times larger, and 62 classes.
+
+The first step in constructing the larger datasets (called NISTP and P07) is to sample from
+a {\em data source}: {\bf NIST} (NIST database 19), {\bf Fonts}, {\bf Captchas},
+and {\bf OCR data} (scanned machine printed characters). See more in 
+Section~\ref{sec:sources} below. Once a character
+is sampled from one of these sources (chosen randomly), the second step is to
+apply a pipeline of transformations and/or noise processes outlined in section \ref{s:perturbations}.
+
+To provide a baseline of error rate comparison we also estimate human performance
+on both the 62-class task and the 10-class digits task.
+We compare the best Multi-Layer Perceptrons (MLP) against
+the best Stacked Denoising Auto-encoders (SDA), when
+both models' hyper-parameters are selected to minimize the validation set error.
+We also provide a comparison against a precise estimate
+of human performance obtained via Amazon's Mechanical Turk (AMT)
+service ({\tt http://mturk.com}). 
+AMT users are paid small amounts
+of money to perform tasks for which human intelligence is required.
+Mechanical Turk has been used extensively in natural language processing and vision.
+%processing \citep{SnowEtAl2008} and vision
+%\citep{SorokinAndForsyth2008,whitehill09}. 
+AMT users were presented
+with 10 character images (from a test set) on a screen
+and asked to label them.
+They were forced to choose a single character class (either among the
+62 or 10 character classes) for each image.
+80 subjects classified 2500 images per (dataset,task) pair.
+Different human labelers sometimes provided a different label for the same
+example, and we were able to estimate the error variance due to this effect
+because each image was classified by 3 different persons. 
+The average error of humans on the 62-class task NIST test set
+is 18.2\%, with a standard error of 0.1\%.
+We controlled noise in the labelling process by (1)
+requiring AMT workers with a higher than normal average of accepted
+responses ($>$95\%) on other tasks (2) discarding responses that were not
+complete (10 predictions) (3) discarding responses for which the
+time to predict was smaller than 3 seconds for NIST (the mean response time
+was 20 seconds) and 6 seconds for NISTP (average response time of
+45 seconds) (4) discarding responses which were obviously wrong (10
+identical ones, or ``12345...''). Overall, after such filtering, we kept
+approximately 95\% of the AMT workers' responses.
+
+%\vspace*{-3mm}
+\subsection{Data Sources}
+\label{sec:sources}
+%\vspace*{-2mm}
+
+%\begin{itemize}
+%\item 
+{\bf NIST.}
+Our main source of characters is the NIST Special Database 19~\citep{Grother-1995}, 
+widely used for training and testing character
+recognition systems~\citep{Granger+al-2007,Cortes+al-2000,Oliveira+al-2002-short,Milgram+al-2005}. 
+The dataset is composed of 814255 digits and characters (upper and lower cases), with hand checked classifications,
+extracted from handwritten sample forms of 3600 writers. The characters are labelled by one of the 62 classes 
+corresponding to ``0''-``9'',``A''-``Z'' and ``a''-``z''. The dataset contains 8 parts (partitions) of varying complexity. 
+The fourth partition (called $hsf_4$, 82,587 examples), 
+experimentally recognized to be the most difficult one, is the one recommended 
+by NIST as a testing set and is used in our work as well as some previous work~\citep{Granger+al-2007,Cortes+al-2000,Oliveira+al-2002-short,Milgram+al-2005}
+for that purpose. We randomly split the remainder (731,668 examples) into a training set and a validation set for
+model selection. 
+The performances reported by previous work on that dataset mostly use only the digits.
+Here we use all the classes both in the training and testing phase. This is especially
+useful to estimate the effect of a multi-task setting.
+The distribution of the classes in the NIST training and test sets differs
+substantially, with relatively many more digits in the test set, and a more uniform distribution
+of letters in the test set (whereas in the training set they are distributed
+more like in natural text).
+%\vspace*{-1mm}
+
+%\item 
+{\bf Fonts.} 
+In order to have a good variety of sources we downloaded a large number of free fonts from:
+{\tt http://cg.scs.carleton.ca/\textasciitilde luc/freefonts.html}.
+% TODO: pointless to anonymize, it's not pointing to our work
+Including an operating system's (Windows 7) fonts, there is a total of $9817$ different fonts that we can choose uniformly from.
+The chosen {\tt ttf} file is either used as input of the Captcha generator (see next item) or, by producing a corresponding image, 
+directly as input to our models.
+%\vspace*{-1mm}
+
+%\item 
+{\bf Captchas.}
+The Captcha data source is an adaptation of the \emph{pycaptcha} library (a Python-based captcha generator library) for 
+generating characters of the same format as the NIST dataset. This software is based on
+a random character class generator and various kinds of transformations similar to those described in the previous sections. 
+In order to increase the variability of the data generated, many different fonts are used for generating the characters. 
+Transformations (slant, distortions, rotation, translation) are applied to each randomly generated character with a complexity
+depending on the value of the complexity parameter provided by the user of the data source. 
+%Two levels of complexity are allowed and can be controlled via an easy to use facade class. %TODO: what's a facade class?
+%\vspace*{-1mm}
+
+%\item 
+{\bf OCR data.}
+A large set (2 million) of scanned, OCRed and manually verified machine-printed 
+characters were included as an
+additional source. This set is part of a larger corpus being collected by the Image Understanding
+Pattern Recognition Research group led by Thomas Breuel at University of Kaiserslautern 
+({\tt http://www.iupr.com}), and which will be publicly released.
+%TODO: let's hope that Thomas is not a reviewer! :) Seriously though, maybe we should anonymize this
+%\end{itemize}
+
+%\vspace*{-3mm}
+\subsection{Data Sets}
+%\vspace*{-2mm}
+
+All data sets contain 32$\times$32 grey-level images (values in $[0,1]$) associated with a label
+from one of the 62 character classes.
+%\begin{itemize}
+%\vspace*{-1mm}
+
+%\item 
+{\bf NIST.} This is the raw NIST special database 19~\citep{Grother-1995}. It has
+\{651,668 / 80,000 / 82,587\} \{training / validation / test\} examples.
+%\vspace*{-1mm}
+
+%\item 
+{\bf P07.} This dataset is obtained by taking raw characters from all four of the above sources
+and sending them through the transformation pipeline described in section \ref{s:perturbations}.
+For each new example to generate, a data source is selected with probability $10\%$ from the fonts,
+$25\%$ from the captchas, $25\%$ from the OCR data and $40\%$ from NIST. We apply all the transformations in the
+order given above, and for each of them we sample uniformly a \emph{complexity} in the range $[0,0.7]$.
+It has \{81,920,000 / 80,000 / 20,000\} \{training / validation / test\} examples
+obtained from the corresponding NIST sets plus other sources.
+%\vspace*{-1mm}
+
+%\item 
+{\bf NISTP.} This one is equivalent to P07 (complexity parameter of $0.7$ with the same proportions of data sources)
+  except that we only apply
+  transformations from slant to pinch (see Fig.~\ref{fig:transform}(b-f)).
+  Therefore, the character is
+  transformed but no additional noise is added to the image, giving images
+  closer to the NIST dataset. 
+It has \{81,920,000 / 80,000 / 20,000\} \{training / validation / test\} examples
+obtained from the corresponding NIST sets plus other sources.
+%\end{itemize}
+
+\begin{figure*}[ht]
+%\vspace*{-2mm}
+\centerline{\resizebox{0.8\textwidth}{!}{\includegraphics{images/denoising_autoencoder_small.pdf}}}
+%\vspace*{-2mm}
+\caption{Illustration of the computations and training criterion for the denoising
+auto-encoder used to pre-train each layer of the deep architecture. Input $x$ of
+the layer (i.e. raw input or output of previous layer)
+is corrupted into $\tilde{x}$ and encoded into code $y$ by the encoder $f_\theta(\cdot)$.
+The decoder $g_{\theta'}(\cdot)$ maps $y$ to reconstruction $z$, which
+is compared to the uncorrupted input $x$ through the loss function
+$L_H(x,z)$, whose expected value is approximately minimized during training
+by tuning $\theta$ and $\theta'$.}
+\label{fig:da}
+%\vspace*{-2mm}
+\end{figure*}
+
+%\vspace*{-3mm}
+\subsection{Models and their Hyper-parameters}
+%\vspace*{-2mm}
+
+The experiments are performed using MLPs (with a single
+hidden layer) and deep SDAs.
+\emph{Hyper-parameters are selected based on the {\bf NISTP} validation set error.}
+
+{\bf Multi-Layer Perceptrons (MLP).}  The MLP output estimates 
+\[
+P({\rm class}|{\rm input}=x)
+\]
+with 
+\[
+f(x)={\rm softmax}(b_2+W_2\tanh(b_1+W_1 x)),
+\] 
+i.e., two layers, where 
+\[
+ p={\rm softmax}(a)
+\]
+means that 
+\[
+ p_i(x)=\exp(a_i)/\sum_j \exp(a_j)
+\] 
+representing the probability 
+for class $i$, $\tanh$ is the element-wise
+hyperbolic tangent, $b_i$ are parameter vectors, and $W_i$ are 
+parameter matrices (one per layer). The
+number of rows of $W_1$ is called the number of hidden units (of the
+single hidden layer, here), and
+is one way to control capacity (the main other ways to control capacity are
+the number of training iterations and optionally a regularization penalty
+on the parameters, not used here because it did not help).
+Whereas previous work had compared
+deep architectures to both shallow MLPs and SVMs, we only compared to MLPs
+here because of the very large datasets used (making the use of SVMs
+computationally challenging because of their quadratic scaling
+behavior). Preliminary experiments on training SVMs (libSVM) with subsets
+of the training set allowing the program to fit in memory yielded
+substantially worse results than those obtained with MLPs\footnote{RBF SVMs
+  trained with a subset of NISTP or NIST, 100k examples, to fit in memory,
+  yielded 64\% test error or worse; online linear SVMs trained on the whole
+  of NIST or 800k from NISTP yielded no better than 42\% error; slightly
+  better results were obtained by sparsifying the pixel intensities and
+  projecting to a second-order polynomial (a very sparse vector), still
+  41\% error. We expect that better results could be obtained with a
+  better implementation allowing for training with more examples and
+  a higher-order non-linear projection.}  For training on nearly a hundred million examples (with the
+perturbed data), the MLPs and SDA are much more convenient than classifiers
+based on kernel methods.  The MLP has a single hidden layer with $\tanh$
+activation functions, and softmax (normalized exponentials) on the output
+layer for estimating $P({\rm class} | {\rm input})$.  The number of hidden units is
+taken in $\{300,500,800,1000,1500\}$.  Training examples are presented in
+minibatches of size 20, i.e., the parameters are iteratively updated in the direction
+of the mean gradient of the next 20 examples. A constant learning rate was chosen among $\{0.001,
+0.01, 0.025, 0.075, 0.1, 0.5\}$.
+%through preliminary experiments (measuring performance on a validation set),
+%and $0.1$ (which was found to work best) was then selected for optimizing on
+%the whole training sets.
+%\vspace*{-1mm}
+
+
+{\bf Stacked Denoising Auto-encoders (SDA).}
+Various auto-encoder variants and Restricted Boltzmann Machines (RBMs)
+can be used to initialize the weights of each layer of a deep MLP (with many hidden 
+layers)~\citep{Hinton06,ranzato-07-small,Bengio-nips-2006}, 
+apparently setting parameters in the
+basin of attraction of supervised gradient descent yielding better 
+generalization~\citep{Erhan+al-2010}.  This initial {\em unsupervised
+pre-training phase} uses all of the training images but not the training labels.
+Each layer is trained in turn to produce a new representation of its input
+(starting from the raw pixels).
+It is hypothesized that the
+advantage brought by this procedure stems from a better prior,
+on the one hand taking advantage of the link between the input
+distribution $P(x)$ and the conditional distribution of interest
+$P(y|x)$ (like in semi-supervised learning), and on the other hand
+taking advantage of the expressive power and bias implicit in the
+deep architecture (whereby complex concepts are expressed as
+compositions of simpler ones through a deep hierarchy).
+
+Here we chose to use the Denoising
+Auto-encoder~\citep{VincentPLarochelleH2008-very-small} as the building block for
+these deep hierarchies of features, as it is simple to train and
+explain (see Figure~\ref{fig:da}, as well as 
+tutorial and code there: {\tt http://deeplearning.net/tutorial}), 
+provides efficient inference, and yielded results
+comparable to or better than RBMs in a series of experiments
+\citep{VincentPLarochelleH2008-very-small}. It really corresponds to a Gaussian
+RBM trained by a Score Matching criterion~\cite{Vincent-SM-2010}.
+During its unsupervised training, a Denoising
+Auto-encoder is presented with a stochastically corrupted version $\tilde{x}$
+of the input $x$ and trained to produce a reconstruction $z$ 
+of the uncorrupted input $x$. Because the network has to denoise, it is
+forcing the hidden units $y$ to represent the leading regularities in
+the data. Following~\citep{VincentPLarochelleH2008-very-small} 
+the hidden units output $y$ is obtained through 
+\[
+ y={\rm sigm}(c+V \tilde{x})
+\]
+where ${\rm sigm}(a)=1/(1+\exp(-a))$
+and the reconstruction is 
+\[ 
+ z={\rm sigm}(d+V' y).
+\]
+We minimize the training
+set average of the cross-entropy
+reconstruction error 
+\[
+ L_H(x,z)=-\sum_i \left[ x_i \log z_i + (1-x_i) \log(1-z_i) \right].
+\]
+Here we use the random binary masking corruption
+(which in $\tilde{x}$ sets to 0 a random subset of the elements of $x$, and
+copies the rest).
+Once the first denoising auto-encoder is trained, its parameters can be used
+to set the first layer of the deep MLP. The original data are then processed
+through that first layer, and the output of the hidden units form a new
+representation that can be used as input data for training a second denoising
+auto-encoder, still in a purely unsupervised way.
+This is repeated for the desired number of hidden layers.
+After this unsupervised pre-training stage, the parameters
+are used to initialize a deep MLP (similar to the above, but
+with more layers), which is fine-tuned by
+the same standard procedure (stochastic gradient descent)
+used to train MLPs in general (see above).
+The top layer parameters of the deep MLP (the one which outputs the
+class probabilities and takes the top hidden layer as input) can
+be initialized at 0.
+The SDA hyper-parameters are the same as for the MLP, with the addition of the
+amount of corruption noise (we used the masking noise process, whereby a
+fixed proportion of the input values, randomly selected, are zeroed), and a
+separate learning rate for the unsupervised pre-training stage (selected
+from the same above set). The fraction of inputs corrupted was selected
+among $\{10\%, 20\%, 50\%\}$. Another hyper-parameter is the number
+of hidden layers but it was fixed to 3 for our experiments,
+based on previous work with
+SDAs on MNIST~\citep{VincentPLarochelleH2008-very-small}. 
+We also compared against 1 and against 2 hidden layers, in order
+to disentangle the effect of depth from the effect of unsupervised
+pre-training.
+The size of the hidden
+layers was kept constant across hidden layers, and the best results
+were obtained with the largest values that we could experiment
+with given our patience, with 1000 hidden units.
+
+%\vspace*{-1mm}
+
+\begin{figure*}[ht]
+%\vspace*{-2mm}
+\centerline{\resizebox{.99\textwidth}{!}{\includegraphics{images/error_rates_charts.pdf}}}
+%\vspace*{-3mm}
+\caption{SDAx are the {\bf deep} models. Error bars indicate a 95\% confidence interval. 0 indicates that the model was trained
+on NIST, 1 on NISTP, and 2 on P07. Left: overall results
+of all models, on NIST and NISTP test sets.
+Right: error rates on NIST test digits only, along with the previous results from 
+literature~\citep{Granger+al-2007,Cortes+al-2000,Oliveira+al-2002-short,Milgram+al-2005}
+respectively based on ART, nearest neighbors, MLPs, and SVMs.}
+\label{fig:error-rates-charts}
+%\vspace*{-2mm}
+\end{figure*}
+
+
+\begin{figure*}[ht]
+\vspace*{-3mm}
+\centerline{\resizebox{.99\textwidth}{!}{\includegraphics{images/improvements_charts.pdf}}}
+\vspace*{-3mm}
+\caption{Relative improvement in error rate due to out-of-distribution examples.
+Left: Improvement (or loss, when negative)
+induced by out-of-distribution examples (perturbed data). 
+Right: Improvement (or loss, when negative) induced by multi-task 
+learning (training on all classes and testing only on either digits,
+upper case, or lower-case). The deep learner (SDA) benefits more from
+out-of-distribution examples, compared to the shallow MLP.}
+\label{fig:improvements-charts}
+\vspace*{-2mm}
+\end{figure*}
+
+\vspace*{-2mm}
+\section{Experimental Results}
+\vspace*{-2mm}
+
+%%\vspace*{-1mm}
+%\subsection{SDA vs MLP vs Humans}
+%%\vspace*{-1mm}
+The models are either trained on NIST (MLP0 and SDA0), 
+NISTP (MLP1 and SDA1), or P07 (MLP2 and SDA2), and tested
+on either NIST, NISTP or P07 (regardless of the data set used for training),
+either on the 62-class task
+or on the 10-digits task. Training time (including about half
+for unsupervised pre-training, for DAs) on the larger
+datasets is around one day on a GPU (GTX 285).
+Figure~\ref{fig:error-rates-charts} summarizes the results obtained,
+comparing humans, the three MLPs (MLP0, MLP1, MLP2) and the three SDAs (SDA0, SDA1,
+SDA2), along with the previous results on the digits NIST special database
+19 test set from the literature, respectively based on ARTMAP neural
+networks ~\citep{Granger+al-2007}, fast nearest-neighbor search
+~\citep{Cortes+al-2000}, MLPs ~\citep{Oliveira+al-2002-short}, and SVMs
+~\citep{Milgram+al-2005}.%  More detailed and complete numerical results
+%(figures and tables, including standard errors on the error rates) can be
+%found in Appendix.
+The deep learner not only outperformed the shallow ones and
+previously published performance (in a statistically and qualitatively
+significant way) but when trained with perturbed data
+reaches human performance on both the 62-class task
+and the 10-class (digits) task. 
+17\% error (SDA1) or 18\% error (humans) may seem large but a large
+majority of the errors from humans and from SDA1 are from out-of-context
+confusions (e.g. a vertical bar can be a ``1'', an ``l'' or an ``L'', and a
+``c'' and a ``C'' are often indistinguishable).
+Regarding shallower networks pre-trained with unsupervised denoising
+auto-encoders, we find that the NIST test error is 21\% with one hidden
+layer and 20\% with two hidden layers (vs 17\% in the same conditions
+with 3 hidden layers). Compare this with the 23\% error achieved
+by the MLP, i.e. a single hidden layer and no unsupervised pre-training.
+As found in previous work~\cite{Erhan+al-2010,Larochelle-jmlr-2009}, 
+these results show that both depth and
+unsupervised pre-training need to be combined in order to achieve
+the best results.
+
+
+In addition, as shown in the left of
+Figure~\ref{fig:improvements-charts}, the relative improvement in error
+rate brought by out-of-distribution examples is greater for the deep
+SDA, and these
+differences with the shallow MLP are statistically and qualitatively
+significant. 
+The left side of the figure shows the improvement to the clean
+NIST test set error brought by the use of out-of-distribution examples
+(i.e. the perturbed examples from NISTP or P07),
+over the models trained exclusively on NIST (respectively SDA0 and MLP0).
+Relative percent change is measured by taking
+$100 \% \times$ (original model's error / perturbed-data model's error - 1).
+The right side of
+Figure~\ref{fig:improvements-charts} shows the relative improvement
+brought by the use of a multi-task setting, in which the same model is
+trained for more classes than the target classes of interest (i.e. training
+with all 62 classes when the target classes are respectively the digits,
+lower-case, or upper-case characters). Again, whereas the gain from the
+multi-task setting is marginal or negative for the MLP, it is substantial
+for the SDA.  Note that to simplify these multi-task experiments, only the original
+NIST dataset is used. For example, the MLP-digits bar shows the relative
+percent improvement in MLP error rate on the NIST digits test set 
+as $100\% \times$ (single-task
+model's error / multi-task model's error - 1).  The single-task model is
+trained with only 10 outputs (one per digit), seeing only digit examples,
+whereas the multi-task model is trained with 62 outputs, with all 62
+character classes as examples.  Hence the hidden units are shared across
+all tasks.  For the multi-task model, the digit error rate is measured by
+comparing the correct digit class with the output class associated with the
+maximum conditional probability among only the digit classes outputs.  The
+setting is similar for the other two target classes (lower case characters
+and upper case characters). Note however that some types of perturbations
+(NISTP) help more than others (P07) when testing on the clean images.
+%%\vspace*{-1mm}
+%\subsection{Perturbed Training Data More Helpful for SDA}
+%%\vspace*{-1mm}
+
+%%\vspace*{-1mm}
+%\subsection{Multi-Task Learning Effects}
+%%\vspace*{-1mm}
+
+\iffalse
+As previously seen, the SDA is better able to benefit from the
+transformations applied to the data than the MLP. In this experiment we
+define three tasks: recognizing digits (knowing that the input is a digit),
+recognizing upper case characters (knowing that the input is one), and
+recognizing lower case characters (knowing that the input is one).  We
+consider the digit classification task as the target task and we want to
+evaluate whether training with the other tasks can help or hurt, and
+whether the effect is different for MLPs versus SDAs.  The goal is to find
+out if deep learning can benefit more (or less) from multiple related tasks
+(i.e. the multi-task setting) compared to a corresponding purely supervised
+shallow learner.
+
+We use a single hidden layer MLP with 1000 hidden units, and a SDA
+with 3 hidden layers (1000 hidden units per layer), pre-trained and
+fine-tuned on NIST.
+
+Our results show that the MLP benefits marginally from the multi-task setting
+in the case of digits (5\% relative improvement) but is actually hurt in the case
+of characters (respectively 3\% and 4\% worse for lower and upper case characters).
+On the other hand the SDA benefited from the multi-task setting, with relative
+error rate improvements of 27\%, 15\% and 13\% respectively for digits,
+lower and upper case characters, as shown in Table~\ref{tab:multi-task}.
+\fi
+
+
+\vspace*{-2mm}
+\section{Conclusions and Discussion}
+\vspace*{-2mm}
+
+We have found that out-of-distribution examples (multi-task learning
+and perturbed examples) are more beneficial
+to a deep learner than to a traditional shallow and purely
+supervised learner. More precisely, 
+the answers are positive for all the questions asked in the introduction.
+%\begin{itemize}
+
+$\bullet$ %\item 
+{\bf Do the good results previously obtained with deep architectures on the
+MNIST digits generalize to a much larger and richer (but similar)
+dataset, the NIST special database 19, with 62 classes and around 800k examples}?
+Yes, the SDA {\em systematically outperformed the MLP and all the previously
+published results on this dataset} (the ones that we are aware of), {\em in fact reaching human-level
+performance} at around 17\% error on the 62-class task and 1.4\% on the digits,
+and beating previously published results on the same data.
+
+$\bullet$ %\item 
+{\bf To what extent do out-of-distribution examples help deep learners,
+and do they help them more than shallow supervised ones}?
+We found that distorted training examples not only made the resulting
+classifier better on similarly perturbed images but also on
+the {\em original clean examples}, and more importantly and more novel,
+that deep architectures benefit more from such {\em out-of-distribution}
+examples. Shallow MLPs were helped by perturbed training examples when tested on perturbed input 
+images (65\% relative improvement on NISTP) 
+but only marginally helped (5\% relative improvement on all classes) 
+or even hurt (10\% relative loss on digits)
+with respect to clean examples. On the other hand, the deep SDAs
+were significantly boosted by these out-of-distribution examples.
+Similarly, whereas the improvement due to the multi-task setting was marginal or
+negative for the MLP (from +5.6\% to -3.6\% relative change), 
+it was quite significant for the SDA (from +13\% to +27\% relative change),
+which may be explained by the arguments below.
+Since out-of-distribution data
+(perturbed or from other related classes) is very common, this conclusion
+is of practical importance.
+%\end{itemize}
+
+In the original self-taught learning framework~\citep{RainaR2007}, the
+out-of-sample examples were used as a source of unsupervised data, and
+experiments showed its positive effects in a \emph{limited labeled data}
+scenario. However, many of the results by \citet{RainaR2007} (who used a
+shallow, sparse coding approach) suggest that the {\em relative gain of self-taught
+learning vs ordinary supervised learning} diminishes as the number of labeled examples increases.
+We note instead that, for deep
+architectures, our experiments show that such a positive effect is accomplished
+even in a scenario with a \emph{large number of labeled examples},
+i.e., here, the relative gain of self-taught learning and
+out-of-distribution examples is probably preserved
+in the asymptotic regime. However, note that in our perturbation experiments
+(but not in our multi-task experiments), 
+even the out-of-distribution examples are labeled, unlike in the
+earlier self-taught learning experiments~\citep{RainaR2007}.
+
+{\bf Why would deep learners benefit more from the self-taught learning 
+framework and out-of-distribution examples}?
+The key idea is that the lower layers of the predictor compute a hierarchy
+of features that can be shared across tasks or across variants of the
+input distribution. A theoretical analysis of generalization improvements
+due to sharing of intermediate features across tasks already points
+towards that explanation~\cite{baxter95a}.
+Intermediate features that can be used in different
+contexts can be estimated in a way that allows sharing statistical 
+strength. Features extracted through many levels are more likely to
+be more abstract and more invariant to some of the factors of variation
+in the underlying distribution (as the experiments in~\citet{Goodfellow2009} suggest),
+increasing the likelihood that they would be useful for a larger array
+of tasks and input conditions.
+Therefore, we hypothesize that both depth and unsupervised
+pre-training play a part in explaining the advantages observed here, and future
+experiments could attempt to tease apart these factors.
+And why would deep learners benefit from the self-taught learning
+scenarios even when the number of labeled examples is very large?
+We hypothesize that this is related to the hypotheses studied
+in~\citet{Erhan+al-2010}. In~\citet{Erhan+al-2010}
+it was found that online learning on a huge dataset did not make the
+advantage of the deep learning bias vanish, and a similar phenomenon
+may be happening here. We hypothesize that unsupervised pre-training
+of a deep hierarchy with out-of-distribution examples initializes the
+model in the basin of attraction of supervised gradient descent
+that corresponds to better generalization. Furthermore, such good
+basins of attraction are not discovered by pure supervised learning
+(with or without out-of-distribution examples) from random initialization, and more labeled examples
+do not allow the shallow or purely supervised models to discover
+the kind of better basins associated
+with deep learning and out-of-distribution examples.
+ 
+A Flash demo of the recognizer (where both the MLP and the SDA can be compared) 
+can be executed on-line at the anonymous site {\tt http://deep.host22.com}.
+
+\iffalse
+\section*{Appendix I: Detailed Numerical Results}
+
+These tables correspond to Figures 2 and 3 and contain the raw error rates for each model and dataset considered.
+They also contain additional data such as test errors on P07 and standard errors.
+
+\begin{table}[ht]
+\caption{Overall comparison of error rates ($\pm$ std.err.) on 62 character classes (10 digits +
+26 lower + 26 upper), except for last columns -- digits only, between deep architecture with pre-training
+(SDA=Stacked Denoising Autoencoder) and ordinary shallow architecture 
+(MLP=Multi-Layer Perceptron). The models shown are all trained using perturbed data (NISTP or P07)
+and using a validation set to select hyper-parameters and other training choices. 
+\{SDA,MLP\}0 are trained on NIST,
+\{SDA,MLP\}1 are trained on NISTP, and \{SDA,MLP\}2 are trained on P07.
+The human error rate on digits is a lower bound because it does not count digits that were
+recognized as letters. For comparison, the results found in the literature
+on NIST digits classification using the same test set are included.}
+\label{tab:sda-vs-mlp-vs-humans}
+\begin{center}
+\begin{tabular}{|l|r|r|r|r|} \hline
+      & NIST test          & NISTP test       & P07 test       & NIST test digits   \\ \hline
+Humans&   18.2\% $\pm$.1\%   &  39.4\%$\pm$.1\%   &  46.9\%$\pm$.1\%  &  $1.4\%$ \\ \hline 
+SDA0   &  23.7\% $\pm$.14\%  &  65.2\%$\pm$.34\%  & 97.45\%$\pm$.06\%  & 2.7\% $\pm$.14\%\\ \hline 
+SDA1   &  17.1\% $\pm$.13\%  &  29.7\%$\pm$.3\%  & 29.7\%$\pm$.3\%  & 1.4\% $\pm$.1\%\\ \hline 
+SDA2   &  18.7\% $\pm$.13\%  &  33.6\%$\pm$.3\%  & 39.9\%$\pm$.17\%  & 1.7\% $\pm$.1\%\\ \hline 
+MLP0   &  24.2\% $\pm$.15\%  & 68.8\%$\pm$.33\%  & 78.70\%$\pm$.14\%  & 3.45\% $\pm$.15\% \\ \hline 
+MLP1   &  23.0\% $\pm$.15\%  &  41.8\%$\pm$.35\%  & 90.4\%$\pm$.1\%  & 3.85\% $\pm$.16\% \\ \hline 
+MLP2   &  24.3\% $\pm$.15\%  &  46.0\%$\pm$.35\%  & 54.7\%$\pm$.17\%  & 4.85\% $\pm$.18\% \\ \hline 
+\citep{Granger+al-2007} &     &                    &                   & 4.95\% $\pm$.18\% \\ \hline
+\citep{Cortes+al-2000} &      &                    &                   & 3.71\% $\pm$.16\% \\ \hline
+\citep{Oliveira+al-2002} &    &                    &                   & 2.4\% $\pm$.13\% \\ \hline
+\citep{Milgram+al-2005} &      &                    &                   & 2.1\% $\pm$.12\% \\ \hline
+\end{tabular}
+\end{center}
+\end{table}
+
+\begin{table}[ht]
+\caption{Relative change in error rates due to the use of perturbed training data,
+either using NISTP, for the MLP1/SDA1 models, or using P07, for the MLP2/SDA2 models.
+A positive value indicates that training on the perturbed data helped for the
+given test set (the first 3 columns on the 62-class tasks and the last one is
+on the clean 10-class digits). Clearly, the deep learning models did benefit more
+from perturbed training data, even when testing on clean data, whereas the MLP
+trained on perturbed data performed worse on the clean digits and about the same
+on the clean characters. }
+\label{tab:perturbation-effect}
+\begin{center}
+\begin{tabular}{|l|r|r|r|r|} \hline
+      & NIST test          & NISTP test      & P07 test       & NIST test digits   \\ \hline
+SDA0/SDA1-1   &  38\%      &  84\%           & 228\%          &  93\% \\ \hline 
+SDA0/SDA2-1   &  27\%      &  94\%           & 144\%          &  59\% \\ \hline 
+MLP0/MLP1-1   &  5.2\%     &  65\%           & -13\%          & -10\%  \\ \hline 
+MLP0/MLP2-1   &  -0.4\%    &  49\%           & 44\%           & -29\% \\ \hline 
+\end{tabular}
+\end{center}
+\end{table}
+
+\begin{table}[ht]
+\caption{Test error rates and relative change in error rates due to the use of
+a multi-task setting, i.e., training on each task in isolation vs training
+for all three tasks together, for MLPs vs SDAs. The SDA benefits much
+more from the multi-task setting. All experiments are only on the
+unperturbed NIST data, using validation error for model selection.
+Relative improvement is 1 - single-task error / multi-task error.}
+\label{tab:multi-task}
+\begin{center}
+\begin{tabular}{|l|r|r|r|} \hline
+             & single-task  & multi-task  & relative \\ 
+             & setting      & setting     & improvement \\ \hline
+MLP-digits   &  3.77\%      &  3.99\%     & 5.6\%   \\ \hline 
+MLP-lower   &  17.4\%      &  16.8\%     &  -4.1\%    \\ \hline 
+MLP-upper   &  7.84\%     &  7.54\%      & -3.6\%    \\ \hline 
+SDA-digits   &  2.6\%      &  3.56\%     & 27\%    \\ \hline 
+SDA-lower   &  12.3\%      &  14.4\%    & 15\%    \\ \hline 
+SDA-upper   &  5.93\%     &  6.78\%      & 13\%    \\ \hline 
+\end{tabular}
+\end{center}
+\end{table}
+
+\fi
+
+%\afterpage{\clearpage}
+%\clearpage
+{
+%\bibliographystyle{spbasic}      % basic style, author-year citations
+\bibliographystyle{plainnat}
+\bibliography{strings,strings-short,strings-shorter,ift6266_ml,specials,aigaion-shorter}
+%\bibliographystyle{unsrtnat}
+%\bibliographystyle{apalike}
+}
+
+
+\end{document}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/writeup/aistats2011_revised.tex	Sat Mar 19 22:51:40 2011 -0400
@@ -0,0 +1,840 @@
+%\documentclass[twoside,11pt]{article} % For LaTeX2e
+\documentclass{article} % For LaTeX2e
+\usepackage{aistats2e_2011}
+%\usepackage{times}
+\usepackage{wrapfig}
+\usepackage{amsthm}
+\usepackage{amsmath}
+\usepackage{bbm}
+\usepackage[utf8]{inputenc}
+\usepackage[psamsfonts]{amssymb}
+%\usepackage{algorithm,algorithmic} % not used after all
+\usepackage{graphicx,subfigure}
+\usepackage[numbers]{natbib}
+
+\addtolength{\textwidth}{10mm}
+\addtolength{\evensidemargin}{-5mm}
+\addtolength{\oddsidemargin}{-5mm}
+
+%\setlength\parindent{0mm}
+
+\begin{document}
+
+\twocolumn[
+\aistatstitle{Deep Learners Benefit More from Out-of-Distribution Examples}
+\runningtitle{Deep Learners for Out-of-Distribution Examples}
+\runningauthor{Bengio et al.}
+\aistatsauthor{Anonymous Authors\\
+\vspace*{5mm}}]
+\iffalse
+Yoshua  Bengio \and
+Frédéric  Bastien \and
+Arnaud  Bergeron \and
+Nicolas  Boulanger-Lewandowski \and
+Thomas  Breuel \and
+Youssouf  Chherawala \and
+Moustapha  Cisse \and 
+Myriam  Côté \and 
+Dumitru  Erhan \and
+Jeremy  Eustache \and
+Xavier  Glorot \and 
+Xavier  Muller \and
+Sylvain  Pannetier Lebeuf \and
+Razvan  Pascanu \and 
+Salah  Rifai \and 
+Francois  Savard \and 
+Guillaume  Sicard 
+%}
+\fi
+%\aistatsaddress{Dept. IRO, U. Montreal, P.O. Box 6128, Centre-Ville branch, H3C 3J7, Montreal (Qc), Canada}
+%\date{{\tt bengioy@iro.umontreal.ca}, Dept. IRO, U. Montreal, P.O. Box 6128, Centre-Ville branch, H3C 3J7, Montreal (Qc), Canada}
+%\jmlrheading{}{2010}{}{10/2010}{XX/2011}{Yoshua Bengio et al}
+%\editor{}
+
+%\makeanontitle
+%\maketitle
+
+%{\bf Running title: Deep Self-Taught Learning}
+
+\vspace*{5mm}
+\begin{abstract}
+  Recent theoretical and empirical work in statistical machine learning has demonstrated the potential of learning algorithms for deep architectures, i.e., function classes obtained by composing multiple levels of representation. The hypothesis evaluated here is that intermediate levels of representation, because they can be shared across tasks and examples from different but related distributions, can yield even more benefits. Comparative experiments were performed on a large-scale handwritten character recognition setting with 62 classes (upper case, lower case, digits), using both a multi-task setting and perturbed examples in order to obtain out-of-distribution examples. The results agree with the hypothesis, and show that a deep learner did {\em beat previously published results and reached human-level performance}.
+\end{abstract}
+%\vspace*{-3mm}
+
+%\begin{keywords}  
+%Deep learning, self-taught learning, out-of-distribution examples, handwritten character recognition, multi-task learning
+%\end{keywords}
+%\keywords{self-taught learning \and multi-task learning \and out-of-distribution examples \and deep learning \and handwriting recognition}
+
+
+
+\section{Introduction}
+%\vspace*{-1mm}
+
+{\bf Deep Learning} has emerged as a promising new area of research in
+statistical machine learning~\citep{Hinton06,ranzato-07-small,Bengio-nips-2006,VincentPLarochelleH2008-very-small,ranzato-08,TaylorHintonICML2009,Larochelle-jmlr-2009,Salakhutdinov+Hinton-2009,HonglakL2009,HonglakLNIPS2009,Jarrett-ICCV2009,Taylor-cvpr-2010}. See \citet{Bengio-2009} for a review.
+Learning algorithms for deep architectures are centered on the learning
+of useful representations of data, which are better suited to the task at hand,
+and are organized in a hierarchy with multiple levels.
+This is in part inspired by observations of the mammalian visual cortex, 
+which consists of a chain of processing elements, each of which is associated with a
+different representation of the raw visual input. In fact,
+it was found recently that the features learnt in deep architectures resemble
+those observed in the first two of these stages (in areas V1 and V2
+of visual cortex) \citep{HonglakL2008}, and that they become more and
+more invariant to factors of variation (such as camera movement) in
+higher layers~\citep{Goodfellow2009}.
+It has been hypothesized that learning a hierarchy of features increases the
+ease and practicality of developing representations that are at once
+tailored to specific tasks, yet are able to borrow statistical strength
+from other related tasks (e.g., modeling different kinds of objects). Finally, learning the
+feature representation can lead to higher-level (more abstract, more
+general) features that are more robust to unanticipated sources of
+variance extant in real data.
+
+Whereas a deep architecture can in principle be more powerful than a
+shallow one in terms of representation, depth appears to render the
+training problem more difficult in terms of optimization and local minima.
+It is also only recently that successful algorithms were proposed to
+overcome some of these difficulties.  All are based on unsupervised
+learning, often in a greedy layer-wise ``unsupervised pre-training''
+stage~\citep{Bengio-2009}.  
+The principle is that each layer starting from
+the bottom is trained to represent its input (the output of the previous
+layer). After this
+unsupervised initialization, the stack of layers can be
+converted into a deep supervised feedforward neural network and fine-tuned by
+stochastic gradient descent.
+One of these layer initialization techniques,
+applied here, is the Denoising
+Auto-encoder~(DA)~\citep{VincentPLarochelleH2008-very-small} (see
+Figure~\ref{fig:da}), which performed similarly or 
+better~\citep{VincentPLarochelleH2008-very-small} than previously
+proposed Restricted Boltzmann Machines (RBM)~\citep{Hinton06} 
+in terms of unsupervised extraction
+of a hierarchy of features useful for classification. Each layer is trained
+to denoise its input, creating a layer of features that can be used as
+input for the next layer, forming a Stacked Denoising Auto-encoder (SDA).
+Note that training a Denoising Auto-encoder
+can actually be seen as training a particular RBM by an inductive
+principle different from maximum likelihood~\citep{Vincent-SM-2010}, 
+namely by Score Matching~\citep{Hyvarinen-2005,HyvarinenA2008}. 
+
+Previous comparative experimental results with stacking of RBMs and DAs
+to build deep supervised predictors had shown that they could outperform
+shallow architectures in a variety of settings, especially
+when the data involves complex interactions between many factors of 
+variation~\citep{LarochelleH2007,Bengio-2009}. Other experiments have suggested
+that the unsupervised layer-wise pre-training acted as a useful
+prior~\citep{Erhan+al-2010} that allows one to initialize a deep
+neural network in a relatively much smaller region of parameter space, 
+corresponding to better generalization.
+
+To further the understanding of the reasons for the good performance
+observed with deep learners, we focus here on the following {\em hypothesis}:
+intermediate levels of representation, especially when there are
+more such levels, can be exploited to {\bf share
+statistical strength across different but related types of examples},
+such as examples coming from other tasks than the task of interest
+(the multi-task setting), or examples coming from an overlapping
+but different distribution (images with different kinds of perturbations
+and noises, here). This is consistent with the hypotheses discussed
+in~\citet{Bengio-2009} regarding the potential advantage
+of deep learning and the idea that more levels of representation can
+give rise to more abstract, more general features of the raw input.
+
+This hypothesis is related to a learning setting called
+{\bf self-taught learning}~\citep{RainaR2007}, which combines principles
+of semi-supervised and multi-task learning: the learner can exploit examples
+that are unlabeled and possibly come from a distribution different from the target
+distribution, e.g., from other classes than those of interest. 
+It has already been shown that deep learners can clearly take advantage of
+unsupervised learning and unlabeled examples~\citep{Bengio-2009,WestonJ2008-small},
+but more needed to be done to explore the impact
+of {\em out-of-distribution} examples and of the {\em multi-task} setting
+(one exception is~\citep{CollobertR2008}, which shares and uses unsupervised
+pre-training only with the first layer). In particular the {\em relative
+advantage of deep learning} for these settings has not been evaluated.
+
+
+%
+The {\bf main claim} of this paper is that deep learners (with several levels of representation) can
+{\bf benefit more from out-of-distribution examples than shallow learners} (with a single
+level), both in the context of the multi-task setting and from
+ perturbed examples. Because we are able to improve on state-of-the-art
+performance and reach human-level performance
+on a large-scale task, we consider that this paper is also a contribution
+to advance the application of machine learning to handwritten character recognition.
+More precisely, we ask and answer the following questions:
+
+%\begin{enumerate}
+$\bullet$ %\item 
+Do the good results previously obtained with deep architectures on the
+MNIST digit images generalize to the setting of a similar but much larger and richer
+dataset, the NIST special database 19, with 62 classes and around 800k examples?
+
+$\bullet$ %\item 
+To what extent does the perturbation of input images (e.g. adding
+noise, affine transformations, background images) make the resulting
+classifiers better not only on similarly perturbed images but also on
+the {\em original clean examples}? We study this question in the
+context of the 62-class and 10-class tasks of the NIST special database 19.
+
+$\bullet$ %\item 
+Do deep architectures {\em benefit {\bf more} from such out-of-distribution}
+examples, in particular do they benefit more from 
+examples that are perturbed versions of the examples from the task of interest?
+
+$\bullet$ %\item 
+Similarly, does the feature learning step in deep learning algorithms benefit {\bf more}
+from training with moderately {\em different classes} (i.e. a multi-task learning scenario) than
+a corresponding shallow and purely supervised architecture?
+We train on 62 classes and test on 10 (digits) or 26 (upper case or lower case)
+to answer this question.
+%\end{enumerate}
+
+Our experimental results provide positive evidence towards all of these questions,
+as well as {\bf classifiers that reach human-level performance on 62-class isolated character
+recognition and beat previously published results on the NIST dataset (special database 19)}.
+To achieve these results, we introduce in the next section a sophisticated system
+for stochastically transforming character images and then explain the methodology,
+which is based on training with or without these transformed images and testing on 
+clean ones. 
+Code for generating these transformations as well as for the deep learning 
+algorithms is made available at {\tt http://anonymous.url.net}.%{\tt http://hg.assembla.com/ift6266}.
+
+%\vspace*{-3mm}
+%\newpage
+\section{Perturbed and Transformed Character Images}
+\label{s:perturbations}
+%\vspace*{-2mm}
+
+Figure~\ref{fig:transform} shows the different transformations we used to stochastically
+transform $32 \times 32$ source images (such as the one in Fig.~\ref{fig:torig})
+in order to obtain data from a larger distribution which
+covers a domain substantially larger than the clean characters distribution from
+which we start.
+Although character transformations have been used before to
+improve character recognizers, this effort is on a large scale both
+in number of classes and in the complexity of the transformations, hence
+in the complexity of the learning task.
+The code for these transformations (mostly Python) is available at 
+{\tt http://anonymous.url.net}. All the modules in the pipeline (Figure~\ref{fig:transform}) share
+a global control parameter ($0 \le complexity \le 1$) that allows one to modulate the
+amount of deformation or noise introduced. 
+There are two main parts in the pipeline. The first one,
+from thickness to pinch, performs transformations. The second
+part, from blur to contrast, adds different kinds of noise.
+More details can be found in~\citep{ift6266-tr-anonymous}.
+
+\begin{figure*}[ht]
+\centering
+\subfigure[Original]{\includegraphics[scale=0.6]{images/Original.png}\label{fig:torig}}
+\subfigure[Thickness]{\includegraphics[scale=0.6]{images/Thick_only.png}}
+\subfigure[Slant]{\includegraphics[scale=0.6]{images/Slant_only.png}}
+\subfigure[Affine Transformation]{\includegraphics[scale=0.6]{images/Affine_only.png}}
+\subfigure[Local Elastic Deformation]{\includegraphics[scale=0.6]{images/Localelasticdistorsions_only.png}}
+\subfigure[Pinch]{\includegraphics[scale=0.6]{images/Pinch_only.png}}
+%Noise
+\subfigure[Motion Blur]{\includegraphics[scale=0.6]{images/Motionblur_only.png}}
+\subfigure[Occlusion]{\includegraphics[scale=0.6]{images/occlusion_only.png}}
+\subfigure[Gaussian Smoothing]{\includegraphics[scale=0.6]{images/Bruitgauss_only.png}}
+\subfigure[Pixels Permutation]{\includegraphics[scale=0.6]{images/Permutpixel_only.png}}
+\subfigure[Gaussian Noise]{\includegraphics[scale=0.6]{images/Distorsiongauss_only.png}}
+\subfigure[Background Image Addition]{\includegraphics[scale=0.6]{images/background_other_only.png}}
+\subfigure[Salt \& Pepper]{\includegraphics[scale=0.6]{images/Poivresel_only.png}}
+\subfigure[Scratches]{\includegraphics[scale=0.6]{images/Rature_only.png}}
+\subfigure[Grey Level \& Contrast]{\includegraphics[scale=0.6]{images/Contrast_only.png}}
+\caption{Top left (a): example original image. Others (b-o): examples of the effect
+of each transformation module taken separately. Actual perturbed examples are obtained by
+a pipeline of these, with random choices about which module to apply and how much perturbation
+to apply.}
+\label{fig:transform}
+%\vspace*{-2mm}
+\end{figure*}
+
+%\vspace*{-3mm}
+\section{Experimental Setup}
+%\vspace*{-1mm}
+
+Much previous work on deep learning had been performed on
+the MNIST digits task~\citep{Hinton06,ranzato-07-small,Bengio-nips-2006,Salakhutdinov+Hinton-2009},
+with 60,000 examples, and variants involving 10,000
+examples~\citep{Larochelle-jmlr-2009,VincentPLarochelleH2008-very-small}.
+The focus here is on much larger training sets, from 10 times
+to 1000 times larger, and 62 classes.
+
+The first step in constructing the larger datasets (called NISTP and P07) is to sample from
+a {\em data source}: {\bf NIST} (NIST database 19), {\bf Fonts}, {\bf Captchas},
+and {\bf OCR data} (scanned machine printed characters). See more in 
+Section~\ref{sec:sources} below. Once a character
+is sampled from one of these sources (chosen randomly), the second step is to
+apply a pipeline of transformations and/or noise processes outlined in section \ref{s:perturbations}.
+
+To provide a baseline of error rate comparison we also estimate human performance
+on both the 62-class task and the 10-class digits task.
+We compare the best Multi-Layer Perceptrons (MLP) against
+the best Stacked Denoising Auto-encoders (SDA), when
+both models' hyper-parameters are selected to minimize the validation set error.
+We also provide a comparison against a precise estimate
+of human performance obtained via Amazon's Mechanical Turk (AMT)
+service ({\tt http://mturk.com}). 
+AMT users are paid small amounts
+of money to perform tasks for which human intelligence is required.
+Mechanical Turk has been used extensively in natural language processing and vision.
+%processing \citep{SnowEtAl2008} and vision
+%\citep{SorokinAndForsyth2008,whitehill09}. 
+AMT users were presented
+with 10 character images (from a test set) on a screen
+and asked to label them.
+They were forced to choose a single character class (either among the
+62 or 10 character classes) for each image.
+80 subjects classified 2500 images per (dataset,task) pair.
+Different human labelers sometimes provided a different label for the same
+example, and we were able to estimate the error variance due to this effect
+because each image was classified by 3 different persons. 
+The average error of humans on the 62-class task NIST test set
+is 18.2\%, with a standard error of 0.1\%.
+We controlled noise in the labelling process by (1)
+requiring AMT workers with a higher than normal average of accepted
+responses ($>$95\%) on other tasks (2) discarding responses that were not
+complete (10 predictions) (3) discarding responses for which the
+time to predict was smaller than 3 seconds for NIST (the mean response time
+was 20 seconds) and 6 seconds for NISTP (average response time of
+45 seconds) (4) discarding responses which were obviously wrong (10
+identical ones, or ``12345...''). Overall, after such filtering, we kept
+approximately 95\% of the AMT workers' responses.
+
+%\vspace*{-3mm}
+\subsection{Data Sources}
+\label{sec:sources}
+%\vspace*{-2mm}
+
+%\begin{itemize}
+%\item 
+{\bf NIST.}
+Our main source of characters is the NIST Special Database 19~\citep{Grother-1995}, 
+widely used for training and testing character
+recognition systems~\citep{Granger+al-2007,Cortes+al-2000,Oliveira+al-2002-short,Milgram+al-2005}. 
+The dataset is composed of 814255 digits and characters (upper and lower cases), with hand checked classifications,
+extracted from handwritten sample forms of 3600 writers. The characters are labelled by one of the 62 classes 
+corresponding to ``0''-``9'',``A''-``Z'' and ``a''-``z''. The dataset contains 8 parts (partitions) of varying complexity. 
+The fourth partition (called $hsf_4$, 82,587 examples), 
+experimentally recognized to be the most difficult one, is the one recommended 
+by NIST as a testing set and is used in our work as well as some previous work~\citep{Granger+al-2007,Cortes+al-2000,Oliveira+al-2002-short,Milgram+al-2005}
+for that purpose. We randomly split the remainder (731,668 examples) into a training set and a validation set for
+model selection. 
+The performances reported by previous work on that dataset mostly use only the digits.
+Here we use all the classes both in the training and testing phase. This is especially
+useful to estimate the effect of a multi-task setting.
+The distribution of the classes in the NIST training and test sets differs
+substantially, with relatively many more digits in the test set, and a more uniform distribution
+of letters in the test set (whereas in the training set they are distributed
+more like in natural text).
+%\vspace*{-1mm}
+
+%\item 
+{\bf Fonts.} 
+In order to have a good variety of sources we downloaded a large number of free fonts from:
+{\tt http://cg.scs.carleton.ca/\textasciitilde luc/freefonts.html}.
+% TODO: pointless to anonymize, it's not pointing to our work
+Including an operating system's (Windows 7) fonts, there is a total of $9817$ different fonts that we can choose uniformly from.
+The chosen {\tt ttf} file is either used as input of the Captcha generator (see next item) or, by producing a corresponding image, 
+directly as input to our models.
+%\vspace*{-1mm}
+
+%\item 
+{\bf Captchas.}
+The Captcha data source is an adaptation of the \emph{pycaptcha} library (a Python-based captcha generator library) for 
+generating characters of the same format as the NIST dataset. This software is based on
+a random character class generator and various kinds of transformations similar to those described in the previous sections. 
+In order to increase the variability of the data generated, many different fonts are used for generating the characters. 
+Transformations (slant, distortions, rotation, translation) are applied to each randomly generated character with a complexity
+depending on the value of the complexity parameter provided by the user of the data source. 
+%Two levels of complexity are allowed and can be controlled via an easy to use facade class. %TODO: what's a facade class?
+%\vspace*{-1mm}
+
+%\item 
+{\bf OCR data.}
+A large set (2 million) of scanned, OCRed and manually verified machine-printed 
+characters were included as an
+additional source. This set is part of a larger corpus being collected by the Image Understanding
+Pattern Recognition Research group led by Thomas Breuel at the University of Kaiserslautern 
+({\tt http://www.iupr.com}), and which will be publicly released.
+%TODO: let's hope that Thomas is not a reviewer! :) Seriously though, maybe we should anonymize this
+%\end{itemize}
+
+%\vspace*{-3mm}
+\subsection{Data Sets}
+%\vspace*{-2mm}
+
+All data sets contain 32$\times$32 grey-level images (values in $[0,1]$) associated with a label
+from one of the 62 character classes.
+%\begin{itemize}
+%\vspace*{-1mm}
+
+%\item 
+{\bf NIST.} This is the raw NIST special database 19~\citep{Grother-1995}. It has
+\{651,668 / 80,000 / 82,587\} \{training / validation / test\} examples.
+%\vspace*{-1mm}
+
+%\item 
+{\bf P07.} This dataset is obtained by taking raw characters from all four of the above sources
+and sending them through the transformation pipeline described in section \ref{s:perturbations}.
+For each new example to generate, a data source is selected with probability $10\%$ from the fonts,
+$25\%$ from the captchas, $25\%$ from the OCR data and $40\%$ from NIST. We apply all the transformations in the
+order given above, and for each of them we sample uniformly a \emph{complexity} in the range $[0,0.7]$.
+It has \{81,920,000 / 80,000 / 20,000\} \{training / validation / test\} examples
+obtained from the corresponding NIST sets plus other sources.
+%\vspace*{-1mm}
+
+%\item 
+{\bf NISTP.} This one is equivalent to P07 (complexity parameter of $0.7$ with the same proportions of data sources)
+  except that we only apply
+  transformations from slant to pinch (see Fig.~\ref{fig:transform}(b-f)).
+  Therefore, the character is
+  transformed but no additional noise is added to the image, giving images
+  closer to the NIST dataset. 
+It has \{81,920,000 / 80,000 / 20,000\} \{training / validation / test\} examples
+obtained from the corresponding NIST sets plus other sources.
+%\end{itemize}
+
+\begin{figure*}[ht]
+%\vspace*{-2mm}
+\centerline{\resizebox{0.8\textwidth}{!}{\includegraphics{images/denoising_autoencoder_small.pdf}}}
+%\vspace*{-2mm}
+\caption{Illustration of the computations and training criterion for the denoising
+auto-encoder used to pre-train each layer of the deep architecture. Input $x$ of
+the layer (i.e. raw input or output of previous layer)
+is corrupted into $\tilde{x}$ and encoded into code $y$ by the encoder $f_\theta(\cdot)$.
+The decoder $g_{\theta'}(\cdot)$ maps $y$ to reconstruction $z$, which
+is compared to the uncorrupted input $x$ through the loss function
+$L_H(x,z)$, whose expected value is approximately minimized during training
+by tuning $\theta$ and $\theta'$.}
+\label{fig:da}
+%\vspace*{-2mm}
+\end{figure*}
+
+%\vspace*{-3mm}
+\subsection{Models and their Hyper-parameters}
+%\vspace*{-2mm}
+
+The experiments are performed using MLPs (with a single
+hidden layer) and deep SDAs.
+\emph{Hyper-parameters are selected based on the {\bf NISTP} validation set error.}
+
+{\bf Multi-Layer Perceptrons (MLP).}  Whereas previous work had compared
+deep architectures to both shallow MLPs and SVMs, we only compared to MLPs
+here because of the very large datasets used (making the use of SVMs
+computationally challenging because of their quadratic scaling
+behavior). Preliminary experiments on training SVMs (libSVM) with subsets
+of the training set allowing the program to fit in memory yielded
+substantially worse results than those obtained with MLPs\footnote{RBF SVMs
+  trained with a subset of NISTP or NIST, 100k examples, to fit in memory,
+  yielded 64\% test error or worse; online linear SVMs trained on the whole
+  of NIST or 800k from NISTP yielded no better than 42\% error; slightly
+  better results were obtained by sparsifying the pixel intensities and
+  projecting to a second-order polynomial (a very sparse vector), still
+  41\% error. We expect that better results could be obtained with a
+  better implementation allowing for training with more examples and
+  a higher-order non-linear projection.}  For training on nearly a hundred million examples (with the
+perturbed data), the MLPs and SDA are much more convenient than classifiers
+based on kernel methods.  The MLP has a single hidden layer with $\tanh$
+activation functions, and softmax (normalized exponentials) on the output
+layer for estimating $P(class | image)$.  The number of hidden units is
+taken in $\{300,500,800,1000,1500\}$.  Training examples are presented in
+minibatches of size 20. A constant learning rate was chosen among $\{0.001,
+0.01, 0.025, 0.075, 0.1, 0.5\}$.
+%through preliminary experiments (measuring performance on a validation set),
+%and $0.1$ (which was found to work best) was then selected for optimizing on
+%the whole training sets.
+%\vspace*{-1mm}
+
+
+{\bf Stacked Denoising Auto-encoders (SDA).}
+Various auto-encoder variants and Restricted Boltzmann Machines (RBMs)
+can be used to initialize the weights of each layer of a deep MLP (with many hidden 
+layers)~\citep{Hinton06,ranzato-07-small,Bengio-nips-2006}, 
+apparently setting parameters in the
+basin of attraction of supervised gradient descent yielding better 
+generalization~\citep{Erhan+al-2010}.  This initial {\em unsupervised
+pre-training phase} uses all of the training images but not the training labels.
+Each layer is trained in turn to produce a new representation of its input
+(starting from the raw pixels).
+It is hypothesized that the
+advantage brought by this procedure stems from a better prior,
+on the one hand taking advantage of the link between the input
+distribution $P(x)$ and the conditional distribution of interest
+$P(y|x)$ (like in semi-supervised learning), and on the other hand
+taking advantage of the expressive power and bias implicit in the
+deep architecture (whereby complex concepts are expressed as
+compositions of simpler ones through a deep hierarchy).
+
+Here we chose to use the Denoising
+Auto-encoder~\citep{VincentPLarochelleH2008-very-small} as the building block for
+these deep hierarchies of features, as it is simple to train and
+explain (see Figure~\ref{fig:da}, as well as 
+tutorial and code there: {\tt http://deeplearning.net/tutorial}), 
+provides efficient inference, and yielded results
+comparable to or better than RBMs in a series of experiments
+\citep{VincentPLarochelleH2008-very-small}. It really corresponds to a Gaussian
+RBM trained by a Score Matching criterion~\cite{Vincent-SM-2010}.
+During training, a Denoising
+Auto-encoder is presented with a stochastically corrupted version
+of the input and trained to reconstruct the uncorrupted input,
+forcing the hidden units to represent the leading regularities in
+the data. Here we use the random binary masking corruption
+(which sets to 0 a random subset of the inputs).
+ Once it is trained, in a purely unsupervised way, 
+its hidden units' activations can
+be used as inputs for training a second one, etc.
+After this unsupervised pre-training stage, the parameters
+are used to initialize a deep MLP, which is fine-tuned by
+the same standard procedure used to train them (see above).
+The SDA hyper-parameters are the same as for the MLP, with the addition of the
+amount of corruption noise (we used the masking noise process, whereby a
+fixed proportion of the input values, randomly selected, are zeroed), and a
+separate learning rate for the unsupervised pre-training stage (selected
+from the same above set). The fraction of inputs corrupted was selected
+among $\{10\%, 20\%, 50\%\}$. Another hyper-parameter is the number
+of hidden layers but it was fixed to 3 for most experiments,
+based on previous work with
+SDAs on MNIST~\citep{VincentPLarochelleH2008-very-small}. 
+We also compared against 1 and against 2 hidden layers, in order
+to disentangle the effect of depth from the effect of unsupervised
+pre-training.
+The size of the hidden
+layers was kept constant across hidden layers, and the best results
+were obtained with the largest values that we could experiment
+with given our patience, with 1000 hidden units.
+
+%\vspace*{-1mm}
+
+\begin{figure*}[ht]
+%\vspace*{-2mm}
+\centerline{\resizebox{.99\textwidth}{!}{\includegraphics{images/error_rates_charts.pdf}}}
+%\vspace*{-3mm}
+\caption{SDAx are the {\bf deep} models. Error bars indicate a 95\% confidence interval. 0 indicates that the model was trained
+on NIST, 1 on NISTP, and 2 on P07. Left: overall results
+of all models, on NIST and NISTP test sets.
+Right: error rates on NIST test digits only, along with the previous results from 
+literature~\citep{Granger+al-2007,Cortes+al-2000,Oliveira+al-2002-short,Milgram+al-2005}
+respectively based on ART, nearest neighbors, MLPs, and SVMs.}
+\label{fig:error-rates-charts}
+%\vspace*{-2mm}
+\end{figure*}
+
+
+\begin{figure*}[ht]
+\vspace*{-3mm}
+\centerline{\resizebox{.99\textwidth}{!}{\includegraphics{images/improvements_charts.pdf}}}
+\vspace*{-3mm}
+\caption{Relative improvement in error rate due to out-of-distribution examples.
+Left: Improvement (or loss, when negative)
+induced by out-of-distribution examples (perturbed data). 
+Right: Improvement (or loss, when negative) induced by multi-task 
+learning (training on all classes and testing only on either digits,
+upper case, or lower-case). The deep learner (SDA) benefits more from
+out-of-distribution examples, compared to the shallow MLP.}
+\label{fig:improvements-charts}
+\vspace*{-2mm}
+\end{figure*}
+
+\vspace*{-2mm}
+\section{Experimental Results}
+\vspace*{-2mm}
+
+%%\vspace*{-1mm}
+%\subsection{SDA vs MLP vs Humans}
+%%\vspace*{-1mm}
+The models are either trained on NIST (MLP0 and SDA0), 
+NISTP (MLP1 and SDA1), or P07 (MLP2 and SDA2), and tested
+on either NIST, NISTP or P07 (regardless of the data set used for training),
+either on the 62-class task
+or on the 10-digits task. Training time (including about half
+for unsupervised pre-training, for DAs) on the larger
+datasets is around one day on a GPU (GTX 285).
+Figure~\ref{fig:error-rates-charts} summarizes the results obtained,
+comparing humans, the three MLPs (MLP0, MLP1, MLP2) and the three SDAs (SDA0, SDA1,
+SDA2), along with the previous results on the digits NIST special database
+19 test set from the literature, respectively based on ARTMAP neural
+networks ~\citep{Granger+al-2007}, fast nearest-neighbor search
+~\citep{Cortes+al-2000}, MLPs ~\citep{Oliveira+al-2002-short}, and SVMs
+~\citep{Milgram+al-2005}.%  More detailed and complete numerical results
+%(figures and tables, including standard errors on the error rates) can be
+%found in Appendix.
+The deep learner not only outperformed the shallow ones and
+previously published performance (in a statistically and qualitatively
+significant way) but when trained with perturbed data
+reaches human performance on both the 62-class task
+and the 10-class (digits) task. 
+17\% error (SDA1) or 18\% error (humans) may seem large but a large
+majority of the errors from humans and from SDA1 are from out-of-context
+confusions (e.g. a vertical bar can be a ``1'', an ``l'' or an ``L'', and a
+``c'' and a ``C'' are often indistinguishable).
+Regarding shallower networks pre-trained with unsupervised denoising
+auto-encoders, we find that the NIST test error is 21\% with one hidden
+layer and 20\% with two hidden layers (vs 17\% in the same conditions
+with 3 hidden layers). Compare this with the 23\% error achieved
+by the MLP, i.e. a single hidden layer and no unsupervised pre-training.
+As found in previous work~\cite{Erhan+al-2010,Larochelle-jmlr-2009}, 
+these results show that both depth and
+unsupervised pre-training need to be combined in order to achieve
+the best results.
+
+
+In addition, as shown in the left of
+Figure~\ref{fig:improvements-charts}, the relative improvement in error
+rate brought by out-of-distribution examples is greater for the deep
+SDA, and these
+differences with the shallow MLP are statistically and qualitatively
+significant. 
+The left side of the figure shows the improvement to the clean
+NIST test set error brought by the use of out-of-distribution examples
+(i.e. the perturbed examples from NISTP or P07),
+over the models trained exclusively on NIST (respectively SDA0 and MLP0).
+Relative percent change is measured by taking
+$100 \% \times$ (original model's error / perturbed-data model's error - 1).
+The right side of
+Figure~\ref{fig:improvements-charts} shows the relative improvement
+brought by the use of a multi-task setting, in which the same model is
+trained for more classes than the target classes of interest (i.e. training
+with all 62 classes when the target classes are respectively the digits,
+lower-case, or upper-case characters). Again, whereas the gain from the
+multi-task setting is marginal or negative for the MLP, it is substantial
+for the SDA.  Note that to simplify these multi-task experiments, only the original
+NIST dataset is used. For example, the MLP-digits bar shows the relative
+percent improvement in MLP error rate on the NIST digits test set 
+as $100\% \times$ (single-task
+model's error / multi-task model's error - 1).  The single-task model is
+trained with only 10 outputs (one per digit), seeing only digit examples,
+whereas the multi-task model is trained with 62 outputs, with all 62
+character classes as examples.  Hence the hidden units are shared across
+all tasks.  For the multi-task model, the digit error rate is measured by
+comparing the correct digit class with the output class associated with the
+maximum conditional probability among only the digit classes outputs.  The
+setting is similar for the other two target classes (lower case characters
+and upper case characters). Note however that some types of perturbations
+(NISTP) help more than others (P07) when testing on the clean images.
+%%\vspace*{-1mm}
+%\subsection{Perturbed Training Data More Helpful for SDA}
+%%\vspace*{-1mm}
+
+%%\vspace*{-1mm}
+%\subsection{Multi-Task Learning Effects}
+%%\vspace*{-1mm}
+
+\iffalse
+As previously seen, the SDA is better able to benefit from the
+transformations applied to the data than the MLP. In this experiment we
+define three tasks: recognizing digits (knowing that the input is a digit),
+recognizing upper case characters (knowing that the input is one), and
+recognizing lower case characters (knowing that the input is one).  We
+consider the digit classification task as the target task and we want to
+evaluate whether training with the other tasks can help or hurt, and
+whether the effect is different for MLPs versus SDAs.  The goal is to find
+out if deep learning can benefit more (or less) from multiple related tasks
+(i.e. the multi-task setting) compared to a corresponding purely supervised
+shallow learner.
+
+We use a single hidden layer MLP with 1000 hidden units, and a SDA
+with 3 hidden layers (1000 hidden units per layer), pre-trained and
+fine-tuned on NIST.
+
+Our results show that the MLP benefits marginally from the multi-task setting
+in the case of digits (5\% relative improvement) but is actually hurt in the case
+of characters (respectively 3\% and 4\% worse for lower and upper class characters).
+On the other hand the SDA benefited from the multi-task setting, with relative
+error rate improvements of 27\%, 15\% and 13\% respectively for digits,
+lower and upper case characters, as shown in Table~\ref{tab:multi-task}.
+\fi
+
+
+\vspace*{-2mm}
+\section{Conclusions and Discussion}
+\vspace*{-2mm}
+
+We have found that out-of-distribution examples (multi-task learning
+and perturbed examples) are more beneficial
+to a deep learner than to a traditional shallow and purely
+supervised learner. More precisely, 
+the answers are positive for all the questions asked in the introduction.
+%\begin{itemize}
+
+$\bullet$ %\item 
+{\bf Do the good results previously obtained with deep architectures on the
+MNIST digits generalize to a much larger and richer (but similar)
+dataset, the NIST special database 19, with 62 classes and around 800k examples}?
+Yes, the SDA {\em systematically outperformed the MLP and all the previously
+published results on this dataset} (the ones that we are aware of), {\em in fact reaching human-level
+performance} at around 17\% error on the 62-class task and 1.4\% on the digits,
+and beating previously published results on the same data.
+
+$\bullet$ %\item 
+{\bf To what extent do out-of-distribution examples help deep learners,
+and do they help them more than shallow supervised ones}?
+We found that distorted training examples not only made the resulting
+classifier better on similarly perturbed images but also on
+the {\em original clean examples}, and more importantly and more novel,
+that deep architectures benefit more from such {\em out-of-distribution}
+examples. Shallow MLPs were helped by perturbed training examples when tested on perturbed input 
+images (65\% relative improvement on NISTP) 
+but only marginally helped (5\% relative improvement on all classes) 
+or even hurt (10\% relative loss on digits)
+with respect to clean examples. On the other hand, the deep SDAs
+were significantly boosted by these out-of-distribution examples.
+Similarly, whereas the improvement due to the multi-task setting was marginal or
+negative for the MLP (from +5.6\% to -3.6\% relative change), 
+it was quite significant for the SDA (from +13\% to +27\% relative change),
+which may be explained by the arguments below.
+Since out-of-distribution data
+(perturbed or from other related classes) is very common, this conclusion
+is of practical importance.
+%\end{itemize}
+
+In the original self-taught learning framework~\citep{RainaR2007}, the
+out-of-sample examples were used as a source of unsupervised data, and
+experiments showed its positive effects in a \emph{limited labeled data}
+scenario. However, many of the results by \citet{RainaR2007} (who used a
+shallow, sparse coding approach) suggest that the {\em relative gain of self-taught
+learning vs ordinary supervised learning} diminishes as the number of labeled examples increases.
+We note instead that, for deep
+architectures, our experiments show that such a positive effect is accomplished
+even in a scenario with a \emph{large number of labeled examples},
+i.e., here, the relative gain of self-taught learning and
+out-of-distribution examples is probably preserved
+in the asymptotic regime. However, note that in our perturbation experiments
+(but not in our multi-task experiments), 
+even the out-of-distribution examples are labeled, unlike in the
+earlier self-taught learning experiments~\citep{RainaR2007}.
+
+{\bf Why would deep learners benefit more from the self-taught learning 
+framework and out-of-distribution examples}?
+The key idea is that the lower layers of the predictor compute a hierarchy
+of features that can be shared across tasks or across variants of the
+input distribution. A theoretical analysis of generalization improvements
+due to sharing of intermediate features across tasks already points
+towards that explanation~\cite{baxter95a}.
+Intermediate features that can be used in different
+contexts can be estimated in a way that allows sharing of statistical
+strength. Features extracted through many levels are more likely to
+be more abstract and more invariant to some of the factors of variation
+in the underlying distribution (as the experiments in~\citet{Goodfellow2009} suggest),
+increasing the likelihood that they would be useful for a larger array
+of tasks and input conditions.
+Therefore, we hypothesize that both depth and unsupervised
+pre-training play a part in explaining the advantages observed here, and future
+experiments could attempt to tease apart these factors.
+And why would deep learners benefit from the self-taught learning
+scenarios even when the number of labeled examples is very large?
+We hypothesize that this is related to the hypotheses studied
+in~\citet{Erhan+al-2010}. In~\citet{Erhan+al-2010}
+it was found that online learning on a huge dataset did not make the
+advantage of the deep learning bias vanish, and a similar phenomenon
+may be happening here. We hypothesize that unsupervised pre-training
+of a deep hierarchy with out-of-distribution examples initializes the
+model in the basin of attraction of supervised gradient descent
+that corresponds to better generalization. Furthermore, such good
+basins of attraction are not discovered by pure supervised learning
+(with or without out-of-distribution examples) from random initialization, and more labeled examples
+do not allow the shallow or purely supervised models to discover
+the kind of better basins associated
+with deep learning and out-of-distribution examples.
+ 
+A Flash demo of the recognizer (where both the MLP and the SDA can be compared) 
+can be executed on-line at the anonymous site {\tt http://deep.host22.com}.
+
+\iffalse
+\section*{Appendix I: Detailed Numerical Results}
+
+These tables correspond to Figures 2 and 3 and contain the raw error rates for each model and dataset considered.
+They also contain additional data such as test errors on P07 and standard errors.
+
+\begin{table}[ht]
+\caption{Overall comparison of error rates ($\pm$ std.err.) on 62 character classes (10 digits +
+26 lower + 26 upper), except for the last column (digits only), between a deep architecture with pre-training
+(SDA=Stacked Denoising Autoencoder) and an ordinary shallow architecture
+(MLP=Multi-Layer Perceptron). The models shown are all trained using perturbed data (NISTP or P07)
+and using a validation set to select hyper-parameters and other training choices. 
+\{SDA,MLP\}0 are trained on NIST,
+\{SDA,MLP\}1 are trained on NISTP, and \{SDA,MLP\}2 are trained on P07.
+The human error rate on digits is a lower bound because it does not count digits that were
+recognized as letters. For comparison, the results found in the literature
+on NIST digits classification using the same test set are included.}
+\label{tab:sda-vs-mlp-vs-humans}
+\begin{center}
+\begin{tabular}{|l|r|r|r|r|} \hline
+      & NIST test          & NISTP test       & P07 test       & NIST test digits   \\ \hline
+Humans&   18.2\% $\pm$.1\%   &  39.4\%$\pm$.1\%   &  46.9\%$\pm$.1\%  &  $1.4\%$ \\ \hline 
+SDA0   &  23.7\% $\pm$.14\%  &  65.2\%$\pm$.34\%  & 97.45\%$\pm$.06\%  & 2.7\% $\pm$.14\%\\ \hline 
+SDA1   &  17.1\% $\pm$.13\%  &  29.7\%$\pm$.3\%  & 29.7\%$\pm$.3\%  & 1.4\% $\pm$.1\%\\ \hline 
+SDA2   &  18.7\% $\pm$.13\%  &  33.6\%$\pm$.3\%  & 39.9\%$\pm$.17\%  & 1.7\% $\pm$.1\%\\ \hline 
+MLP0   &  24.2\% $\pm$.15\%  & 68.8\%$\pm$.33\%  & 78.70\%$\pm$.14\%  & 3.45\% $\pm$.15\% \\ \hline 
+MLP1   &  23.0\% $\pm$.15\%  &  41.8\%$\pm$.35\%  & 90.4\%$\pm$.1\%  & 3.85\% $\pm$.16\% \\ \hline 
+MLP2   &  24.3\% $\pm$.15\%  &  46.0\%$\pm$.35\%  & 54.7\%$\pm$.17\%  & 4.85\% $\pm$.18\% \\ \hline 
+\citep{Granger+al-2007} &     &                    &                   & 4.95\% $\pm$.18\% \\ \hline
+\citep{Cortes+al-2000} &      &                    &                   & 3.71\% $\pm$.16\% \\ \hline
+\citep{Oliveira+al-2002} &    &                    &                   & 2.4\% $\pm$.13\% \\ \hline
+\citep{Milgram+al-2005} &      &                    &                   & 2.1\% $\pm$.12\% \\ \hline
+\end{tabular}
+\end{center}
+\end{table}
+
+\begin{table}[ht]
+\caption{Relative change in error rates due to the use of perturbed training data,
+either using NISTP, for the MLP1/SDA1 models, or using P07, for the MLP2/SDA2 models.
+A positive value indicates that training on the perturbed data helped for the
+given test set (the first 3 columns on the 62-class tasks and the last one is
+on the clean 10-class digits). Clearly, the deep learning models did benefit more
+from perturbed training data, even when testing on clean data, whereas the MLP
+trained on perturbed data performed worse on the clean digits and about the same
+on the clean characters. }
+\label{tab:perturbation-effect}
+\begin{center}
+\begin{tabular}{|l|r|r|r|r|} \hline
+      & NIST test          & NISTP test      & P07 test       & NIST test digits   \\ \hline
+SDA0/SDA1-1   &  38\%      &  84\%           & 228\%          &  93\% \\ \hline 
+SDA0/SDA2-1   &  27\%      &  94\%           & 144\%          &  59\% \\ \hline 
+MLP0/MLP1-1   &  5.2\%     &  65\%           & -13\%          & -10\%  \\ \hline 
+MLP0/MLP2-1   &  -0.4\%    &  49\%           & 44\%           & -29\% \\ \hline 
+\end{tabular}
+\end{center}
+\end{table}
+
+\begin{table}[ht]
+\caption{Test error rates and relative change in error rates due to the use of
+a multi-task setting, i.e., training on each task in isolation vs training
+for all three tasks together, for MLPs vs SDAs. The SDA benefits much
+more from the multi-task setting. All experiments are only on the
+unperturbed NIST data, using validation error for model selection.
+Relative improvement is 1 - single-task error / multi-task error.}
+\label{tab:multi-task}
+\begin{center}
+\begin{tabular}{|l|r|r|r|} \hline
+             & single-task  & multi-task  & relative \\ 
+             & setting      & setting     & improvement \\ \hline
+MLP-digits   &  3.77\%      &  3.99\%     & 5.6\%   \\ \hline 
+MLP-lower   &  17.4\%      &  16.8\%     &  -4.1\%    \\ \hline 
+MLP-upper   &  7.84\%     &  7.54\%      & -3.6\%    \\ \hline 
+SDA-digits   &  2.6\%      &  3.56\%     & 27\%    \\ \hline 
+SDA-lower   &  12.3\%      &  14.4\%    & 15\%    \\ \hline 
+SDA-upper   &  5.93\%     &  6.78\%      & 13\%    \\ \hline 
+\end{tabular}
+\end{center}
+\end{table}
+
+\fi
+
+%\afterpage{\clearpage}
+%\clearpage
+{
+%\bibliographystyle{spbasic}      % basic style, author-year citations
+\bibliographystyle{plainnat}
+\bibliography{strings,strings-short,strings-shorter,ift6266_ml,specials,aigaion-shorter}
+%\bibliographystyle{unsrtnat}
+%\bibliographystyle{apalike}
+}
+
+
+\end{document}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/writeup/aistats2011_submission.tex	Sat Mar 19 22:51:40 2011 -0400
@@ -0,0 +1,807 @@
+%\documentclass[twoside,11pt]{article} % For LaTeX2e
+\documentclass{article} % For LaTeX2e
+\usepackage{aistats2e_2011}
+%\usepackage{times}
+\usepackage{wrapfig}
+\usepackage{amsthm}
+\usepackage{amsmath}
+\usepackage{bbm}
+\usepackage[utf8]{inputenc}
+\usepackage[psamsfonts]{amssymb}
+%\usepackage{algorithm,algorithmic} % not used after all
+\usepackage{graphicx,subfigure}
+\usepackage[numbers]{natbib}
+
+\addtolength{\textwidth}{10mm}
+\addtolength{\evensidemargin}{-5mm}
+\addtolength{\oddsidemargin}{-5mm}
+
+%\setlength\parindent{0mm}
+
+\begin{document}
+
+\twocolumn[
+\aistatstitle{Deep Learners Benefit More from Out-of-Distribution Examples}
+\runningtitle{Deep Learners for Out-of-Distribution Examples}
+\runningauthor{Bengio et al.}
+\aistatsauthor{Anonymous Authors}]
+\iffalse
+Yoshua  Bengio \and
+Frédéric  Bastien \and
+Arnaud  Bergeron \and
+Nicolas  Boulanger-Lewandowski \and
+Thomas  Breuel \and
+Youssouf  Chherawala \and
+Moustapha  Cisse \and 
+Myriam  Côté \and 
+Dumitru  Erhan \and
+Jeremy  Eustache \and
+Xavier  Glorot \and 
+Xavier  Muller \and
+Sylvain  Pannetier Lebeuf \and
+Razvan  Pascanu \and 
+Salah  Rifai \and 
+Francois  Savard \and 
+Guillaume  Sicard 
+%}
+\fi
+%\aistatsaddress{Dept. IRO, U. Montreal, P.O. Box 6128, Centre-Ville branch, H3C 3J7, Montreal (Qc), Canada}
+%\date{{\tt bengioy@iro.umontreal.ca}, Dept. IRO, U. Montreal, P.O. Box 6128, Centre-Ville branch, H3C 3J7, Montreal (Qc), Canada}
+%\jmlrheading{}{2010}{}{10/2010}{XX/2011}{Yoshua Bengio et al}
+%\editor{}
+
+%\makeanontitle
+%\maketitle
+
+%{\bf Running title: Deep Self-Taught Learning}
+
+%\vspace*{-2mm}
+\begin{abstract}
+  Recent theoretical and empirical work in statistical machine learning has demonstrated the potential of learning algorithms for deep architectures, i.e., function classes obtained by composing multiple levels of representation. The hypothesis evaluated here is that intermediate levels of representation, because they can be shared across tasks and examples from different but related distributions, can yield even more benefits. Comparative experiments were performed on a large-scale handwritten character recognition setting with 62 classes (upper case, lower case, digits), using both a multi-task setting and perturbed examples in order to obtain out-of-distribution examples. The results agree with the hypothesis, and show that a deep learner did {\em beat previously published results and reached human-level performance}.
+\end{abstract}
+%\vspace*{-3mm}
+
+%\begin{keywords}  
+%Deep learning, self-taught learning, out-of-distribution examples, handwritten character recognition, multi-task learning
+%\end{keywords}
+%\keywords{self-taught learning \and multi-task learning \and out-of-distribution examples \and deep learning \and handwriting recognition}
+
+
+
+\section{Introduction}
+%\vspace*{-1mm}
+
+{\bf Deep Learning} has emerged as a promising new area of research in
+statistical machine learning~\citep{Hinton06,ranzato-07-small,Bengio-nips-2006,VincentPLarochelleH2008-very-small,ranzato-08,TaylorHintonICML2009,Larochelle-jmlr-2009,Salakhutdinov+Hinton-2009,HonglakL2009,HonglakLNIPS2009,Jarrett-ICCV2009,Taylor-cvpr-2010}. See \citet{Bengio-2009} for a review.
+Learning algorithms for deep architectures are centered on the learning
+of useful representations of data, which are better suited to the task at hand,
+and are organized in a hierarchy with multiple levels.
+This is in part inspired by observations of the mammalian visual cortex, 
+which consists of a chain of processing elements, each of which is associated with a
+different representation of the raw visual input. In fact,
+it was found recently that the features learnt in deep architectures resemble
+those observed in the first two of these stages (in areas V1 and V2
+of visual cortex) \citep{HonglakL2008}, and that they become more and
+more invariant to factors of variation (such as camera movement) in
+higher layers~\citep{Goodfellow2009}.
+It has been hypothesized that learning a hierarchy of features increases the
+ease and practicality of developing representations that are at once
+tailored to specific tasks, yet are able to borrow statistical strength
+from other related tasks (e.g., modeling different kinds of objects). Finally, learning the
+feature representation can lead to higher-level (more abstract, more
+general) features that are more robust to unanticipated sources of
+variance extant in real data.
+
+Whereas a deep architecture can in principle be more powerful than a
+shallow one in terms of representation, depth appears to render the
+training problem more difficult in terms of optimization and local minima.
+It is also only recently that successful algorithms were proposed to
+overcome some of these difficulties.  All are based on unsupervised
+learning, often in a greedy layer-wise ``unsupervised pre-training''
+stage~\citep{Bengio-2009}.  
+The principle is that each layer starting from
+the bottom is trained to represent its input (the output of the previous
+layer). After this
+unsupervised initialization, the stack of layers can be
+converted into a deep supervised feedforward neural network and fine-tuned by
+stochastic gradient descent.
+One of these layer initialization techniques,
+applied here, is the Denoising
+Auto-encoder~(DA)~\citep{VincentPLarochelleH2008-very-small} (see
+Figure~\ref{fig:da}), which performed similarly or 
+better~\citep{VincentPLarochelleH2008-very-small} than previously
+proposed Restricted Boltzmann Machines (RBM)~\citep{Hinton06} 
+in terms of unsupervised extraction
+of a hierarchy of features useful for classification. Each layer is trained
+to denoise its input, creating a layer of features that can be used as
+input for the next layer, forming a Stacked Denoising Auto-encoder (SDA).
+Note that training a Denoising Auto-encoder
+can actually be seen as training a particular RBM by an inductive
+principle different from maximum likelihood~\citep{Vincent-SM-2010}, 
+namely by Score Matching~\citep{Hyvarinen-2005,HyvarinenA2008}. 
+
+Previous comparative experimental results with stacking of RBMs and DAs
+to build deep supervised predictors had shown that they could outperform
+shallow architectures in a variety of settings, especially
+when the data involves complex interactions between many factors of 
+variation~\citep{LarochelleH2007,Bengio-2009}. Other experiments have suggested
+that the unsupervised layer-wise pre-training acted as a useful
+prior~\citep{Erhan+al-2010} that allows one to initialize a deep
+neural network in a relatively much smaller region of parameter space, 
+corresponding to better generalization.
+
+To further the understanding of the reasons for the good performance
+observed with deep learners, we focus here on the following {\em hypothesis}:
+intermediate levels of representation, especially when there are
+more such levels, can be exploited to {\bf share
+statistical strength across different but related types of examples},
+such as examples coming from other tasks than the task of interest
+(the multi-task setting), or examples coming from an overlapping
+but different distribution (images with different kinds of perturbations
+and noises, here). This is consistent with the hypotheses discussed
+in~\citet{Bengio-2009} regarding the potential advantage
+of deep learning and the idea that more levels of representation can
+give rise to more abstract, more general features of the raw input.
+
+This hypothesis is related to a learning setting called
+{\bf self-taught learning}~\citep{RainaR2007}, which combines principles
+of semi-supervised and multi-task learning: the learner can exploit examples
+that are unlabeled and possibly come from a distribution different from the target
+distribution, e.g., from other classes than those of interest. 
+It has already been shown that deep learners can clearly take advantage of
+unsupervised learning and unlabeled examples~\citep{Bengio-2009,WestonJ2008-small},
+but more needed to be done to explore the impact
+of {\em out-of-distribution} examples and of the {\em multi-task} setting
+(one exception is~\citep{CollobertR2008}, which shares and uses unsupervised
+pre-training only with the first layer). In particular the {\em relative
+advantage of deep learning} for these settings has not been evaluated.
+
+
+%
+The {\bf main claim} of this paper is that deep learners (with several levels of representation) can
+{\bf benefit more from out-of-distribution examples than shallow learners} (with a single
+level), both in the context of the multi-task setting and from
+ perturbed examples. Because we are able to improve on state-of-the-art
+performance and reach human-level performance
+on a large-scale task, we consider that this paper is also a contribution
+to advance the application of machine learning to handwritten character recognition.
+More precisely, we ask and answer the following questions:
+
+%\begin{enumerate}
+$\bullet$ %\item 
+Do the good results previously obtained with deep architectures on the
+MNIST digit images generalize to the setting of a similar but much larger and richer
+dataset, the NIST special database 19, with 62 classes and around 800k examples?
+
+$\bullet$ %\item 
+To what extent does the perturbation of input images (e.g. adding
+noise, affine transformations, background images) make the resulting
+classifiers better not only on similarly perturbed images but also on
+the {\em original clean examples}? We study this question in the
+context of the 62-class and 10-class tasks of the NIST special database 19.
+
+$\bullet$ %\item 
+Do deep architectures {\em benefit {\bf more} from such out-of-distribution}
+examples, in particular do they benefit more from 
+examples that are perturbed versions of the examples from the task of interest?
+
+$\bullet$ %\item 
+Similarly, does the feature learning step in deep learning algorithms benefit {\bf more}
+from training with moderately {\em different classes} (i.e. a multi-task learning scenario) than
+a corresponding shallow and purely supervised architecture?
+We train on 62 classes and test on 10 (digits) or 26 (upper case or lower case)
+to answer this question.
+%\end{enumerate}
+
+Our experimental results provide positive evidence towards all of these questions,
+as well as {\bf classifiers that reach human-level performance on 62-class isolated character
+recognition and beat previously published results on the NIST dataset (special database 19)}.
+To achieve these results, we introduce in the next section a sophisticated system
+for stochastically transforming character images and then explain the methodology,
+which is based on training with or without these transformed images and testing on 
+clean ones. 
+Code for generating these transformations as well as for the deep learning 
+algorithms is made available at {\tt http://anonymous.url.net}.%{\tt http://hg.assembla.com/ift6266}.
+
+%\vspace*{-3mm}
+%\newpage
+\section{Perturbed and Transformed Character Images}
+\label{s:perturbations}
+%\vspace*{-2mm}
+
+Figure~\ref{fig:transform} shows the different transformations we used to stochastically
+transform $32 \times 32$ source images (such as the one in Fig.\ref{fig:torig})
+in order to obtain data from a larger distribution which
+covers a domain substantially larger than the clean characters distribution from
+which we start.
+Although character transformations have been used before to
+improve character recognizers, this effort is on a large scale both
+in number of classes and in the complexity of the transformations, hence
+in the complexity of the learning task.
+The code for these transformations (mostly Python) is available at 
+{\tt http://anonymous.url.net}. All the modules in the pipeline (Figure~\ref{fig:transform}) share
+a global control parameter ($0 \le complexity \le 1$) that allows one to modulate the
+amount of deformation or noise introduced. 
+There are two main parts in the pipeline. The first one,
+from thickness to pinch, performs transformations. The second
+part, from blur to contrast, adds different kinds of noise.
+More details can be found in~\citep{ift6266-tr-anonymous}.
+
+\begin{figure*}[ht]
+\centering
+\subfigure[Original]{\includegraphics[scale=0.6]{images/Original.png}\label{fig:torig}}
+\subfigure[Thickness]{\includegraphics[scale=0.6]{images/Thick_only.png}}
+\subfigure[Slant]{\includegraphics[scale=0.6]{images/Slant_only.png}}
+\subfigure[Affine Transformation]{\includegraphics[scale=0.6]{images/Affine_only.png}}
+\subfigure[Local Elastic Deformation]{\includegraphics[scale=0.6]{images/Localelasticdistorsions_only.png}}
+\subfigure[Pinch]{\includegraphics[scale=0.6]{images/Pinch_only.png}}
+%Noise
+\subfigure[Motion Blur]{\includegraphics[scale=0.6]{images/Motionblur_only.png}}
+\subfigure[Occlusion]{\includegraphics[scale=0.6]{images/occlusion_only.png}}
+\subfigure[Gaussian Smoothing]{\includegraphics[scale=0.6]{images/Bruitgauss_only.png}}
+\subfigure[Pixels Permutation]{\includegraphics[scale=0.6]{images/Permutpixel_only.png}}
+\subfigure[Gaussian Noise]{\includegraphics[scale=0.6]{images/Distorsiongauss_only.png}}
+\subfigure[Background Image Addition]{\includegraphics[scale=0.6]{images/background_other_only.png}}
+\subfigure[Salt \& Pepper]{\includegraphics[scale=0.6]{images/Poivresel_only.png}}
+\subfigure[Scratches]{\includegraphics[scale=0.6]{images/Rature_only.png}}
+\subfigure[Grey Level \& Contrast]{\includegraphics[scale=0.6]{images/Contrast_only.png}}
+\caption{Top left (a): example original image. Others (b-o): examples of the effect
+of each transformation module taken separately. Actual perturbed examples are obtained by
+a pipeline of these, with random choices about which module to apply and how much perturbation
+to apply.}
+\label{fig:transform}
+%\vspace*{-2mm}
+\end{figure*}
+
+%\vspace*{-3mm}
+\section{Experimental Setup}
+%\vspace*{-1mm}
+
+Much previous work on deep learning had been performed on
+the MNIST digits task~\citep{Hinton06,ranzato-07-small,Bengio-nips-2006,Salakhutdinov+Hinton-2009},
+with 60,000 examples, and variants involving 10,000
+examples~\citep{Larochelle-jmlr-2009,VincentPLarochelleH2008-very-small}.
+The focus here is on much larger training sets, from 10 times
+to 1000 times larger, and 62 classes.
+
+The first step in constructing the larger datasets (called NISTP and P07) is to sample from
+a {\em data source}: {\bf NIST} (NIST database 19), {\bf Fonts}, {\bf Captchas},
+and {\bf OCR data} (scanned machine printed characters). See more in 
+Section~\ref{sec:sources} below. Once a character
+is sampled from one of these sources (chosen randomly), the second step is to
+apply a pipeline of transformations and/or noise processes outlined in section \ref{s:perturbations}.
+
+To provide a baseline of error rate comparison we also estimate human performance
+on both the 62-class task and the 10-class digits task.
+We compare the best Multi-Layer Perceptrons (MLP) against
+the best Stacked Denoising Auto-encoders (SDA), when
+both models' hyper-parameters are selected to minimize the validation set error.
+We also provide a comparison against a precise estimate
+of human performance obtained via Amazon's Mechanical Turk (AMT)
+service ({\tt http://mturk.com}). 
+AMT users are paid small amounts
+of money to perform tasks for which human intelligence is required.
+Mechanical Turk has been used extensively in natural language processing and vision.
+%processing \citep{SnowEtAl2008} and vision
+%\citep{SorokinAndForsyth2008,whitehill09}. 
+AMT users were presented
+with 10 character images (from a test set) on a screen
+and asked to label them.
+They were forced to choose a single character class (either among the
+62 or 10 character classes) for each image.
+80 subjects classified 2500 images per (dataset,task) pair.
+Different human labelers sometimes provided a different label for the same
+example, and we were able to estimate the error variance due to this effect
+because each image was classified by 3 different persons. 
+The average error of humans on the 62-class task NIST test set
+is 18.2\%, with a standard error of 0.1\%.
+
+%\vspace*{-3mm}
+\subsection{Data Sources}
+\label{sec:sources}
+%\vspace*{-2mm}
+
+%\begin{itemize}
+%\item 
+{\bf NIST.}
+Our main source of characters is the NIST Special Database 19~\citep{Grother-1995}, 
+widely used for training and testing character
+recognition systems~\citep{Granger+al-2007,Cortes+al-2000,Oliveira+al-2002-short,Milgram+al-2005}. 
+The dataset is composed of 814255 digits and characters (upper and lower cases), with hand checked classifications,
+extracted from handwritten sample forms of 3600 writers. The characters are labelled by one of the 62 classes 
+corresponding to ``0''-``9'',``A''-``Z'' and ``a''-``z''. The dataset contains 8 parts (partitions) of varying complexity. 
+The fourth partition (called $hsf_4$, 82,587 examples), 
+experimentally recognized to be the most difficult one, is the one recommended 
+by NIST as a testing set and is used in our work as well as some previous work~\citep{Granger+al-2007,Cortes+al-2000,Oliveira+al-2002-short,Milgram+al-2005}
+for that purpose. We randomly split the remainder (731,668 examples) into a training set and a validation set for
+model selection. 
+The performances reported by previous work on that dataset mostly use only the digits.
+Here we use all the classes both in the training and testing phase. This is especially
+useful to estimate the effect of a multi-task setting.
+The distribution of the classes in the NIST training and test sets differs
+substantially, with relatively many more digits in the test set, and a more uniform distribution
+of letters in the test set (whereas in the training set they are distributed
+more like in natural text).
+%\vspace*{-1mm}
+
+%\item 
+{\bf Fonts.} 
+In order to have a good variety of sources we downloaded a large number of free fonts from:
+{\tt http://cg.scs.carleton.ca/\textasciitilde luc/freefonts.html}.
+% TODO: pointless to anonymize, it's not pointing to our work
+Including an operating system's (Windows 7) fonts, there is a total of $9817$ different fonts that we can choose uniformly from.
+The chosen {\tt ttf} file is either used as input of the Captcha generator (see next item) or, by producing a corresponding image, 
+directly as input to our models.
+%\vspace*{-1mm}
+
+%\item 
+{\bf Captchas.}
+The Captcha data source is an adaptation of the \emph{pycaptcha} library (a Python-based captcha generator library) for 
+generating characters of the same format as the NIST dataset. This software is based on
+a random character class generator and various kinds of transformations similar to those described in the previous sections. 
+In order to increase the variability of the data generated, many different fonts are used for generating the characters. 
+Transformations (slant, distortions, rotation, translation) are applied to each randomly generated character with a complexity
+depending on the value of the complexity parameter provided by the user of the data source. 
+%Two levels of complexity are allowed and can be controlled via an easy to use facade class. %TODO: what's a facade class?
+%\vspace*{-1mm}
+
+%\item 
+{\bf OCR data.}
+A large set (2 million) of scanned, OCRed and manually verified machine-printed 
+characters were included as an
+additional source. This set is part of a larger corpus being collected by the Image Understanding
+Pattern Recognition Research group led by Thomas Breuel at University of Kaiserslautern 
+({\tt http://www.iupr.com}), and which will be publicly released.
+%TODO: let's hope that Thomas is not a reviewer! :) Seriously though, maybe we should anonymize this
+%\end{itemize}
+
+%\vspace*{-3mm}
+\subsection{Data Sets}
+%\vspace*{-2mm}
+
+All data sets contain 32$\times$32 grey-level images (values in $[0,1]$) associated with a label
+from one of the 62 character classes.
+%\begin{itemize}
+%\vspace*{-1mm}
+
+%\item 
+{\bf NIST.} This is the raw NIST special database 19~\citep{Grother-1995}. It has
+\{651,668 / 80,000 / 82,587\} \{training / validation / test\} examples.
+%\vspace*{-1mm}
+
+%\item 
+{\bf P07.} This dataset is obtained by taking raw characters from all four of the above sources
+and sending them through the transformation pipeline described in section \ref{s:perturbations}.
+For each new example to generate, a data source is selected with probability $10\%$ from the fonts,
+$25\%$ from the captchas, $25\%$ from the OCR data and $40\%$ from NIST. We apply all the transformations in the
+order given above, and for each of them we sample uniformly a \emph{complexity} in the range $[0,0.7]$.
+It has \{81,920,000 / 80,000 / 20,000\} \{training / validation / test\} examples
+obtained from the corresponding NIST sets plus other sources.
+%\vspace*{-1mm}
+
+%\item 
+{\bf NISTP.} This one is equivalent to P07 (complexity parameter of $0.7$ with the same proportions of data sources)
+  except that we only apply
+  transformations from slant to pinch (see Fig.\ref{fig:transform}(b-f)).
+  Therefore, the character is
+  transformed but no additional noise is added to the image, giving images
+  closer to the NIST dataset. 
+It has \{81,920,000 / 80,000 / 20,000\} \{training / validation / test\} examples
+obtained from the corresponding NIST sets plus other sources.
+%\end{itemize}
+
+\begin{figure*}[ht]
+%\vspace*{-2mm}
+\centerline{\resizebox{0.8\textwidth}{!}{\includegraphics{images/denoising_autoencoder_small.pdf}}}
+%\vspace*{-2mm}
+\caption{Illustration of the computations and training criterion for the denoising
+auto-encoder used to pre-train each layer of the deep architecture. Input $x$ of
+the layer (i.e. raw input or output of previous layer)
+is corrupted into $\tilde{x}$ and encoded into code $y$ by the encoder $f_\theta(\cdot)$.
+The decoder $g_{\theta'}(\cdot)$ maps $y$ to reconstruction $z$, which
+is compared to the uncorrupted input $x$ through the loss function
+$L_H(x,z)$, whose expected value is approximately minimized during training
+by tuning $\theta$ and $\theta'$.}
+\label{fig:da}
+%\vspace*{-2mm}
+\end{figure*}
+
+%\vspace*{-3mm}
+\subsection{Models and their Hyper-parameters}
+%\vspace*{-2mm}
+
+The experiments are performed using MLPs (with a single
+hidden layer) and deep SDAs.
+\emph{Hyper-parameters are selected based on the {\bf NISTP} validation set error.}
+
+{\bf Multi-Layer Perceptrons (MLP).}
+Whereas previous work had compared deep architectures to both shallow MLPs and
+SVMs, we only compared to MLPs here because of the very large datasets used
+(making the use of SVMs computationally challenging because of their quadratic
+scaling behavior). Preliminary experiments on training SVMs (libSVM) with subsets of the training
+set allowing the program to fit in memory yielded substantially worse results
+than those obtained with MLPs. For training on nearly a hundred million examples
+(with the perturbed data), the MLPs and SDA are much more convenient than
+classifiers based on kernel methods.
+The MLP has a single hidden layer with $\tanh$ activation functions, and softmax (normalized
+exponentials) on the output layer for estimating $P(class | image)$.
+The number of hidden units is taken in $\{300,500,800,1000,1500\}$. 
+Training examples are presented in minibatches of size 20. A constant learning
+rate was chosen among $\{0.001, 0.01, 0.025, 0.075, 0.1, 0.5\}$.
+%through preliminary experiments (measuring performance on a validation set),
+%and $0.1$ (which was found to work best) was then selected for optimizing on
+%the whole training sets.
+%\vspace*{-1mm}
+
+
+{\bf Stacked Denoising Auto-encoders (SDA).}
+Various auto-encoder variants and Restricted Boltzmann Machines (RBMs)
+can be used to initialize the weights of each layer of a deep MLP (with many hidden 
+layers)~\citep{Hinton06,ranzato-07-small,Bengio-nips-2006}, 
+apparently setting parameters in the
+basin of attraction of supervised gradient descent yielding better 
+generalization~\citep{Erhan+al-2010}.  This initial {\em unsupervised
+pre-training phase} uses all of the training images but not the training labels.
+Each layer is trained in turn to produce a new representation of its input
+(starting from the raw pixels).
+It is hypothesized that the
+advantage brought by this procedure stems from a better prior,
+on the one hand taking advantage of the link between the input
+distribution $P(x)$ and the conditional distribution of interest
+$P(y|x)$ (like in semi-supervised learning), and on the other hand
+taking advantage of the expressive power and bias implicit in the
+deep architecture (whereby complex concepts are expressed as
+compositions of simpler ones through a deep hierarchy).
+
+Here we chose to use the Denoising
+Auto-encoder~\citep{VincentPLarochelleH2008-very-small} as the building block for
+these deep hierarchies of features, as it is simple to train and
+explain (see Figure~\ref{fig:da}, as well as 
+tutorial and code there: {\tt http://deeplearning.net/tutorial}), 
+provides efficient inference, and yielded results
+comparable to or better than RBMs in a series of experiments
+\citep{VincentPLarochelleH2008-very-small}. It really corresponds to a Gaussian
+RBM trained by a Score Matching criterion~\cite{Vincent-SM-2010}.
+During training, a Denoising
+Auto-encoder is presented with a stochastically corrupted version
+of the input and trained to reconstruct the uncorrupted input,
+forcing the hidden units to represent the leading regularities in
+the data. Here we use the random binary masking corruption
+(which sets to 0 a random subset of the inputs).
+ Once it is trained, in a purely unsupervised way, 
+its hidden units' activations can
+be used as inputs for training a second one, etc.
+After this unsupervised pre-training stage, the parameters
+are used to initialize a deep MLP, which is fine-tuned by
+the same standard procedure used to train MLPs (see above).
+The SDA hyper-parameters are the same as for the MLP, with the addition of the
+amount of corruption noise (we used the masking noise process, whereby a
+fixed proportion of the input values, randomly selected, are zeroed), and a
+separate learning rate for the unsupervised pre-training stage (selected
+from the same above set). The fraction of inputs corrupted was selected
+among $\{10\%, 20\%, 50\%\}$. Another hyper-parameter is the number
+of hidden layers but it was fixed to 3 based on previous work with
+SDAs on MNIST~\citep{VincentPLarochelleH2008-very-small}. The size of the hidden
+layers was kept constant across hidden layers, and the best results
+were obtained with the largest values that we could experiment
+with given our patience, with 1000 hidden units.
+
+%\vspace*{-1mm}
+
+\begin{figure*}[ht]
+%\vspace*{-2mm}
+\centerline{\resizebox{.99\textwidth}{!}{\includegraphics{images/error_rates_charts.pdf}}}
+%\vspace*{-3mm}
+\caption{SDAx are the {\bf deep} models. Error bars indicate a 95\% confidence interval. 0 indicates that the model was trained
+on NIST, 1 on NISTP, and 2 on P07. Left: overall results
+of all models, on NIST and NISTP test sets.
+Right: error rates on NIST test digits only, along with the previous results from 
+literature~\citep{Granger+al-2007,Cortes+al-2000,Oliveira+al-2002-short,Milgram+al-2005}
+respectively based on ART, nearest neighbors, MLPs, and SVMs.}
+\label{fig:error-rates-charts}
+%\vspace*{-2mm}
+\end{figure*}
+
+
+\begin{figure*}[ht]
+\vspace*{-3mm}
+\centerline{\resizebox{.99\textwidth}{!}{\includegraphics{images/improvements_charts.pdf}}}
+\vspace*{-3mm}
+\caption{Relative improvement in error rate due to out-of-distribution examples.
+Left: Improvement (or loss, when negative)
+induced by out-of-distribution examples (perturbed data). 
+Right: Improvement (or loss, when negative) induced by multi-task 
+learning (training on all classes and testing only on either digits,
+upper case, or lower-case). The deep learner (SDA) benefits more from
+out-of-distribution examples, compared to the shallow MLP.}
+\label{fig:improvements-charts}
+\vspace*{-2mm}
+\end{figure*}
+
+\vspace*{-2mm}
+\section{Experimental Results}
+\vspace*{-2mm}
+
+%%\vspace*{-1mm}
+%\subsection{SDA vs MLP vs Humans}
+%%\vspace*{-1mm}
+The models are either trained on NIST (MLP0 and SDA0), 
+NISTP (MLP1 and SDA1), or P07 (MLP2 and SDA2), and tested
+on either NIST, NISTP or P07 (regardless of the data set used for training),
+either on the 62-class task
+or on the 10-digits task. Training time (including about half
+for unsupervised pre-training, for DAs) on the larger
+datasets is around one day on a GPU (GTX 285).
+Figure~\ref{fig:error-rates-charts} summarizes the results obtained,
+comparing humans, the three MLPs (MLP0, MLP1, MLP2) and the three SDAs (SDA0, SDA1,
+SDA2), along with the previous results on the digits NIST special database
+19 test set from the literature, respectively based on ARTMAP neural
+networks ~\citep{Granger+al-2007}, fast nearest-neighbor search
+~\citep{Cortes+al-2000}, MLPs ~\citep{Oliveira+al-2002-short}, and SVMs
+~\citep{Milgram+al-2005}.%  More detailed and complete numerical results
+%(figures and tables, including standard errors on the error rates) can be
+%found in Appendix.
+The deep learner not only outperformed the shallow ones and
+previously published performance (in a statistically and qualitatively
+significant way) but when trained with perturbed data
+reaches human performance on both the 62-class task
+and the 10-class (digits) task. 
+17\% error (SDA1) or 18\% error (humans) may seem large but a large
+majority of the errors from humans and from SDA1 are from out-of-context
+confusions (e.g. a vertical bar can be a ``1'', an ``l'' or an ``L'', and a
+``c'' and a ``C'' are often indistinguishable).
+
+In addition, as shown in the left of
+Figure~\ref{fig:improvements-charts}, the relative improvement in error
+rate brought by out-of-distribution examples is greater for the deep
+SDA, and these
+differences with the shallow MLP are statistically and qualitatively
+significant. 
+The left side of the figure shows the improvement to the clean
+NIST test set error brought by the use of out-of-distribution examples
+(i.e. the perturbed examples from NISTP or P07),
+over the models trained exclusively on NIST (respectively SDA0 and MLP0).
+Relative percent change is measured by taking
+$100 \% \times$ (original model's error / perturbed-data model's error - 1).
+The right side of
+Figure~\ref{fig:improvements-charts} shows the relative improvement
+brought by the use of a multi-task setting, in which the same model is
+trained for more classes than the target classes of interest (i.e. training
+with all 62 classes when the target classes are respectively the digits,
+lower-case, or upper-case characters). Again, whereas the gain from the
+multi-task setting is marginal or negative for the MLP, it is substantial
+for the SDA.  Note that to simplify these multi-task experiments, only the original
+NIST dataset is used. For example, the MLP-digits bar shows the relative
+percent improvement in MLP error rate on the NIST digits test set 
+as $100\% \times$ (single-task
+model's error / multi-task model's error - 1).  The single-task model is
+trained with only 10 outputs (one per digit), seeing only digit examples,
+whereas the multi-task model is trained with 62 outputs, with all 62
+character classes as examples.  Hence the hidden units are shared across
+all tasks.  For the multi-task model, the digit error rate is measured by
+comparing the correct digit class with the output class associated with the
+maximum conditional probability among only the digit classes outputs.  The
+setting is similar for the other two target classes (lower case characters
+and upper case characters). Note however that some types of perturbations
+(NISTP) help more than others (P07) when testing on the clean images.
+%%\vspace*{-1mm}
+%\subsection{Perturbed Training Data More Helpful for SDA}
+%%\vspace*{-1mm}
+
+%%\vspace*{-1mm}
+%\subsection{Multi-Task Learning Effects}
+%%\vspace*{-1mm}
+
+\iffalse
+As previously seen, the SDA is better able to benefit from the
+transformations applied to the data than the MLP. In this experiment we
+define three tasks: recognizing digits (knowing that the input is a digit),
+recognizing upper case characters (knowing that the input is one), and
+recognizing lower case characters (knowing that the input is one).  We
+consider the digit classification task as the target task and we want to
+evaluate whether training with the other tasks can help or hurt, and
+whether the effect is different for MLPs versus SDAs.  The goal is to find
+out if deep learning can benefit more (or less) from multiple related tasks
+(i.e. the multi-task setting) compared to a corresponding purely supervised
+shallow learner.
+
+We use a single hidden layer MLP with 1000 hidden units, and a SDA
+with 3 hidden layers (1000 hidden units per layer), pre-trained and
+fine-tuned on NIST.
+
+Our results show that the MLP benefits marginally from the multi-task setting
+in the case of digits (5\% relative improvement) but is actually hurt in the case
+of characters (respectively 3\% and 4\% worse for lower and upper case characters).
+On the other hand the SDA benefited from the multi-task setting, with relative
+error rate improvements of 27\%, 15\% and 13\% respectively for digits,
+lower and upper case characters, as shown in Table~\ref{tab:multi-task}.
+\fi
+
+
+\vspace*{-2mm}
+\section{Conclusions and Discussion}
+\vspace*{-2mm}
+
+We have found that out-of-distribution examples (multi-task learning
+and perturbed examples) are more beneficial
+to a deep learner than to a traditional shallow and purely
+supervised learner. More precisely, 
+the answers are positive for all the questions asked in the introduction.
+%\begin{itemize}
+
+$\bullet$ %\item 
+{\bf Do the good results previously obtained with deep architectures on the
+MNIST digits generalize to a much larger and richer (but similar)
+dataset, the NIST special database 19, with 62 classes and around 800k examples}?
+Yes, the SDA {\em systematically outperformed the MLP and all the previously
+published results on this dataset} (the ones that we are aware of), {\em in fact reaching human-level
+performance} at around 17\% error on the 62-class task and 1.4\% on the digits,
+and beating previously published results on the same data.
+
+$\bullet$ %\item 
+{\bf To what extent do out-of-distribution examples help deep learners,
+and do they help them more than shallow supervised ones}?
+We found that distorted training examples not only made the resulting
+classifier better on similarly perturbed images but also on
+the {\em original clean examples}, and more importantly and more novel,
+that deep architectures benefit more from such {\em out-of-distribution}
+examples. Shallow MLPs were helped by perturbed training examples when tested on perturbed input 
+images (65\% relative improvement on NISTP) 
+but only marginally helped (5\% relative improvement on all classes) 
+or even hurt (10\% relative loss on digits)
+with respect to clean examples. On the other hand, the deep SDAs
+were significantly boosted by these out-of-distribution examples.
+Similarly, whereas the improvement due to the multi-task setting was marginal or
+negative for the MLP (from +5.6\% to -3.6\% relative change), 
+it was quite significant for the SDA (from +13\% to +27\% relative change),
+which may be explained by the arguments below.
+Since out-of-distribution data
+(perturbed or from other related classes) is very common, this conclusion
+is of practical importance.
+%\end{itemize}
+
+In the original self-taught learning framework~\citep{RainaR2007}, the
+out-of-sample examples were used as a source of unsupervised data, and
+experiments showed its positive effects in a \emph{limited labeled data}
+scenario. However, many of the results by \citet{RainaR2007} (who used a
+shallow, sparse coding approach) suggest that the {\em relative gain of self-taught
+learning vs ordinary supervised learning} diminishes as the number of labeled examples increases.
+We note instead that, for deep
+architectures, our experiments show that such a positive effect is accomplished
+even in a scenario with a \emph{large number of labeled examples},
+i.e., here, the relative gain of self-taught learning and
+out-of-distribution examples is probably preserved
+in the asymptotic regime. However, note that in our perturbation experiments
+(but not in our multi-task experiments), 
+even the out-of-distribution examples are labeled, unlike in the
+earlier self-taught learning experiments~\citep{RainaR2007}.
+
+{\bf Why would deep learners benefit more from the self-taught learning 
+framework and out-of-distribution examples}?
+The key idea is that the lower layers of the predictor compute a hierarchy
+of features that can be shared across tasks or across variants of the
+input distribution. A theoretical analysis of generalization improvements
+due to sharing of intermediate features across tasks already points
+towards that explanation~\cite{baxter95a}.
+Intermediate features that can be used in different
+contexts can be estimated in a way that allows sharing statistical 
+strength. Features extracted through many levels are more likely to
+be more abstract and more invariant to some of the factors of variation
+in the underlying distribution (as the experiments in~\citet{Goodfellow2009} suggest),
+increasing the likelihood that they would be useful for a larger array
+of tasks and input conditions.
+Therefore, we hypothesize that both depth and unsupervised
+pre-training play a part in explaining the advantages observed here, and future
+experiments could attempt to tease apart these factors.
+And why would deep learners benefit from the self-taught learning
+scenarios even when the number of labeled examples is very large?
+We hypothesize that this is related to the hypotheses studied
+in~\citet{Erhan+al-2010}. In~\citet{Erhan+al-2010}
+it was found that online learning on a huge dataset did not make the
+advantage of the deep learning bias vanish, and a similar phenomenon
+may be happening here. We hypothesize that unsupervised pre-training
+of a deep hierarchy with out-of-distribution examples initializes the
+model in the basin of attraction of supervised gradient descent
+that corresponds to better generalization. Furthermore, such good
+basins of attraction are not discovered by pure supervised learning
+(with or without out-of-distribution examples) from random initialization, and more labeled examples
+do not allow the shallow or purely supervised models to discover
+the kind of better basins associated
+with deep learning and out-of-distribution examples.
+ 
+A Flash demo of the recognizer (where both the MLP and the SDA can be compared) 
+can be executed on-line at the anonymous site {\tt http://deep.host22.com}.
+
+\iffalse
+\section*{Appendix I: Detailed Numerical Results}
+
+These tables correspond to Figures 2 and 3 and contain the raw error rates for each model and dataset considered.
+They also contain additional data such as test errors on P07 and standard errors.
+
+\begin{table}[ht]
+\caption{Overall comparison of error rates ($\pm$ std.err.) on 62 character classes (10 digits +
+26 lower + 26 upper), except for last columns -- digits only, between deep architecture with pre-training
+(SDA=Stacked Denoising Autoencoder) and ordinary shallow architecture 
+(MLP=Multi-Layer Perceptron). The models shown are all trained using perturbed data (NISTP or P07)
+and using a validation set to select hyper-parameters and other training choices. 
+\{SDA,MLP\}0 are trained on NIST,
+\{SDA,MLP\}1 are trained on NISTP, and \{SDA,MLP\}2 are trained on P07.
+The human error rate on digits is a lower bound because it does not count digits that were
+recognized as letters. For comparison, the results found in the literature
+on NIST digits classification using the same test set are included.}
+\label{tab:sda-vs-mlp-vs-humans}
+\begin{center}
+\begin{tabular}{|l|r|r|r|r|} \hline
+      & NIST test          & NISTP test       & P07 test       & NIST test digits   \\ \hline
+Humans&   18.2\% $\pm$.1\%   &  39.4\%$\pm$.1\%   &  46.9\%$\pm$.1\%  &  $1.4\%$ \\ \hline 
+SDA0   &  23.7\% $\pm$.14\%  &  65.2\%$\pm$.34\%  & 97.45\%$\pm$.06\%  & 2.7\% $\pm$.14\%\\ \hline 
+SDA1   &  17.1\% $\pm$.13\%  &  29.7\%$\pm$.3\%  & 29.7\%$\pm$.3\%  & 1.4\% $\pm$.1\%\\ \hline 
+SDA2   &  18.7\% $\pm$.13\%  &  33.6\%$\pm$.3\%  & 39.9\%$\pm$.17\%  & 1.7\% $\pm$.1\%\\ \hline 
+MLP0   &  24.2\% $\pm$.15\%  & 68.8\%$\pm$.33\%  & 78.70\%$\pm$.14\%  & 3.45\% $\pm$.15\% \\ \hline 
+MLP1   &  23.0\% $\pm$.15\%  &  41.8\%$\pm$.35\%  & 90.4\%$\pm$.1\%  & 3.85\% $\pm$.16\% \\ \hline 
+MLP2   &  24.3\% $\pm$.15\%  &  46.0\%$\pm$.35\%  & 54.7\%$\pm$.17\%  & 4.85\% $\pm$.18\% \\ \hline 
+\citep{Granger+al-2007} &     &                    &                   & 4.95\% $\pm$.18\% \\ \hline
+\citep{Cortes+al-2000} &      &                    &                   & 3.71\% $\pm$.16\% \\ \hline
+\citep{Oliveira+al-2002} &    &                    &                   & 2.4\% $\pm$.13\% \\ \hline
+\citep{Milgram+al-2005} &      &                    &                   & 2.1\% $\pm$.12\% \\ \hline
+\end{tabular}
+\end{center}
+\end{table}
+
+\begin{table}[ht]
+\caption{Relative change in error rates due to the use of perturbed training data,
+either using NISTP, for the MLP1/SDA1 models, or using P07, for the MLP2/SDA2 models.
+A positive value indicates that training on the perturbed data helped for the
+given test set (the first 3 columns on the 62-class tasks and the last one is
+on the clean 10-class digits). Clearly, the deep learning models did benefit more
+from perturbed training data, even when testing on clean data, whereas the MLP
+trained on perturbed data performed worse on the clean digits and about the same
+on the clean characters. }
+\label{tab:perturbation-effect}
+\begin{center}
+\begin{tabular}{|l|r|r|r|r|} \hline
+      & NIST test          & NISTP test      & P07 test       & NIST test digits   \\ \hline
+SDA0/SDA1-1   &  38\%      &  84\%           & 228\%          &  93\% \\ \hline 
+SDA0/SDA2-1   &  27\%      &  94\%           & 144\%          &  59\% \\ \hline 
+MLP0/MLP1-1   &  5.2\%     &  65\%           & -13\%          & -10\%  \\ \hline 
+MLP0/MLP2-1   &  -0.4\%    &  49\%           & 44\%           & -29\% \\ \hline 
+\end{tabular}
+\end{center}
+\end{table}
+
+\begin{table}[ht]
+\caption{Test error rates and relative change in error rates due to the use of
+a multi-task setting, i.e., training on each task in isolation vs training
+for all three tasks together, for MLPs vs SDAs. The SDA benefits much
+more from the multi-task setting. All experiments are only on the
+unperturbed NIST data, using validation error for model selection.
+Relative improvement is 1 - single-task error / multi-task error.}
+\label{tab:multi-task}
+\begin{center}
+\begin{tabular}{|l|r|r|r|} \hline
+             & single-task  & multi-task  & relative \\ 
+             & setting      & setting     & improvement \\ \hline
+MLP-digits   &  3.77\%      &  3.99\%     & 5.6\%   \\ \hline 
+MLP-lower   &  17.4\%      &  16.8\%     &  -4.1\%    \\ \hline 
+MLP-upper   &  7.84\%     &  7.54\%      & -3.6\%    \\ \hline 
+SDA-digits   &  2.6\%      &  3.56\%     & 27\%    \\ \hline 
+SDA-lower   &  12.3\%      &  14.4\%    & 15\%    \\ \hline 
+SDA-upper   &  5.93\%     &  6.78\%      & 13\%    \\ \hline 
+\end{tabular}
+\end{center}
+\end{table}
+
+\fi
+
+%\afterpage{\clearpage}
+%\clearpage
+{
+%\bibliographystyle{spbasic}      % basic style, author-year citations
+\bibliographystyle{plainnat}
+\bibliography{strings,strings-short,strings-shorter,ift6266_ml,specials,aigaion-shorter}
+%\bibliographystyle{unsrtnat}
+%\bibliographystyle{apalike}
+}
+
+
+\end{document}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/writeup/aistats_review_response.txt	Sat Mar 19 22:51:40 2011 -0400
@@ -0,0 +1,31 @@
+
+We thank the reviewers for their thoughtful comments. Please find our responses below.
+
+* Comparisons with shallower networks, but using unsupervised pre-training. We have added those results to the paper. On the NIST test set, 62 classes,
+using NISTP to train (which gives the best results on NIST):
+  MLP (1 hidden layer, no unsupervised pre-training): 24% error
+  DA  (1 hidden layer, unsupervised pre-training):    21% error
+  SDA (2 hidden layers, unsupervised pre-training):   20% error
+  SDA (3 hidden layers, unsupervised pre-training):   17% error
+Previous work in our group with very similar data (the InfiniteMNIST dataset) was published in JMLR in 2010 ("Why Does Unsupervised Pre-training Help Deep Learning?"). The results indeed show improvement when going from 1 to 2 and then 3 layers, even when using unsupervised pre-training (RBM or Denoising Auto-Encoder). The experiment helps to disentangle to some extent the effect of depth from the effect of unsupervised pre-training, and confirms that both are required to achieve the best results.
+
+* Comparisons with SVMs. The main limitation of course is the size of the training set. One option is to use a non-linear SVM with a reduced training set, and the other is to use an online linear SVM.  Another option is to project the input non-linearly in a high-dimensional but sparse representation and then use an online linear SVM.  For this, we have thresholded input pixel gray levels and projected into the space of order-2 products. Results:
+
+SVM type   training set   input               online    validation test set
+            type / size   features            training  set error    error
+                                              error
+Linear SVM,  NIST,  651k,  original,           36.62%,  34.41%,     42.26%
+Linear SVM,  NIST,  651k,  sparse quadratic,   30.96%,  28.00%,     41.28%
+Linear SVM,  NISTP, 800k,  original,           88.50%,  85.24%,     87.36%
+Linear SVM,  NISTP, 800k,  sparse quadratic,   81.76%,  83.69%,     85.56%
+RBF SVM,     NISTP, 100k,  original,           74.73%,  56.57%,     64.22%
+
+The best results were obtained with the sparse quadratic input features, and training on the clean data (NIST) rather than the perturbed data (NISTP).  A summary of the above results was added to the revised paper.
+
+
+* Using distorted characters as the corruption process of the Denoising Auto-Encoder (DAE). We had already performed preliminary experiments with this idea and results varied depending on the type of distortion, but did not improve on the original noise process. We believe that the DAE learns good features when the target to reconstruct is more likely than the corrupted input. Hence distortions that are *plausible* in the input distribution (such as translation, rotation, scaling, etc.) are not very useful, whereas corruption due to a form of noise is useful. Consider also the symmetries involved: a translation is as likely to be to the right as to the left, so it is hard to predict.
+
+* Human labeling: We controlled noise in the labelling process by (1) requiring AMT workers with a higher than normal average of accepted responses (>95%) on other tasks (2) discarding responses that were not complete (10 predictions) (3) discarding responses for which the time to predict was smaller than 3 seconds for NIST (the mean response time was 20 seconds) and 6 seconds for NISTP (average response time of 45 seconds) (4) discarding responses which were obviously wrong (10 identical ones, or "12345..."). Overall, after such filtering, we kept approximately 95% of the AMT workers' responses. The above paragraph was added to the revision. We thank the reviewer for the suggestion about multi-stage questionnaires; we will definitely consider this as an option next time we perform this experiment. However, to be fair, if we were to do so, we should also consider the same multi-stage decision process for the machine learning algorithms as well.
+
+* Size of labeled set: in our JMLR 2010 paper on deep learning (cited above, see fig. 11), we already verified the effect of the number of labeled examples on the deep learners and shallow learners (with or without unsupervised pre-training). Basically (and somewhat surprisingly) the deep learners with unsupervised pre-training can take more advantage of a large amount of labeled examples, presumably because of the initialization effect, and this effect does not disappear when the number of labeled examples increases. Similar results were obtained in the semi-supervised setting (Lee et al, NIPS2009).  Adding the training curve in the self-taught settings of this AISTATS submission is a good idea, and we will have it for the final version.
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/writeup/authors	Sat Mar 19 22:51:40 2011 -0400
@@ -0,0 +1,19 @@
+Yoshua  Bengio
+Frédéric  Bastien
+Arnaud  Bergeron
+Nicolas  Boulanger-Lewandowski
+Thomas  Breuel
+Youssouf  Chherawala
+Moustapha  Cisse 
+Myriam  Côté 
+Dumitru  Erhan
+Jeremy  Eustache
+Xavier  Glorot 
+Xavier  Muller
+Sylvain  Pannetier Lebeuf
+Razvan  Pascanu 
+Salah  Rifai 
+Francois  Savard 
+Guillaume  Sicard
+
+bengioy@iro.umontreal.ca; bastienf@iro.umontreal.ca; bergearn@iro.umontreal.ca; boulanni@iro.umontreal.ca; tmb@informatik.uni-kl.de; chherawy@iro.umontreal.ca; cissemou@iro.umontreal.ca; cotemyri@iro.umontreal.ca; erhandum@iro.umontreal.ca; eustachj@iro.umontreal.ca; glorotxa@iro.umontreal.ca; mullerx@iro.umontreal.ca; pannetis@iro.umontreal.ca; r.pascanu@gmail.com; rifaisal@iro.umontreal.ca; francois.savard@polymtl.ca; guitch21@gmail.com
--- a/writeup/ift6266_ml.bib	Sat Mar 19 22:49:33 2011 -0400
+++ b/writeup/ift6266_ml.bib	Sat Mar 19 22:51:40 2011 -0400
@@ -1688,7 +1688,6 @@
   address =      "Santa Cruz, California",
   pages =        "311--320",
   year =         "1995",
-  url =          "http://citeseer.ist.psu.edu/baxter95learning.html",
 }
 
 @Unpublished{baxter95b,
@@ -9753,7 +9752,7 @@
 
 
 @Article{Hinton06,
-  author =       "Goeffrey E. Hinton and Simon Osindero and {Yee Whye} Teh",
+  author =       "Geoffrey E. Hinton and Simon Osindero and {Yee Whye} Teh",
   title =        "A fast learning algorithm for deep belief nets",
   journal =      "Neural Computation",
   volume =       "18",
@@ -25843,3 +25842,13 @@
  institution = "University X.",
  year = 2010,
 }
+
+
+@techreport{Vincent-SM-2010,
+ author = "Pascal Vincent",
+ title = "A connection between {S}core {M}atching and {D}enoising {A}utoencoders",
+ institution = "Universite de Montreal",
+ number = 1359,
+ year = 2010,
+}
+ 
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/writeup/jmlr_review1.txt	Sat Mar 19 22:51:40 2011 -0400
@@ -0,0 +1,77 @@
+This paper presents an experimental analysis of the generalization effects of supervised learning leveraging additional out-of-distribution data and certain kinds of perturbations and transformations of examples (handwritten characters). Overall, I feel the paper is interesting, but in its current form the basic content would be more suitable for a conference publication than JMLR.
+There are no new algorithmic advances proposed, as the authors use a number of existing techniques (neural networks, deep learning auto-encoders, multi-task learning, semi-supervised learning and self-taught learning).
+What they do show is that some combinations of these approaches might be quite useful for deep networks. However, I feel there are some missing points both in the text and the experiments themselves, that I detail below.
+
+Comments about the Introduction:
+
+Firstly, a small point: the introduction does not do a good job in explaining the two main topics: "deep learning" and particularly "self-taught learning" (and as there is really no "middle" part of this paper, it just goes straight to experiments after the introduction, there is little elsewhere, either). The deep-learning paragraph explains multi-layer neural nets and why they might be useful, and states "deep learning has emerged as a promising new area of research", but it seems to me the only new area of research is the way they are trained, which should be explained here in the text -- that is not mentioned which is misleading.  (Actually something about deep learning is explained later, but it seems as if it is in the wrong section, it is in the ``self-taught learning'' paragraphs.) More importantly I feel that the self-taught learning section fails to explain adequately what self-taught learning even is. It is written:
+
+``Self-taught learning (Raina et al., 2007) is a paradigm that combines principles of semi-supervised and multi-task learning: the learner can exploit examples that are unlabeled and possibly come from a distribution different from the target distribution, e.g., from other classes than those of interest.''
+
+Firstly, this tries to explain one concept by introducing two others that are not explained (semi-supervised learning and multi-task learning). Secondly, I don't think it's clear from that description that there is also labeled data involved here. I think Raina's website explains it more clearly:
+
+"In self-taught learning, we are given a small amount of labeled data for a supervised learning task, and lots of additional unlabeled data that does not share the labels of the supervised problem and does not arise from the same distribution. This paper introduces an algorithm for self-taught learning based on sparse coding."
+
+Comparing the two descriptions I also find the word ``possibly'' troubling in the paper -- why write ``possibly'' here? If the data is not out-of-distribution, then this is just semi-supervised learning, isn't it? 
+
+I think as this paper hinges on deep learning and self-taught learning more should be done to explain them. In particular, very little of Raina et al.'s approach is explained, e.g. the algorithm they used or the experiments that were conducted. Moreover, other papers have worked on the same setting, and a section discussing prior work should be added. In particular:
+
+	J. Weston, R. Collobert, F. Sinz, L. Bottou and V. Vapnik. "Inference with the Universum", ICML 2006
+
+also studies algorithms for learning with labeled data + out-of-sample unlabeled data, and even have experiments with hand-written character recognition with many classes.
+Also, I guess that several works have looked at learning in the case of a different distribution in training than in test, e.g to name one:
+
+	Yishay Mansour, Mehryar Mohri, and Afshin Rostamizadeh. 
+	Domain adaptation: Learning bounds and algorithms. 
+	In Proceedings of The 22nd Annual Conference on Learning Theory (COLT 2009). Montréal, Canada, June 2009. Omnipress. Longer arxiv version.
+
+Perhaps that area of research is worth mentioning too.
+
+The introduction also states:  ``It has already been shown that deep learners can clearly take advantage of unsupervised learning and unlabeled examples (Bengio, 2009; Weston et al., 2008),  but more needs to be done to explore the impact of out-of-distribution examples and of the multi-task setting (one exception is (Collobert and Weston, 2008), which uses a different kind of learning algorithm). In particular the relative advantage of deep learning for these settings has not been evaluated. "
+
+Several points here:
+- The first sentence should make it clear this is semi-supervised learning that also uses labeled examples (I do not think it is clear).
+- I'm not sure what ``which uses a different kind of learning algorithm'' means -- different to what? To the algorithm in this paper, to Raina et al., or something else.. ? 
+- I believe if one is going to discuss the multi-task setting, then several other works should be cited and explained, in particular:
+	Rich Caruana, "Multitask Learning," Ph.D. Thesis, School of Computer Science, CMU, 1997.
+for multi-tasking in neural networks (although I am sure there are many other works as well), and:
+        A Framework for Learning Predictive Structures from Multiple Tasks and Unlabeled Data. Rie K. Ando and Tong Zhang. Journal of Machine Learning Research, Vol 6:1817-1853, 2005. 
+which uses multi-tasking in the setting of semi-supervised learning. I'm sure there are other works as well.
+- Finally, I believe there are more ``exceptions'' than Collobert and Weston, 2008. For example: 
+	H. Mobahi, R. Collobert, J. Weston. Deep Learning from Temporal Coherence in Video. ICML 2009. 
+seems to directly compare within distribution and out-of-distribution unlabeled data for convolutional networks. The fact that there are already papers on this topic (and that you do not take the time to explain the differences between these and your own work) lessens the impact. 
+
+I think the phrase ``Whereas a deep architecture can in principle be more powerful than a shallow one in terms of representation'' cannot be written without at least a citation, and I think it depends what you mean by the word ``powerful'' doesn't it? E.g. can't you have infinite VC dimension with a shallow representation?  (Also, I don't think you define what a ``shallow learner'' is anywhere, more explanation always helps.) Also, I feel it would be better if ``sharing of statistical strength'', which is in italics, was explained.
+
+Finally, this is just a style point, but I feel there is too much use of bold and italics at the end of the introduction. You should sell your paper, but sometimes one can go overboard.
+ 
+
+Section 2:
+
+Section 2 is a relatively large chunk of the paper (3 pages) and could basically be put in the appendix, couldn't it? Or else, little is done to justify why it is placed in the paper right after the introduction.  After that section, we are straight to the experiments -- it feels like the paper has some missing sections and was not fully written somehow. It goes straight from the introduction to ``Perturbed and Transformed Character Images'' which is not what I was expecting. For example, I was expecting more details of self-taught learning and why it would help.
+I think describing/citing previous work on learning invariances and transforming images would make sense in the context of this section too.
+
+Experiments:
+
+``Much previous work on deep learning had been performed on the MNIST digits task (Hinton et al., 2006; Ranzato et al., 2007; Bengio et al., 2007; Salakhutdinov and Hinton, 2009), with 60 000 examples, and variants involving 10 000 examples (Larochelle et al., 2009b; Vincent et al., 2008b). The focus here is on much larger training sets, from 10 times to to 1000 times larger, and 62 classes.''
+I feel this is unfair. There are many large scale deep learning papers with large datasets. You should make that clear, e.g.:
+
+	Large-scale Deep Unsupervised Learning using Graphics Processors, Rajat Raina, Anand Madhavan, Andrew Y. Ng , ICML 2009
+
+to name one, but there are many others...
+
+Sec. 3: `` The average error of humans on the 62-class task NIST test set is 18.2%, with a standard error of 0.1%.''.  I think at this point you should explain why this is so high.
+
+``Preliminary experiments on training SVMs (libSVM) with subsets of the training set allowing the program to fit in memory yielded substantially worse results than MLPs."
+-- I think you should mention here work done trying to speed up SVMs for exactly this task, e.g.:
+	Gaëlle Loosli, Stéphane Canu and Léon Bottou: Training Invariant Support Vector Machines using Selective Sampling, in Large Scale Kernel Machines,  301–320, MIT Press, Cambridge, MA., 2007.
+where the authors trained an SVM on 8100000 examples generated from MNIST. Also, showing a learning curve might be nice if you can't do the training with the full data. 
+
+The experimental results look on the whole good. However, I still feel the following issues could be resolved:
+
+- The shallow MLP as I understand has a max number of hidden units of 1500, whereas the deep MLP has three layers of 1000 hidden units. Hence, the deep MLPs have a lot more capacity. So shouldn't you try shallow MLPs with more hidden units? It would also be good to show training and test error rates for different number of hidden units.
+- If many shallow and deep MLP methods, and other non-MLP methods, have been compared on MNIST, why not compare on that as well? You can still do this in a self-taught learning setup, e.g. using other data as unlabeled data, no?
+- The idea of transforming digits seems closer to learning invariances than self-taught learning to me? This should be discussed.
+- There is no comparison to Raina et al., despite using their idea of ``self-taught learning'' in the title. Indeed, could Raina et al.'s algorithm be compared in both shallow and deep mode? I feel that, as this is only an experimental paper, more permutations could be done to understand this phenomenon more.
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/writeup/jmlr_review2.txt	Sat Mar 19 22:51:40 2011 -0400
@@ -0,0 +1,11 @@
+The paper “Deep Self-Taught Learning for Handwritten Character Recognition” by Bengio et al. claims that deep neural networks benefit more from self-taught learning than shallow ones.
+
+The paper presents neural network models applied to handwritten character recognition. Various transformations and noise injection modes to generate additional training data are introduced to get so-called “out-of-distribution” examples. MLPs with one hidden layer are then trained on various data sets in a fully supervised way and compared with three-hidden layer MLPs where each layer is initialized in an unsupervised way and then fine-tuned using Back-Propagation. It is then concluded that deep learners benefit more from out-of-distribution examples as well as from a multi-task setting.
+
+It is well known that artificially increasing the training data by either adding noise or incorporating some prior knowledge in the generation of new data points acts as a regularizer and helps to improve performance (Simard et al. 2003, ICDAR). It is therefore not very surprising that deep architectures with a higher complexity profit more from this procedure. The paper suggests that MLPs (with one hidden layer) perform worse than deep SDAs (i.e. pretrained MLPs with three hidden layers), especially when the training data is artificially increased.  I would argue that an MLP with three hidden layers trained in a fully supervised way would also perform better with respect to the 1-hidden layer MLP. Therefore it would have been interesting to see results of such an MLP. Only in this way would a fair comparison between shallow vs. deep MLPs, as well as supervised vs. unsupervised training, be possible.
+
+This paper claims that deep architectures with unsupervised pre-training outperform shallow ones and that additional training data is more beneficial for deep architectures. I think the authors should have compared their SDA with a 3-hidden-layer MLP to support this claim. Furthermore it is claimed that unsupervised pre-training is required to successfully train deep (3-hidden-layer) MLPs. However, there are no experiments in this paper that justify this claim and I also would argue that deep MLPs can be successfully trained with Back-Propagation, especially if enough training data is available (Ciresan et al 2010, Neural Computation). I therefore strongly encourage the authors to either include the result of such an experiment or adjust the conclusion accordingly.
+
+To cut a long story short, this paper wants to establish SDAs as the state of the art for character recognition, without even checking if deep MLPs trained in the usual supervised way are better or not. I ran a simple test and trained a three hidden-layer MLP (500-500-500) on deformed NIST and obtained a test error rate of 1.08% on the un-deformed NIST test set compared with 1.4% for the SDA in Table 1. For this particular task a three hidden layer MLP outperforms an even bigger SDA. I am therefore not fully convinced that unsupervised pretraining is necessary to obtain good performance for the presented task.
+
+The extensive use of font styles made it hard to follow the paper. It is also very difficult to understand with what data which networks were trained. Especially in Table 1, Appendix it is not clear if the nets in the last column tested on digits are trained on 62 characters or only on digits.
--- a/writeup/nips10submit_e.sty	Sat Mar 19 22:49:33 2011 -0400
+++ b/writeup/nips10submit_e.sty	Sat Mar 19 22:51:40 2011 -0400
@@ -32,7 +32,7 @@
 
 % Define nipsfinal, set to true if nipsfinalcopy is defined  
 \newif\ifnipsfinal
-\nipsfinalfalse
+\nipsfinaltrue
 \def\nipsfinalcopy{\nipsfinaltrue}
 \font\nipstenhv  = phvb at 8pt % *** IF THIS FAILS, SEE nips10submit_e.sty ***
 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/writeup/nips2010_cameraready.tex	Sat Mar 19 22:51:40 2011 -0400
@@ -0,0 +1,1060 @@
+\documentclass{article} % For LaTeX2e
+\usepackage{nips10submit_e,times}
+\usepackage{wrapfig}
+\usepackage{amsthm,amsmath,bbm} 
+\usepackage[psamsfonts]{amssymb}
+\usepackage{algorithm,algorithmic}
+\usepackage[utf8]{inputenc}
+\usepackage{graphicx,subfigure}
+\usepackage[numbers]{natbib}
+
+\addtolength{\textwidth}{10mm}
+\addtolength{\textheight}{10mm}
+\addtolength{\topmargin}{-5mm}
+\addtolength{\evensidemargin}{-5mm}
+\addtolength{\oddsidemargin}{-5mm}
+
+%\setlength\parindent{0mm}
+
+\title{Deep Self-Taught Learning for Handwritten Character Recognition}
+\author{
+Frédéric  Bastien, 
+Yoshua  Bengio, 
+Arnaud  Bergeron, 
+Nicolas  Boulanger-Lewandowski,
+Thomas  Breuel,\\ 
+{\bf Youssouf  Chherawala, 
+Moustapha  Cisse, 
+Myriam  Côté, 
+Dumitru  Erhan,
+Jeremy  Eustache,}\\
+{\bf Xavier  Glorot, 
+Xavier  Muller,
+Sylvain  Pannetier Lebeuf,
+Razvan  Pascanu,} \\
+{\bf Salah  Rifai, 
+Francois  Savard, 
+Guillaume  Sicard}\\
+Dept. IRO, U. Montreal 
+}
+
+\begin{document}
+
+%\makeanontitle
+\maketitle
+
+\vspace*{-2mm}
+\begin{abstract}
+  Recent theoretical and empirical work in statistical machine learning has
+  demonstrated the importance of learning algorithms for deep
+  architectures, i.e., function classes obtained by composing multiple
+  non-linear transformations. Self-taught learning (exploiting unlabeled
+  examples or examples from other distributions) has already been applied
+  to deep learners, but mostly to show the advantage of unlabeled
+  examples. Here we explore the advantage brought by {\em out-of-distribution examples}.
+For this purpose we
+  developed a powerful generator of stochastic variations and noise
+  processes for character images, including not only affine transformations
+  but also slant, local elastic deformations, changes in thickness,
+  background images, grey level changes, contrast, occlusion, and various
+  types of noise. The out-of-distribution examples are obtained from these
+  highly distorted images or by including examples of object classes
+  different from those in the target test set.
+  We show that {\em deep learners benefit
+    more from them than a corresponding shallow learner}, at least in the area of
+  handwritten character recognition. In fact, we show that they reach
+  human-level performance on both handwritten digit classification and
+  62-class handwritten character recognition.  
+\end{abstract}
+\vspace*{-3mm}
+
+\section{Introduction}
+\vspace*{-1mm}
+
+{\bf Deep Learning} has emerged as a promising new area of research in
+statistical machine learning~\citep{Hinton06}
+(see \citet{Bengio-2009} for a review).
+Learning algorithms for deep architectures are centered on the learning
+of useful representations of data, which are better suited to the task at hand,
+and are organized in a hierarchy with multiple levels.
+This is in part inspired by observations of the mammalian visual cortex, 
+which consists of a chain of processing elements, each of which is associated with a
+different representation of the raw visual input. In fact,
+it was found recently that the features learnt in deep architectures resemble
+those observed in the first two of these stages (in areas V1 and V2
+of visual cortex)~\citep{HonglakL2008}, and that they become more and
+more invariant to factors of variation (such as camera movement) in
+higher layers~\citep{Goodfellow2009}.
+It has been hypothesized that learning a hierarchy of features increases the
+ease and practicality of developing representations that are at once
+tailored to specific tasks, yet are able to borrow statistical strength
+from other related tasks (e.g., modeling different kinds of objects). Finally, learning the
+feature representation can lead to higher-level (more abstract, more
+general) features that are more robust to unanticipated sources of
+variance extant in real data.
+
+{\bf Self-taught learning}~\citep{RainaR2007} is a paradigm that combines principles
+of semi-supervised and multi-task learning: the learner can exploit examples
+that are unlabeled and possibly come from a distribution different from the target
+distribution, e.g., from other classes than those of interest. 
+It has already been shown that deep learners can clearly take advantage of
+unsupervised learning and unlabeled examples~\citep{Bengio-2009,WestonJ2008-small},
+but more needs to be done to explore the impact
+of {\em out-of-distribution} examples and of the multi-task setting
+(one exception is~\citep{CollobertR2008}, which uses a different kind
+of learning algorithm). In particular the {\em relative
+advantage} of deep learning for these settings has not been evaluated.
+The hypothesis discussed in the conclusion is that a deep hierarchy of features
+may be better able to provide sharing of statistical strength
+between different regions in input space or different tasks.
+
+Previous comparative experimental results with stacking of RBMs and DAs
+to build deep supervised predictors had shown that they could outperform
+shallow architectures in a variety of settings, especially
+when the data involves complex interactions between many factors of 
+variation~\citep{LarochelleH2007,Bengio-2009}. Other experiments have suggested
+that the unsupervised layer-wise pre-training acted as a useful
+prior~\citep{Erhan+al-2010} that allows one to initialize a deep
+neural network in a relatively much smaller region of parameter space, 
+corresponding to better generalization.
+
+To further the understanding of the reasons for the good performance
+observed with deep learners, we focus here on the following {\em hypothesis}:
+intermediate levels of representation, especially when there are
+more such levels, can be exploited to {\bf share
+statistical strength across different but related types of examples},
+such as examples coming from other tasks than the task of interest
+(the multi-task setting), or examples coming from an overlapping
+but different distribution (images with different kinds of perturbations
+and noises, here). This is consistent with the hypotheses discussed
+in~\citet{Bengio-2009} regarding the potential advantage
+of deep learning and the idea that more levels of representation can
+give rise to more abstract, more general features of the raw input.
+
+This hypothesis is related to the
+{\bf self-taught learning} setting~\citep{RainaR2007}, which combines principles
+of semi-supervised and multi-task learning: the learner can exploit examples
+that are unlabeled and possibly come from a distribution different from the target
+distribution, e.g., from classes other than those of interest. 
+It has already been shown that deep learners can take advantage of
+unsupervised learning and unlabeled examples~\citep{Bengio-2009,WestonJ2008-small},
+but more needed to be done to explore the impact
+of {\em out-of-distribution} examples and of the {\em multi-task} setting
+(one exception is~\citep{CollobertR2008}, which shares and uses unsupervised
+pre-training only with the first layer). In particular the {\em relative
+advantage of deep learning} for these settings had not been evaluated.
+
+
+%
+The {\bf main claim} of this paper is that deep learners (with several levels of representation) can
+{\bf benefit more from out-of-distribution examples than shallow learners} (with a single
+level), both in the context of the multi-task setting and from
+ perturbed examples. Because we are able to improve on state-of-the-art
+performance and reach human-level performance
+on a large-scale task, we consider that this paper is also a contribution
+to advance the application of machine learning to handwritten character recognition.
+More precisely, we ask and answer the following questions:
+
+%\begin{enumerate}
+$\bullet$ %\item 
+Do the good results previously obtained with deep architectures on the
+MNIST digit images generalize to the setting of a similar but much larger and richer
+dataset, the NIST special database 19, with 62 classes and around 800k examples?
+
+$\bullet$ %\item 
+To what extent does the perturbation of input images (e.g. adding
+noise, affine transformations, background images) make the resulting
+classifiers better not only on similarly perturbed images but also on
+the {\em original clean examples}? We study this question in the
+context of the 62-class and 10-class tasks of the NIST special database 19.
+
+$\bullet$ %\item 
+Do deep architectures {\em benefit {\bf more} from such out-of-distribution}
+examples, in particular do they benefit more from 
+examples that are perturbed versions of the examples from the task of interest?
+
+$\bullet$ %\item 
+Similarly, does the feature learning step in deep learning algorithms benefit {\bf more}
+from training with moderately {\em different classes} (i.e. a multi-task learning scenario) than
+a corresponding shallow and purely supervised architecture?
+We train on 62 classes and test on 10 (digits) or 26 (upper case or lower case)
+to answer this question.
+%\end{enumerate}
+
+Our experimental results provide positive evidence towards all of these questions,
+as well as {\bf classifiers that reach human-level performance on 62-class isolated character
+recognition and beat previously published results on the NIST dataset (special database 19)}.
+To achieve these results, we introduce in the next section a sophisticated system
+for stochastically transforming character images and then explain the methodology,
+which is based on training with or without these transformed images and testing on 
+clean ones. 
+Code for generating these transformations as well as for the deep learning 
+algorithms is made available at {\tt http://hg.assembla.com/ift6266}.
+
+\vspace*{-3mm}
+%%\newpage
+\section{Perturbed and Transformed Character Images}
+\label{s:perturbations}
+\vspace*{-2mm}
+
+%\begin{minipage}[h]{\linewidth}
+\begin{wrapfigure}[8]{l}{0.15\textwidth}
+%\begin{minipage}[b]{0.14\linewidth}
+\vspace*{-5mm}
+\begin{center}
+\includegraphics[scale=.4]{images/Original.png}\\
+{\bf Original}
+\end{center}
+\end{wrapfigure}
+%\vspace{0.7cm}
+%\end{minipage}%
+%\hspace{0.3cm}\begin{minipage}[b]{0.86\linewidth}
+This section describes the different transformations we used to stochastically
+transform $32 \times 32$ source images (such as the one on the left)
+in order to obtain data from a larger distribution which
+covers a domain substantially larger than the clean characters distribution from
+which we start.
+Although character transformations have been used before to
+improve character recognizers, this effort is on a large scale both
+in number of classes and in the complexity of the transformations, hence
+in the complexity of the learning task.
+More details can
+be found in this technical report~\citep{ARXIV-2010}.
+The code for these transformations (mostly python) is available at 
+{\tt http://hg.assembla.com/ift6266}. All the modules in the pipeline share
+a global control parameter ($0 \le complexity \le 1$) modulating the
+amount of deformation or noise. 
+There are two main parts in the pipeline. The first one,
+from thickness to pinch, performs transformations. The second
+part, from blur to contrast, adds different kinds of noise.
+%\end{minipage}
+
+%\newpage
+\vspace*{1mm}
+%\subsection{Transformations}
+{\large\bf 2.1 Transformations}
+\vspace*{1mm}
+
+
+\begin{minipage}[h]{\linewidth}
+\begin{wrapfigure}[7]{l}{0.15\textwidth}
+%\begin{minipage}[b]{0.14\linewidth}
+%\centering
+\begin{center}
+\vspace*{-5mm}
+\includegraphics[scale=.4]{images/Thick_only.png}\\
+{\bf Thickness}
+\end{center}
+%\vspace{.6cm}
+%\end{minipage}%
+%\hspace{0.3cm}\begin{minipage}[b]{0.86\linewidth}
+\end{wrapfigure}
+To change character {\bf thickness}, morphological operators of dilation and erosion~\citep{Haralick87,Serra82}
+are applied. The neighborhood of each pixel is multiplied
+element-wise with a {\em structuring element} matrix.
+The pixel value is replaced by the maximum or the minimum of the resulting
+matrix, respectively for dilation or erosion. Ten different structuring elements with 
+increasing dimensions (largest is $5\times5$) were used.  For each image, we
+randomly sample the operator type (dilation or erosion) with equal probability and one structuring
+element from a subset of the $n=round(m \times complexity)$ smallest structuring elements
+where $m=10$ for dilation and $m=6$ for erosion (to avoid completely erasing thin characters).  
+A neutral element (no transformation) 
+is always present in the set.
+%\vspace{.4cm}
+\end{minipage}
+\vspace*{3mm}
+
+\begin{minipage}[h]{\linewidth}
+\begin{wrapfigure}[7]{l}{0.15\textwidth}
+%\begin{minipage}[b]{0.14\linewidth}
+%\centering
+\begin{center}
+\vspace*{-5mm}
+\includegraphics[scale=.4]{images/Slant_only.png}\\
+{\bf Slant}
+\end{center}
+\end{wrapfigure}
+
+%\end{minipage}%
+%\hspace{0.3cm}
+%\begin{minipage}[b]{0.83\linewidth}
+%\centering
+To produce {\bf slant}, each row of the image is shifted
+proportionally to its height: $shift = round(slant \times height)$.  
+$slant \sim U[-complexity,complexity]$.
+The shift is randomly chosen to be either to the left or to the right.
+%\vspace{8mm}
+\end{minipage}
+\vspace*{10mm}
+
+\begin{minipage}[h]{\linewidth}
+%\begin{minipage}[b]{0.14\linewidth}
+%\centering
+\begin{wrapfigure}[7]{l}{0.15\textwidth}
+\begin{center}
+\vspace*{-5mm}
+\includegraphics[scale=.4]{images/Affine_only.png}\\
+{\small {\bf Affine \mbox{Transformation}}}
+\end{center}
+\end{wrapfigure}
+%\end{minipage}%
+%\hspace{0.3cm}\begin{minipage}[b]{0.86\linewidth}
+A $2 \times 3$ {\bf affine transform} matrix (with
+parameters $(a,b,c,d,e,f)$) is sampled according to the $complexity$.
+Output pixel $(x,y)$ takes the value of input pixel
+nearest to $(ax+by+c,dx+ey+f)$,
+producing scaling, translation, rotation and shearing.
+Marginal distributions of $(a,b,c,d,e,f)$ have been tuned to
+forbid large rotations (to avoid confusing classes) but to give good
+variability of the transformation: $a$ and $d$ $\sim U[1-3
+complexity,1+3\,complexity]$, $b$ and $e$ $\sim U[-3 \,complexity,3\,
+complexity]$, and $c$ and $f \sim U[-4 \,complexity, 4 \,
+complexity]$.\\
+%\end{minipage}
+\end{minipage}
+\vspace*{3mm}
+
+\vspace*{-4.5mm}
+
+\begin{minipage}[h]{\linewidth}
+\begin{wrapfigure}[7]{l}{0.15\textwidth}
+%\hspace*{-8mm}\begin{minipage}[b]{0.25\linewidth}
+%\centering
+\begin{center}
+\vspace*{-4mm}
+\includegraphics[scale=.4]{images/Localelasticdistorsions_only.png}\\
+{\bf Local Elastic Deformation}
+\end{center}
+\end{wrapfigure}
+%\end{minipage}%
+%\hspace{-3mm}\begin{minipage}[b]{0.85\linewidth}
+%\vspace*{-20mm}
+The {\bf local elastic deformation}
+module induces a ``wiggly'' effect in the image, following~\citet{SimardSP03-short},
+which provides more details. 
+The intensity of the displacement fields is given by 
+$\alpha = \sqrt[3]{complexity} \times 10.0$, which are 
+convolved with a Gaussian 2D kernel (resulting in a blur) of
+standard deviation $\sigma = 10 - 7 \times\sqrt[3]{complexity}$.
+%\vspace{.9cm}
+\end{minipage}
+
+\vspace*{7mm}
+
+%\begin{minipage}[b]{0.14\linewidth}
+%\centering
+\begin{minipage}[h]{\linewidth}
+\begin{wrapfigure}[7]{l}{0.15\textwidth}
+\vspace*{-5mm}
+\begin{center}
+\includegraphics[scale=.4]{images/Pinch_only.png}\\
+{\bf Pinch}
+\end{center}
+\end{wrapfigure}
+%\vspace{.6cm}
+%\end{minipage}%
+%\hspace{0.3cm}\begin{minipage}[b]{0.86\linewidth}
+The {\bf pinch} module applies the ``Whirl and pinch'' GIMP filter with whirl set to 0. 
+A pinch is ``similar to projecting the image onto an elastic
+surface and pressing or pulling on the center of the surface'' (GIMP documentation manual).
+For a square input image, draw a radius-$r$ disk
+around its center $C$. Any pixel $P$ belonging to
+that disk has its value replaced by
+the value of a ``source'' pixel in the original image,
+on the line that goes through $C$ and $P$, but
+at some other distance $d_2$. Define $d_1=distance(P,C)$
+and $d_2 = sin(\frac{\pi{}d_1}{2r})^{-pinch} \times
+d_1$, where $pinch$ is a parameter of the filter.
+The actual value is given by bilinear interpolation considering the pixels
+around the (non-integer) source position thus found.
+Here $pinch \sim U[-complexity, 0.7 \times complexity]$.
+%\vspace{1.5cm}
+\end{minipage}
+
+\vspace{1mm}
+
+{\large\bf 2.2 Injecting Noise}
+%\subsection{Injecting Noise}
+%\vspace{2mm}
+
+\begin{minipage}[h]{\linewidth}
+%\vspace*{-.2cm}
+%\begin{minipage}[t]{0.14\linewidth}
+\begin{wrapfigure}[8]{l}{0.15\textwidth}
+\begin{center}
+\vspace*{-5mm}
+%\vspace*{-2mm}
+\includegraphics[scale=.4]{images/Motionblur_only.png}\\
+{\bf Motion Blur}
+%\end{minipage}%
+\end{center}
+\end{wrapfigure}
+%\hspace{0.3cm}
+%\begin{minipage}[t]{0.83\linewidth}
+%\vspace*{.5mm}
+The {\bf motion blur} module is GIMP's ``linear motion blur'', which
+has parameters $length$ and $angle$. The value of
+a pixel in the final image is approximately the  mean of the first $length$ pixels
+found by moving in the $angle$ direction,
+$angle \sim U[0,360]$ degrees, and $length \sim {\rm Normal}(0,(3 \times complexity)^2)$.
+\vspace{5mm}
+\end{minipage}
+%\end{minipage}
+
+\vspace*{1mm}
+
+\begin{minipage}[h]{\linewidth}
+\begin{minipage}[t]{0.14\linewidth}
+\centering
+\includegraphics[scale=.4]{images/occlusion_only.png}\\
+{\bf Occlusion}
+%\vspace{.5cm}
+\end{minipage}%
+\hspace{0.3cm}\begin{minipage}[t]{0.83\linewidth}
+\vspace*{-18mm}
+The {\bf occlusion} module selects a random rectangle from an {\em occluder} character
+image and places it over the original {\em occluded}
+image. Pixels are combined by taking the max(occluder, occluded),
+i.e. keeping the lighter ones.
+The rectangle corners
+are sampled so that larger complexity gives larger rectangles.
+The destination position in the occluded image is also sampled
+according to a normal distribution (more details in~\citet{ARXIV-2010}).
+This module is skipped with probability 60\%.
+%\vspace{7mm}
+\end{minipage}
+\end{minipage}
+
+\vspace*{1mm}
+
+\begin{wrapfigure}[8]{l}{0.15\textwidth}
+\vspace*{-3mm}
+\begin{center}
+%\begin{minipage}[t]{0.14\linewidth}
+%\centering
+\includegraphics[scale=.4]{images/Bruitgauss_only.png}\\
+{\bf Gaussian Smoothing}
+\end{center}
+\end{wrapfigure}
+%\vspace{.5cm}
+%\end{minipage}%
+%\hspace{0.3cm}\begin{minipage}[t]{0.86\linewidth}
+With the {\bf Gaussian smoothing} module, 
+different regions of the image are spatially smoothed.
+This is achieved  by first convolving
+the image with an isotropic Gaussian kernel of
+size and variance chosen uniformly in the ranges $[12,12 + 20 \times
+complexity]$ and $[2,2 + 6 \times complexity]$. This filtered image is normalized
+between $0$ and $1$.  We also create an isotropic weighted averaging window, of the
+kernel size, with maximum value at the center.  For each image we sample
+uniformly from $3$ to $3 + 10 \times complexity$ pixels that will be
+averaging centers between the original image and the filtered one.  We
+initialize to zero a mask matrix of the image size. For each selected pixel
+we add to the mask the averaging window centered on it.  The final image is
+computed from the following element-wise operation: $\frac{image + filtered\_image
+\times mask}{mask+1}$.
+This module is skipped with probability 75\%.
+%\end{minipage}
+
+%\newpage
+
+\vspace*{1mm}
+
+%\hspace*{-3mm}\begin{minipage}[t]{0.18\linewidth}
+%\centering
+\begin{minipage}[t]{\linewidth}
+\begin{wrapfigure}[7]{l}{0.15\textwidth}
+\vspace*{-5mm}
+\begin{center}
+\includegraphics[scale=.4]{images/Permutpixel_only.png}\\
+{\small\bf Permute Pixels}
+\end{center}
+\end{wrapfigure}
+%\end{minipage}%
+%\hspace{-0cm}\begin{minipage}[t]{0.86\linewidth}
+%\vspace*{-20mm}
+This module {\bf permutes neighbouring pixels}. It first selects a
+fraction $\frac{complexity}{3}$ of pixels randomly in the image. Each
+of these pixels is then sequentially exchanged with a random pixel
+among its four nearest neighbors (on its left, right, top or bottom).
+This module is skipped with probability 80\%.\\
+\vspace*{1mm}
+\end{minipage}
+
+\vspace{-3mm}
+
+\begin{minipage}[t]{\linewidth}
+\begin{wrapfigure}[7]{l}{0.15\textwidth}
+%\vspace*{-3mm}
+\begin{center}
+%\hspace*{-3mm}\begin{minipage}[t]{0.18\linewidth}
+%\centering
+\vspace*{-5mm}
+\includegraphics[scale=.4]{images/Distorsiongauss_only.png}\\
+{\small \bf Gauss. Noise}
+\end{center}
+\end{wrapfigure}
+%\end{minipage}%
+%\hspace{0.3cm}\begin{minipage}[t]{0.86\linewidth}
+\vspace*{12mm}
+The {\bf Gaussian noise} module simply adds, to each pixel of the image independently, a
+noise $\sim Normal(0,(\frac{complexity}{10})^2)$.
+This module is skipped with probability 70\%.
+%\vspace{1.1cm}
+\end{minipage}
+
+\vspace*{1.2cm}
+
+\begin{minipage}[t]{\linewidth}
+\begin{minipage}[t]{0.14\linewidth}
+\centering
+\includegraphics[scale=.4]{images/background_other_only.png}\\
+{\small \bf Bg Image}
+\end{minipage}%
+\hspace{0.3cm}\begin{minipage}[t]{0.83\linewidth}
+\vspace*{-18mm}
+Following~\citet{Larochelle-jmlr-2009}, the {\bf background image} module adds a random
+background image behind the letter, from a randomly chosen natural image,
+with contrast adjustments depending on $complexity$, to preserve
+more or less of the original character image.
+%\vspace{.8cm}
+\end{minipage}
+\end{minipage}
+%\vspace{-.7cm}
+
+\begin{minipage}[t]{0.14\linewidth}
+\centering
+\includegraphics[scale=.4]{images/Poivresel_only.png}\\
+{\small \bf Salt \& Pepper}
+\end{minipage}%
+\hspace{0.3cm}\begin{minipage}[t]{0.83\linewidth}
+\vspace*{-18mm}
+The {\bf salt and pepper noise} module adds noise $\sim U[0,1]$ to random subsets of pixels.
+The fraction of selected pixels is $0.2 \times complexity$.
+This module is skipped with probability 75\%.
+%\vspace{.9cm}
+\end{minipage}
+%\vspace{-.7cm}
+
+\vspace{1mm}
+
+\begin{minipage}[t]{\linewidth}
+\begin{wrapfigure}[7]{l}{0.14\textwidth}
+%\begin{minipage}[t]{0.14\linewidth}
+%\centering
+\begin{center}
+\vspace*{-4mm}
+\hspace*{-1mm}\includegraphics[scale=.4]{images/Rature_only.png}\\
+{\bf Scratches}
+%\end{minipage}%
+\end{center}
+\end{wrapfigure}
+%\hspace{0.3cm}\begin{minipage}[t]{0.86\linewidth}
+%\vspace{.4cm}
+The {\bf scratches} module places line-like white patches on the image.  The
+lines are heavily transformed images of the digit ``1'' (one), chosen
+at random among 500 such 1 images,
+randomly cropped and rotated by an angle $\sim Normal(0,(100 \times
+complexity)^2)$ (in degrees), using bi-cubic interpolation.
+Two passes of a grey-scale morphological erosion filter
+are applied, reducing the width of the line
+by an amount controlled by $complexity$.
+This module is skipped with probability 85\%. The probabilities
+of applying 1, 2, or 3 patches are (50\%,30\%,20\%).
+\end{minipage}
+
+\vspace*{1mm}
+
+\begin{minipage}[t]{0.25\linewidth}
+\centering
+\hspace*{-16mm}\includegraphics[scale=.4]{images/Contrast_only.png}\\
+{\bf Grey Level \& Contrast}
+\end{minipage}%
+\hspace{-12mm}\begin{minipage}[t]{0.82\linewidth}
+\vspace*{-18mm}
+The {\bf grey level and contrast} module changes the contrast by changing grey levels, and may invert the image polarity (white
+to black and black to white). The contrast is $C \sim U[1-0.85 \times complexity,1]$ 
+so the image is normalized into $[\frac{1-C}{2},1-\frac{1-C}{2}]$. The
+polarity is inverted with probability 50\%.
+%\vspace{.7cm}
+\end{minipage}
+\vspace{2mm}
+
+\iffalse
+\begin{figure}[ht]
+\centerline{\resizebox{.9\textwidth}{!}{\includegraphics{images/example_t.png}}}\\
+\caption{Illustration of the pipeline of stochastic 
+transformations applied to the image of a lower-case \emph{t}
+(the upper left image). Each image in the pipeline (going from
+left to right, first top line, then bottom line) shows the result
+of applying one of the modules in the pipeline. The last image
+(bottom right) is used as training example.}
+\label{fig:pipeline}
+\end{figure}
+\fi
+
+\vspace*{-3mm}
+\section{Experimental Setup}
+\vspace*{-1mm}
+
+Much previous work on deep learning had been performed on
+the MNIST digits task
+with 60~000 examples, and variants involving 10~000
+examples~\citep{VincentPLarochelleH2008-very-small}.
+The focus here is on much larger training sets, from 10 times to
+1000 times larger, and 62 classes.
+
+The first step in constructing the larger datasets (called NISTP and P07) is to sample from
+a {\em data source}: {\bf NIST} (NIST database 19), {\bf Fonts}, {\bf Captchas},
+and {\bf OCR data} (scanned machine printed characters). Once a character
+is sampled from one of these {\em data sources} (chosen randomly), the second step is to
+apply a pipeline of transformations and/or noise processes described in section \ref{s:perturbations}.
+
+To provide a baseline of error rate comparison we also estimate human performance
+on both the 62-class task and the 10-class digits task.
+We compare the best Multi-Layer Perceptrons (MLP) against
+the best Stacked Denoising Auto-encoders (SDA), when
+both models' hyper-parameters are selected to minimize the validation set error.
+We also provide a comparison against a precise estimate
+of human performance obtained via Amazon's Mechanical Turk (AMT)
+service ({\tt http://mturk.com}). 
+AMT users are paid small amounts
+of money to perform tasks for which human intelligence is required.
+An incentive for them to do the job right is that payment can be denied
+if the job is not properly done.
+Mechanical Turk has been used extensively in natural language processing and vision.
+%processing \citep{SnowEtAl2008} and vision
+%\citep{SorokinAndForsyth2008,whitehill09}. 
+AMT users were presented
+with 10 character images at a time (from a test set) and asked to choose 10 corresponding ASCII
+characters. They were forced to choose a single character class (either among the
+62 or 10 character classes) for each image.
+80 subjects classified 2500 images per (dataset,task) pair.
+Different human labelers sometimes provided a different label for the same
+example, and we were able to estimate the error variance due to this effect
+because each image was classified by 3 different persons. 
+The average error of humans on the 62-class task NIST test set
+is 18.2\%, with a standard error of 0.1\%.
+
+\vspace*{-3mm}
+\subsection{Data Sources}
+\vspace*{-2mm}
+
+%\begin{itemize}
+%\item 
+{\bf NIST.}
+Our main source of characters is the NIST Special Database 19~\citep{Grother-1995}, 
+widely used for training and testing character
+recognition systems~\citep{Granger+al-2007,Cortes+al-2000,Oliveira+al-2002-short,Milgram+al-2005}. 
+The dataset is composed of 814255 digits and characters (upper and lower cases), with hand checked classifications,
+extracted from handwritten sample forms of 3600 writers. The characters are labelled by one of the 62 classes 
+corresponding to ``0''-``9'',``A''-``Z'' and ``a''-``z''. The dataset contains 8 parts (partitions) of varying complexity. 
+The fourth partition (called $hsf_4$, 82587 examples), 
+experimentally recognized to be the most difficult one, is the one recommended 
+by NIST as a testing set and is used in our work as well as some previous work~\citep{Granger+al-2007,Cortes+al-2000,Oliveira+al-2002-short,Milgram+al-2005}
+for that purpose. We randomly split the remainder (731,668 examples) into a training set and a validation set for
+model selection. 
+The performances reported by previous work on that dataset mostly use only the digits.
+Here we use all the classes both in the training and testing phase. This is especially
+useful to estimate the effect of a multi-task setting.
+The distribution of the classes in the NIST training and test sets differs
+substantially, with relatively many more digits in the test set, and a more uniform distribution
+of letters in the test set (whereas in the training set they are distributed
+more like in natural text).
+\vspace*{-1mm}
+
+%\item 
+{\bf Fonts.} 
+In order to have a good variety of sources we downloaded a large number of free fonts from:
+{\tt http://cg.scs.carleton.ca/\textasciitilde luc/freefonts.html}.
+% TODO: pointless to anonymize, it's not pointing to our work
+Including an operating system's (Windows 7) fonts, there is a total of $9817$ different fonts that we can choose uniformly from.
+The chosen {\tt ttf} file is either used as input of the Captcha generator (see next item) or, by producing a corresponding image, 
+directly as input to our models.
+\vspace*{-1mm}
+
+%\item 
+{\bf Captchas.}
+The Captcha data source is an adaptation of the \emph{pycaptcha} library (a Python-based captcha generator library) for 
+generating characters of the same format as the NIST dataset. This software is based on
+a random character class generator and various kinds of transformations similar to those described in the previous sections. 
+In order to increase the variability of the data generated, many different fonts are used for generating the characters. 
+Transformations (slant, distortions, rotation, translation) are applied to each randomly generated character with a complexity
+depending on the value of the complexity parameter provided by the user of the data source. 
+%Two levels of complexity are allowed and can be controlled via an easy to use facade class. %TODO: what's a facade class?
+\vspace*{-1mm}
+
+%\item 
+{\bf OCR data.}
+A large set (2 million) of scanned, OCRed and manually verified machine-printed 
+characters were included as an
+additional source. This set is part of a larger corpus being collected by the Image Understanding
+Pattern Recognition Research group led by Thomas Breuel at University of Kaiserslautern 
+({\tt http://www.iupr.com}), and which will be publicly released.
+%TODO: let's hope that Thomas is not a reviewer! :) Seriously though, maybe we should anonymize this
+%\end{itemize}
+
+\vspace*{-3mm}
+\subsection{Data Sets}
+\vspace*{-2mm}
+
+All data sets contain 32$\times$32 grey-level images (values in $[0,1]$) associated with a label
+from one of the 62 character classes. They are obtained from the optional application of the
+perturbation pipeline to iid samples from the datasources, and they are randomly split into 
+training set, validation set, and test set.
+%\begin{itemize}
+\vspace*{-1mm}
+
+%\item 
+{\bf NIST.} This is the raw NIST special database 19~\citep{Grother-1995}. It has
+\{651668 / 80000 / 82587\} \{training / validation / test\} examples, containing
+upper case, lower case, and digits.
+\vspace*{-1mm}
+
+%\item 
+{\bf P07.} This dataset of upper case, lower case and digit images
+is obtained by taking raw characters from all four of the above sources
+and sending them through the transformation pipeline described in section \ref{s:perturbations}.
+For each new example to generate, a data source is selected with probability $10\%$ from the fonts,
+$25\%$ from the captchas, $25\%$ from the OCR data and $40\%$ from NIST. We apply all the transformations in the
+order given above, and for each of them we sample uniformly a \emph{complexity} in the range $[0,0.7]$.
+It has \{81920000 / 80000 / 20000\} \{training / validation / test\} examples.
+\vspace*{-1mm}
+
+%\item 
+{\bf NISTP.} This one is equivalent to P07 (complexity parameter of $0.7$ with the same proportions of data sources)
+  except that we only apply
+  transformations from slant to pinch. Therefore, the character is
+  transformed but no additional noise is added to the image, giving images
+  closer to the NIST dataset. 
+It has \{81,920,000 / 80,000 / 20,000\} \{training / validation / test\} examples
+obtained from the corresponding NIST sets plus other sources.
+%\end{itemize}
+
+\vspace*{-3mm}
+\subsection{Models and their Hyperparameters}
+\vspace*{-2mm}
+
+The experiments are performed using MLPs (with a single
+hidden layer) and deep SDAs.
+\emph{Hyper-parameters are selected based on the {\bf NISTP} validation set error.}
+
+{\bf Multi-Layer Perceptrons (MLP).}
+Whereas previous work had compared deep architectures to both shallow MLPs and
+SVMs, we only compared to MLPs here because of the very large datasets used
+(making the use of SVMs computationally challenging because of their quadratic
+scaling behavior). Preliminary experiments on training SVMs (libSVM) with subsets of the training
+set allowing the program to fit in memory yielded substantially worse results
+than those obtained with MLPs. For training on nearly a hundred million examples
+(with the perturbed data), the MLPs and SDA are much more convenient than
+classifiers based on kernel methods.
+The MLP has a single hidden layer with $\tanh$ activation functions, and softmax (normalized
+exponentials) on the output layer for estimating $P(class | image)$.
+The number of hidden units is taken in $\{300,500,800,1000,1500\}$. 
+Training examples are presented in minibatches of size 20. A constant learning
+rate was chosen among $\{0.001, 0.01, 0.025, 0.075, 0.1, 0.5\}$.
+%through preliminary experiments (measuring performance on a validation set),
+%and $0.1$ (which was found to work best) was then selected for optimizing on
+%the whole training sets.
+\vspace*{-1mm}
+
+
+{\bf Stacked Denoising Auto-encoders (SDA).}
+Various auto-encoder variants and Restricted Boltzmann Machines (RBMs)
+can be used to initialize the weights of each layer of a deep MLP (with many hidden 
+layers),
+apparently setting parameters in the
+basin of attraction of supervised gradient descent yielding better 
+generalization~\citep{Erhan+al-2010}.  This initial {\em unsupervised
+pre-training phase} uses all of the training images but not the training labels.
+Each layer is trained in turn to produce a new representation of its input
+(starting from the raw pixels).
+It is hypothesized that the
+advantage brought by this procedure stems from a better prior,
+on the one hand taking advantage of the link between the input
+distribution $P(x)$ and the conditional distribution of interest
+$P(y|x)$ (like in semi-supervised learning), and on the other hand
+taking advantage of the expressive power and bias implicit in the
+deep architecture (whereby complex concepts are expressed as
+compositions of simpler ones through a deep hierarchy).
+
+\iffalse
+\begin{figure}[ht]
+\vspace*{-2mm}
+\centerline{\resizebox{0.8\textwidth}{!}{\includegraphics{images/denoising_autoencoder_small.pdf}}}
+\vspace*{-2mm}
+\caption{Illustration of the computations and training criterion for the denoising
+auto-encoder used to pre-train each layer of the deep architecture. Input $x$ of
+the layer (i.e. raw input or output of previous layer)
+is corrupted into $\tilde{x}$ and encoded into code $y$ by the encoder $f_\theta(\cdot)$.
+The decoder $g_{\theta'}(\cdot)$ maps $y$ to reconstruction $z$, which
+is compared to the uncorrupted input $x$ through the loss function
+$L_H(x,z)$, whose expected value is approximately minimized during training
+by tuning $\theta$ and $\theta'$.}
+\label{fig:da}
+\vspace*{-2mm}
+\end{figure}
+\fi
+
+Here we chose to use the Denoising
+Auto-encoder~\citep{VincentPLarochelleH2008-very-small} as the building block for
+these deep hierarchies of features, as it is simple to train and
+explain (see % Figure~\ref{fig:da}, as well as 
+tutorial and code there: {\tt http://deeplearning.net/tutorial}), 
+provides efficient inference, and yielded results
+comparable to or better than RBMs in a series of experiments
+\citep{VincentPLarochelleH2008-very-small}. It really corresponds to a Gaussian
+RBM trained by a Score Matching criterion~\cite{Vincent-SM-2010}.
+During training, a Denoising
+Auto-encoder is presented with a stochastically corrupted version
+of the input and trained to reconstruct the uncorrupted input,
+forcing the hidden units to represent the leading regularities in
+the data. Here we use the random binary masking corruption
+(which sets to 0 a random subset of the inputs).
+ Once it is trained, in a purely unsupervised way, 
+its hidden units' activations can
+be used as inputs for training a second one, etc.
+After this unsupervised pre-training stage, the parameters
+are used to initialize a deep MLP, which is fine-tuned by
+the same standard procedure used to train them (see previous section).
+The SDA hyper-parameters are the same as for the MLP, with the addition of the
+amount of corruption noise (we used the masking noise process, whereby a
+fixed proportion of the input values, randomly selected, are zeroed), and a
+separate learning rate for the unsupervised pre-training stage (selected
+from the same above set). The fraction of inputs corrupted was selected
+among $\{10\%, 20\%, 50\%\}$. Another hyper-parameter is the number
+of hidden layers but it was fixed to 3 based on previous work with
+SDAs on MNIST~\citep{VincentPLarochelleH2008-very-small}. The size of the hidden
+layers was kept constant across hidden layers, and the best results
+were obtained with the largest values that we could experiment
+with given our patience, with 1000 hidden units.
+
+\vspace*{-1mm}
+
+\begin{figure}[ht]
+\vspace*{-2mm}
+\centerline{\resizebox{.99\textwidth}{!}{\includegraphics{images/error_rates_charts.pdf}}}
+\vspace*{-3mm}
+\caption{SDAx are the {\bf deep} models. Error bars indicate a 95\% confidence interval. 0 indicates that the model was trained
+on NIST, 1 on NISTP, and 2 on P07. Left: overall results
+of all models, on NIST and NISTP test sets.
+Right: error rates on NIST test digits only, along with the previous results from 
+literature~\citep{Granger+al-2007,Cortes+al-2000,Oliveira+al-2002-short,Milgram+al-2005}
+respectively based on ART, nearest neighbors, MLPs, and SVMs.}
+\label{fig:error-rates-charts}
+\vspace*{-2mm}
+\end{figure}
+
+
+\begin{figure}[ht]
+\vspace*{-3mm}
+\centerline{\resizebox{.99\textwidth}{!}{\includegraphics{images/improvements_charts.pdf}}}
+\vspace*{-3mm}
+\caption{Relative improvement in error rate due to self-taught learning. 
+Left: Improvement (or loss, when negative)
+induced by out-of-distribution examples (perturbed data). 
+Right: Improvement (or loss, when negative) induced by multi-task 
+learning (training on all classes and testing only on either digits,
+upper case, or lower-case). The deep learner (SDA) benefits more from
+both self-taught learning scenarios, compared to the shallow MLP.}
+\label{fig:improvements-charts}
+\vspace*{-2mm}
+\end{figure}
+
+\section{Experimental Results}
+\vspace*{-2mm}
+
+%\vspace*{-1mm}
+%\subsection{SDA vs MLP vs Humans}
+%\vspace*{-1mm}
+The models are either trained on NIST (MLP0 and SDA0), 
+NISTP (MLP1 and SDA1), or P07 (MLP2 and SDA2), and tested
+on either NIST, NISTP or P07 (regardless of the data set used for training),
+either on the 62-class task
+or on the 10-digits task. Training time (including about half
+for unsupervised pre-training, for DAs) on the larger
+datasets is around one day on a GPU (GTX 285).
+Figure~\ref{fig:error-rates-charts} summarizes the results obtained,
+comparing humans, the three MLPs (MLP0, MLP1, MLP2) and the three SDAs (SDA0, SDA1,
+SDA2), along with the previous results on the digits NIST special database
+19 test set from the literature, respectively based on ARTMAP neural
+networks ~\citep{Granger+al-2007}, fast nearest-neighbor search
+~\citep{Cortes+al-2000}, MLPs ~\citep{Oliveira+al-2002-short}, and SVMs
+~\citep{Milgram+al-2005}.%  More detailed and complete numerical results
+%(figures and tables, including standard errors on the error rates) can be
+%found in Appendix.
+The deep learner not only outperformed the shallow ones and
+previously published performance (in a statistically and qualitatively
+significant way) but when trained with perturbed data
+reaches human performance on both the 62-class task
+and the 10-class (digits) task. 
+17\% error (SDA1) or 18\% error (humans) may seem large but a large
+majority of the errors from humans and from SDA1 are from out-of-context
+confusions (e.g. a vertical bar can be a ``1'', an ``l'' or an ``L'', and a
+``c'' and a ``C'' are often indistinguishable).
+
+In addition, as shown in the left of
+Figure~\ref{fig:improvements-charts}, the relative improvement in error
+rate brought by self-taught learning is greater for the SDA, and these
+differences with the MLP are statistically and qualitatively
+significant. 
+The left side of the figure shows the improvement to the clean
+NIST test set error brought by the use of out-of-distribution examples
+(i.e. the perturbed examples from NISTP or P07),
+over the models trained exclusively on NIST (respectively SDA0 and MLP0).
+Relative percent change is measured by taking
+$100 \% \times$ (original model's error / perturbed-data model's error - 1).
+The right side of
+Figure~\ref{fig:improvements-charts} shows the relative improvement
+brought by the use of a multi-task setting, in which the same model is
+trained for more classes than the target classes of interest (i.e. training
+with all 62 classes when the target classes are respectively the digits,
+lower-case, or upper-case characters). Again, whereas the gain from the
+multi-task setting is marginal or negative for the MLP, it is substantial
+for the SDA.  Note that to simplify these multi-task experiments, only the original
+NIST dataset is used. For example, the MLP-digits bar shows the relative
+percent improvement in MLP error rate on the NIST digits test set 
+is $100\% \times$ (single-task
+model's error / multi-task model's error - 1).  The single-task model is
+trained with only 10 outputs (one per digit), seeing only digit examples,
+whereas the multi-task model is trained with 62 outputs, with all 62
+character classes as examples.  Hence the hidden units are shared across
+all tasks.  For the multi-task model, the digit error rate is measured by
+comparing the correct digit class with the output class associated with the
+maximum conditional probability among only the digit classes outputs.  The
+setting is similar for the other two target classes (lower case characters
+and upper case characters). Note however that some types of perturbations
+(NISTP) help more than others (P07) when testing on the clean images.
+%%\vspace*{-1mm}
+%\subsection{Perturbed Training Data More Helpful for SDA}
+%\vspace*{-1mm}
+
+%\vspace*{-1mm}
+%\subsection{Multi-Task Learning Effects}
+%\vspace*{-1mm}
+
+\iffalse
+As previously seen, the SDA is better able to benefit from the
+transformations applied to the data than the MLP. In this experiment we
+define three tasks: recognizing digits (knowing that the input is a digit),
+recognizing upper case characters (knowing that the input is one), and
+recognizing lower case characters (knowing that the input is one).  We
+consider the digit classification task as the target task and we want to
+evaluate whether training with the other tasks can help or hurt, and
+whether the effect is different for MLPs versus SDAs.  The goal is to find
+out if deep learning can benefit more (or less) from multiple related tasks
+(i.e. the multi-task setting) compared to a corresponding purely supervised
+shallow learner.
+
+We use a single hidden layer MLP with 1000 hidden units, and a SDA
+with 3 hidden layers (1000 hidden units per layer), pre-trained and
+fine-tuned on NIST.
+
+Our results show that the MLP benefits marginally from the multi-task setting
+in the case of digits (5\% relative improvement) but is actually hurt in the case
+of characters (respectively 3\% and 4\% worse for lower and upper case characters).
+On the other hand the SDA benefited from the multi-task setting, with relative
+error rate improvements of 27\%, 15\% and 13\% respectively for digits,
+lower and upper case characters, as shown in Table~\ref{tab:multi-task}.
+\fi
+
+
+\vspace*{-2mm}
+\section{Conclusions and Discussion}
+\vspace*{-2mm}
+
+We have found that the self-taught learning framework is more beneficial
+to a deep learner than to a traditional shallow and purely
+supervised learner. More precisely, 
+the answers are positive for all the questions asked in the introduction.
+%\begin{itemize}
+
+$\bullet$ %\item 
+{\bf Do the good results previously obtained with deep architectures on the
+MNIST digits generalize to a much larger and richer (but similar)
+dataset, the NIST special database 19, with 62 classes and around 800k examples}?
+Yes, the SDA {\em systematically outperformed the MLP and all the previously
+published results on this dataset} (the ones that we are aware of), {\em in fact reaching human-level
+performance} at around 17\% error on the 62-class task and 1.4\% on the digits,
+and beating previously published results on the same data.
+
+$\bullet$ %\item 
+{\bf To what extent do self-taught learning scenarios help deep learners,
+and do they help them more than shallow supervised ones}?
+We found that distorted training examples not only made the resulting
+classifier better on similarly perturbed images but also on
+the {\em original clean examples}, and more importantly and more novel,
+that deep architectures benefit more from such {\em out-of-distribution}
+examples. MLPs were helped by perturbed training examples when tested on perturbed input 
+images (65\% relative improvement on NISTP) 
+but only marginally helped (5\% relative improvement on all classes) 
+or even hurt (10\% relative loss on digits)
+with respect to clean examples. On the other hand, the deep SDAs
+were significantly boosted by these out-of-distribution examples.
+Similarly, whereas the improvement due to the multi-task setting was marginal or
+negative for the MLP (from +5.6\% to -3.6\% relative change), 
+it was quite significant for the SDA (from +13\% to +27\% relative change),
+which may be explained by the arguments below.
+%\end{itemize}
+
+In the original self-taught learning framework~\citep{RainaR2007}, the
+out-of-sample examples were used as a source of unsupervised data, and
+experiments showed its positive effects in a \emph{limited labeled data}
+scenario. However, many of the results by \citet{RainaR2007} (who used a
+shallow, sparse coding approach) suggest that the {\em relative gain of self-taught
+learning vs ordinary supervised learning} diminishes as the number of labeled examples increases.
+We note instead that, for deep
+architectures, our experiments show that such a positive effect is accomplished
+even in a scenario with a \emph{large number of labeled examples},
+i.e., here, the relative gain of self-taught learning and
+out-of-distribution examples is probably preserved
+in the asymptotic regime. However, note that in our perturbation experiments
+(but not in our multi-task experiments), 
+even the out-of-distribution examples are labeled, unlike in the
+earlier self-taught learning experiments~\citep{RainaR2007}.
+
+{\bf Why would deep learners benefit more from the self-taught learning framework}?
+The key idea is that the lower layers of the predictor compute a hierarchy
+of features that can be shared across tasks or across variants of the
+input distribution. A theoretical analysis of generalization improvements
+due to sharing of intermediate features across tasks already points
+towards that explanation~\cite{baxter95a}.
+Intermediate features that can be used in different
+contexts can be estimated in a way that allows sharing of statistical
+strength. Features extracted through many levels are more likely to
+be more abstract and more invariant to some of the factors of variation
+in the underlying distribution (as the experiments in~\citet{Goodfellow2009} suggest),
+increasing the likelihood that they would be useful for a larger array
+of tasks and input conditions.
+Therefore, we hypothesize that both depth and unsupervised
+pre-training play a part in explaining the advantages observed here, and future
+experiments could attempt to tease apart these factors.
+And why would deep learners benefit from the self-taught learning
+scenarios even when the number of labeled examples is very large?
+We hypothesize that this is related to the hypotheses studied
+in~\citet{Erhan+al-2010}. In~\citet{Erhan+al-2010}
+it was found that online learning on a huge dataset did not make the
+advantage of the deep learning bias vanish, and a similar phenomenon
+may be happening here. We hypothesize that unsupervised pre-training
+of a deep hierarchy with self-taught learning initializes the
+model in the basin of attraction of supervised gradient descent
+that corresponds to better generalization. Furthermore, such good
+basins of attraction are not discovered by pure supervised learning
+(with or without self-taught settings), and more labeled examples
+do not allow the model to go from the poorer basins of attraction discovered
+by the purely supervised shallow models to the kind of better basins associated
+with deep learning and self-taught learning.
+
+A Flash demo of the recognizer (where both the MLP and the SDA can be compared) 
+can be executed on-line at {\tt http://deep.host22.com}.
+
+%\newpage
+{
+\bibliography{strings,strings-short,strings-shorter,ift6266_ml,aigaion-shorter,specials}
+%\bibliographystyle{plainnat}
+\bibliographystyle{unsrtnat}
+%\bibliographystyle{apalike}
+}
+
+
+\end{document}
Binary file writeup/nips2010_ift6266_poster.odg has changed
Binary file writeup/nips2010_submission.pdf has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/writeup/nipswp_submission.tex	Sat Mar 19 22:51:40 2011 -0400
@@ -0,0 +1,767 @@
+%\documentclass[twoside,11pt]{article} % For LaTeX2e
+\documentclass{article} % For LaTeX2e
+\usepackage{nips10submit_e}
+\usepackage{times}
+\usepackage{wrapfig}
+\usepackage{amsthm}
+\usepackage{amsmath}
+\usepackage{bbm}
+\usepackage[utf8]{inputenc}
+\usepackage[psamsfonts]{amssymb}
+%\usepackage{algorithm,algorithmic} % not used after all
+\usepackage{graphicx,subfigure}
+\usepackage[numbers]{natbib}
+
+\addtolength{\textwidth}{10mm}
+\addtolength{\evensidemargin}{-5mm}
+\addtolength{\oddsidemargin}{-5mm}
+
+%\setlength\parindent{0mm}
+
+\begin{document}
+
+\title{Deep Self-Taught Learning for Handwritten Character Recognition}
+\author{
+Yoshua  Bengio \and
+Frédéric  Bastien \and
+Arnaud  Bergeron \and
+Nicolas  Boulanger-Lewandowski \and
+Thomas  Breuel \and
+Youssouf  Chherawala \and
+Moustapha  Cisse \and 
+Myriam  Côté \and 
+Dumitru  Erhan \and
+Jeremy  Eustache \and
+Xavier  Glorot \and 
+Xavier  Muller \and
+Sylvain  Pannetier Lebeuf \and
+Razvan  Pascanu \and 
+Salah  Rifai \and 
+Francois  Savard \and 
+Guillaume  Sicard 
+}
+\date{{\tt bengioy@iro.umontreal.ca}, Dept. IRO, U. Montreal, P.O. Box 6128, Centre-Ville branch, H3C 3J7, Montreal (Qc), Canada}
+%\jmlrheading{}{2010}{}{10/2010}{XX/2011}{Yoshua Bengio et al}
+%\editor{}
+
+%\makeanontitle
+\maketitle
+
+%{\bf Running title: Deep Self-Taught Learning}
+
+\vspace*{-2mm}
+\begin{abstract}
+  Recent theoretical and empirical work in statistical machine learning has demonstrated the potential of learning algorithms for deep architectures, i.e., function classes obtained by composing multiple levels of representation. Self-taught learning (exploiting unlabeled examples or examples from other distributions) has already been applied to deep learners, but mostly to show the advantage of unlabeled examples. Here we explore the advantage brought by {\em out-of-distribution examples}.  For this purpose we developed a powerful generator of stochastic variations and noise processes for character images, including not only affine transformations but also slant, local elastic deformations, changes in thickness, background images, grey level changes, contrast, occlusion, and various types of noise. The out-of-distribution examples are obtained from these highly distorted images or by including examples of object classes different from those in the target test set.  We show that {\em deep learners benefit more from out-of-distribution examples than a corresponding shallow learner}, at least in a large-scale handwritten character recognition setting. In fact, we show that they {\em beat previously published results and reach human-level performance}.
+\end{abstract}
+\vspace*{-3mm}
+
+%\begin{keywords}  
+%Deep learning, self-taught learning, out-of-distribution examples, handwritten character recognition, multi-task learning
+%\end{keywords}
+%\keywords{self-taught learning \and multi-task learning \and out-of-distribution examples \and deep learning \and handwriting recognition}
+
+
+
+\section{Introduction}
+\vspace*{-1mm}
+
+{\bf Deep Learning} has emerged as a promising new area of research in
+statistical machine learning~\citep{Hinton06,ranzato-07-small,Bengio-nips-2006,VincentPLarochelleH2008,ranzato-08,TaylorHintonICML2009,Larochelle-jmlr-2009,Salakhutdinov+Hinton-2009,HonglakL2009,HonglakLNIPS2009,Jarrett-ICCV2009,Taylor-cvpr-2010}. See \citet{Bengio-2009} for a review.
+Learning algorithms for deep architectures are centered on the learning
+of useful representations of data, which are better suited to the task at hand,
+and are organized in a hierarchy with multiple levels.
+This is in part inspired by observations of the mammalian visual cortex, 
+which consists of a chain of processing elements, each of which is associated with a
+different representation of the raw visual input. In fact,
+it was found recently that the features learnt in deep architectures resemble
+those observed in the first two of these stages (in areas V1 and V2
+of visual cortex) \citep{HonglakL2008}, and that they become more and
+more invariant to factors of variation (such as camera movement) in
+higher layers~\citep{Goodfellow2009}.
+Learning a hierarchy of features increases the
+ease and practicality of developing representations that are at once
+tailored to specific tasks, yet are able to borrow statistical strength
+from other related tasks (e.g., modeling different kinds of objects). Finally, learning the
+feature representation can lead to higher-level (more abstract, more
+general) features that are more robust to unanticipated sources of
+variance extant in real data.
+
+{\bf Self-taught learning}~\citep{RainaR2007} is a paradigm that combines principles
+of semi-supervised and multi-task learning: the learner can exploit examples
+that are unlabeled and possibly come from a distribution different from the target
+distribution, e.g., from other classes than those of interest. 
+It has already been shown that deep learners can clearly take advantage of
+unsupervised learning and unlabeled examples~\citep{Bengio-2009,WestonJ2008-small},
+but more needs to be done to explore the impact
+of {\em out-of-distribution} examples and of the {\em multi-task} setting
+(one exception is~\citep{CollobertR2008}, which uses a different kind
+of learning algorithm). In particular the {\em relative
+advantage of deep learning} for these settings has not been evaluated.
+The hypothesis discussed in the conclusion is that in the context of 
+multi-task learning and the availability of out-of-distribution training examples, 
+a deep hierarchy of features
+may be better able to provide {\em sharing of statistical strength}
+between different regions in input space or different tasks, compared to
+a shallow learner.
+
+\iffalse
+Whereas a deep architecture can in principle be more powerful than a
+shallow one in terms of representation, depth appears to render the
+training problem more difficult in terms of optimization and local minima.
+It is also only recently that successful algorithms were proposed to
+overcome some of these difficulties.  All are based on unsupervised
+learning, often in a greedy layer-wise ``unsupervised pre-training''
+stage~\citep{Bengio-2009}.  One of these layer initialization techniques,
+applied here, is the Denoising
+Auto-encoder~(DA)~\citep{VincentPLarochelleH2008-very-small} (see Figure~\ref{fig:da}), 
+which
+performed similarly or better than previously proposed Restricted Boltzmann
+Machines in terms of unsupervised extraction of a hierarchy of features
+useful for classification. Each layer is trained to denoise its
+input, creating a layer of features that can be used as input for the next layer.  
+\fi
+
+%The principle is that each layer starting from
+%the bottom is trained to encode its input (the output of the previous
+%layer) and to reconstruct it from a corrupted version. After this
+%unsupervised initialization, the stack of DAs can be
+%converted into a deep supervised feedforward neural network and fine-tuned by
+%stochastic gradient descent.
+
+%
+The {\bf main claim} of this paper is that deep learners (with several levels of representation) can
+{\bf benefit more from self-taught learning than shallow learners} (with a single
+level), both in the context of the multi-task setting and from {\em
+  out-of-distribution examples} in general. Because we are able to improve on state-of-the-art
+performance and reach human-level performance
+on a large-scale task, we consider that this paper is also a contribution
+to advance the application of machine learning to handwritten character recognition.
+More precisely, we ask and answer the following questions:
+
+%\begin{enumerate}
+$\bullet$ %\item 
+Do the good results previously obtained with deep architectures on the
+MNIST digit images generalize to the setting of a similar but much larger and richer
+dataset, the NIST special database 19, with 62 classes and around 800k examples?
+
+$\bullet$ %\item 
+To what extent does the perturbation of input images (e.g. adding
+noise, affine transformations, background images) make the resulting
+classifiers better not only on similarly perturbed images but also on
+the {\em original clean examples}? We study this question in the
+context of the 62-class and 10-class tasks of the NIST special database 19.
+
+$\bullet$ %\item 
+Do deep architectures {\em benefit {\bf more} from such out-of-distribution}
+examples, i.e. do they benefit more from the self-taught learning~\citep{RainaR2007} framework?
+We use highly perturbed examples to generate out-of-distribution examples.
+
+$\bullet$ %\item 
+Similarly, does the feature learning step in deep learning algorithms benefit {\bf more}
+from training with moderately {\em different classes} (i.e. a multi-task learning scenario) than
+a corresponding shallow and purely supervised architecture?
+We train on 62 classes and test on 10 (digits) or 26 (upper case or lower case)
+to answer this question.
+%\end{enumerate}
+
+Our experimental results provide positive evidence towards all of these questions,
+as well as {\em classifiers that reach human-level performance on 62-class isolated character
+recognition and beat previously published results on the NIST dataset (special database 19)}.
+To achieve these results, we introduce in the next section a sophisticated system
+for stochastically transforming character images and then explain the methodology,
+which is based on training with or without these transformed images and testing on 
+clean ones. We measure the relative advantage of out-of-distribution examples
+(perturbed or out-of-class)
+for a deep learner vs a supervised shallow one.
+Code for generating these transformations as well as for the deep learning 
+algorithms is made available at {\tt http://anonymous.url.net}.%{\tt http://hg.assembla.com/ift6266}.
+We also estimate the relative advantage for deep learners of training with
+other classes than those of interest, by comparing learners trained with
+62 classes with learners trained with only a subset (on which they
+are then tested).
+The conclusion discusses
+the more general question of why deep learners may benefit so much from 
+the self-taught learning framework. Since out-of-distribution data
+(perturbed or from other related classes) is very common, this conclusion
+is of practical importance.
+
+\vspace*{-3mm}
+%\newpage
+\section{Perturbed and Transformed Character Images}
+\label{s:perturbations}
+\vspace*{-2mm}
+
+Figure~\ref{fig:transform} shows the different transformations we used to stochastically
+transform $32 \times 32$ source images (such as the one in Fig.~\ref{fig:torig})
+in order to obtain data from a larger distribution which
+covers a domain substantially larger than the clean characters distribution from
+which we start.
+Although character transformations have been used before to
+improve character recognizers, this effort is on a large scale both
+in number of classes and in the complexity of the transformations, hence
+in the complexity of the learning task.
+The code for these transformations (mostly python) is available at 
+{\tt http://anonymous.url.net}. All the modules in the pipeline share
+a global control parameter ($0 \le complexity \le 1$) that allows one to modulate the
+amount of deformation or noise introduced. 
+There are two main parts in the pipeline. The first one,
+from slant to pinch below, performs transformations. The second
+part, from blur to contrast, adds different kinds of noise.
+More details can be found in~\citep{ift6266-tr-anonymous}.
+
+\begin{figure}[ht]
+\centering
+\subfigure[Original]{\includegraphics[scale=0.6]{images/Original.png}\label{fig:torig}}
+\subfigure[Thickness]{\includegraphics[scale=0.6]{images/Thick_only.png}}
+\subfigure[Slant]{\includegraphics[scale=0.6]{images/Slant_only.png}}
+\subfigure[Affine Transformation]{\includegraphics[scale=0.6]{images/Affine_only.png}}
+\subfigure[Local Elastic Deformation]{\includegraphics[scale=0.6]{images/Localelasticdistorsions_only.png}}
+\subfigure[Pinch]{\includegraphics[scale=0.6]{images/Pinch_only.png}}
+%Noise
+\subfigure[Motion Blur]{\includegraphics[scale=0.6]{images/Motionblur_only.png}}
+\subfigure[Occlusion]{\includegraphics[scale=0.6]{images/occlusion_only.png}}
+\subfigure[Gaussian Smoothing]{\includegraphics[scale=0.6]{images/Bruitgauss_only.png}}
+\subfigure[Pixels Permutation]{\includegraphics[scale=0.6]{images/Permutpixel_only.png}}
+\subfigure[Gaussian Noise]{\includegraphics[scale=0.6]{images/Distorsiongauss_only.png}}
+\subfigure[Background Image Addition]{\includegraphics[scale=0.6]{images/background_other_only.png}}
+\subfigure[Salt \& Pepper]{\includegraphics[scale=0.6]{images/Poivresel_only.png}}
+\subfigure[Scratches]{\includegraphics[scale=0.6]{images/Rature_only.png}}
+\subfigure[Grey Level \& Contrast]{\includegraphics[scale=0.6]{images/Contrast_only.png}}
+\caption{Top left (a): example original image. Others (b-o): examples of the effect
+of each transformation module taken separately. Actual perturbed examples are obtained by
+a pipeline of these, with random choices about which module to apply and how much perturbation
+to apply.}
+\label{fig:transform}
+\vspace*{-2mm}
+\end{figure}
+
+\vspace*{-3mm}
+\section{Experimental Setup}
+\vspace*{-1mm}
+
+Much previous work on deep learning had been performed on
+the MNIST digits task~\citep{Hinton06,ranzato-07-small,Bengio-nips-2006,Salakhutdinov+Hinton-2009},
+with 60~000 examples, and variants involving 10~000
+examples~\citep{Larochelle-jmlr-toappear-2008,VincentPLarochelleH2008}.
+The focus here is on much larger training sets, from 10 times
+to 1000 times larger, and 62 classes.
+
+The first step in constructing the larger datasets (called NISTP and P07) is to sample from
+a {\em data source}: {\bf NIST} (NIST database 19), {\bf Fonts}, {\bf Captchas},
+and {\bf OCR data} (scanned machine printed characters). Once a character
+is sampled from one of these sources (chosen randomly), the second step is to
+apply a pipeline of transformations and/or noise processes outlined in section \ref{s:perturbations}.
+
+To provide a baseline of error rate comparison we also estimate human performance
+on both the 62-class task and the 10-class digits task.
+We compare the best Multi-Layer Perceptrons (MLP) against
+the best Stacked Denoising Auto-encoders (SDA), when
+both models' hyper-parameters are selected to minimize the validation set error.
+We also provide a comparison against a precise estimate
+of human performance obtained via Amazon's Mechanical Turk (AMT)
+service ({\tt http://mturk.com}). 
+AMT users are paid small amounts
+of money to perform tasks for which human intelligence is required.
+Mechanical Turk has been used extensively in natural language processing and vision.
+%processing \citep{SnowEtAl2008} and vision
+%\citep{SorokinAndForsyth2008,whitehill09}. 
+AMT users were presented
+with 10 character images (from a test set) and asked to choose 10 corresponding ASCII
+characters. They were forced to choose a single character class (either among the
+62 or 10 character classes) for each image.
+80 subjects classified 2500 images per (dataset,task) pair.
+Different human labelers sometimes provided a different label for the same
+example, and we were able to estimate the error variance due to this effect
+because each image was classified by 3 different persons. 
+The average error of humans on the 62-class task NIST test set
+is 18.2\%, with a standard error of 0.1\%.
+
+\vspace*{-3mm}
+\subsection{Data Sources}
+\vspace*{-2mm}
+
+%\begin{itemize}
+%\item 
+{\bf NIST.}
+Our main source of characters is the NIST Special Database 19~\citep{Grother-1995}, 
+widely used for training and testing character
+recognition systems~\citep{Granger+al-2007,Cortes+al-2000,Oliveira+al-2002-short,Milgram+al-2005}. 
+The dataset is composed of 814255 digits and characters (upper and lower cases), with hand checked classifications,
+extracted from handwritten sample forms of 3600 writers. The characters are labelled by one of the 62 classes 
+corresponding to ``0''-``9'',``A''-``Z'' and ``a''-``z''. The dataset contains 8 parts (partitions) of varying complexity. 
+The fourth partition (called $hsf_4$, 82587 examples), 
+experimentally recognized to be the most difficult one, is the one recommended 
+by NIST as a testing set and is used in our work as well as some previous work~\citep{Granger+al-2007,Cortes+al-2000,Oliveira+al-2002-short,Milgram+al-2005}
+for that purpose. We randomly split the remainder (731668 examples) into a training set and a validation set for
+model selection. 
+The performances reported by previous work on that dataset mostly use only the digits.
+Here we use all the classes both in the training and testing phase. This is especially
+useful to estimate the effect of a multi-task setting.
+The distribution of the classes in the NIST training and test sets differs
+substantially, with relatively many more digits in the test set, and a more uniform distribution
+of letters in the test set (whereas in the training set they are distributed
+more like in natural text).
+%\vspace*{-1mm}
+
+%\item 
+{\bf Fonts.} 
+In order to have a good variety of sources we downloaded a large number of free fonts from:
+{\tt http://cg.scs.carleton.ca/\textasciitilde luc/freefonts.html}.
+% TODO: pointless to anonymize, it's not pointing to our work
+Including the operating system's (Windows 7) fonts, there is a total of $9817$ different fonts that we can choose uniformly from.
+The chosen {\tt ttf} file is either used as input of the Captcha generator (see next item) or, by producing a corresponding image, 
+directly as input to our models.
+\vspace*{-1mm}
+
+%\item 
+{\bf Captchas.}
+The Captcha data source is an adaptation of the \emph{pycaptcha} library (a python based captcha generator library) for 
+generating characters of the same format as the NIST dataset. This software is based on
+a random character class generator and various kinds of transformations similar to those described in the previous sections. 
+In order to increase the variability of the data generated, many different fonts are used for generating the characters. 
+Transformations (slant, distortions, rotation, translation) are applied to each randomly generated character with a complexity
+depending on the value of the complexity parameter provided by the user of the data source. 
+%Two levels of complexity are allowed and can be controlled via an easy to use facade class. %TODO: what's a facade class?
+\vspace*{-1mm}
+
+%\item 
+{\bf OCR data.}
+A large set (2 million) of scanned, OCRed and manually verified machine-printed 
+characters were included as an
+additional source. This set is part of a larger corpus being collected by the Image Understanding
+Pattern Recognition Research group led by Thomas Breuel at University of Kaiserslautern 
+({\tt http://www.iupr.com}), and which will be publicly released.
+%TODO: let's hope that Thomas is not a reviewer! :) Seriously though, maybe we should anonymize this
+%\end{itemize}
+
+\vspace*{-3mm}
+\subsection{Data Sets}
+\vspace*{-2mm}
+
+All data sets contain 32$\times$32 grey-level images (values in $[0,1]$) associated with a label
+from one of the 62 character classes.
+%\begin{itemize}
+\vspace*{-1mm}
+
+%\item 
+{\bf NIST.} This is the raw NIST special database 19~\citep{Grother-1995}. It has
+\{651668 / 80000 / 82587\} \{training / validation / test\} examples.
+\vspace*{-1mm}
+
+%\item 
+{\bf P07.} This dataset is obtained by taking raw characters from all four of the above sources
+and sending them through the transformation pipeline described in section \ref{s:perturbations}.
+For each new example to generate, a data source is selected with probability $10\%$ from the fonts,
+$25\%$ from the captchas, $25\%$ from the OCR data and $40\%$ from NIST. We apply all the transformations in the
+order given above, and for each of them we sample uniformly a \emph{complexity} in the range $[0,0.7]$.
+It has \{81920000 / 80000 / 20000\} \{training / validation / test\} examples.
+\vspace*{-1mm}
+
+%\item 
+{\bf NISTP.} This one is equivalent to P07 (complexity parameter of $0.7$ with the same proportions of data sources)
+  except that we only apply
+  transformations from slant to pinch. Therefore, the character is
+  transformed but no additional noise is added to the image, giving images
+  closer to the NIST dataset. 
+It has \{81920000 / 80000 / 20000\} \{training / validation / test\} examples.
+%\end{itemize}
+
+\vspace*{-3mm}
+\subsection{Models and their Hyperparameters}
+\vspace*{-2mm}
+
+The experiments are performed using MLPs (with a single
+hidden layer) and SDAs.
+\emph{Hyper-parameters are selected based on the {\bf NISTP} validation set error.}
+
+{\bf Multi-Layer Perceptrons (MLP).}
+Whereas previous work had compared deep architectures to both shallow MLPs and
+SVMs, we only compared to MLPs here because of the very large datasets used
+(making the use of SVMs computationally challenging because of their quadratic
+scaling behavior). Preliminary experiments on training SVMs (libSVM) with subsets of the training
+set allowing the program to fit in memory yielded substantially worse results
+than those obtained with MLPs. For training on nearly a billion examples
+(with the perturbed data), the MLPs and SDA are much more convenient than
+classifiers based on kernel methods.
+The MLP has a single hidden layer with $\tanh$ activation functions, and softmax (normalized
+exponentials) on the output layer for estimating $P(class | image)$.
+The number of hidden units is taken in $\{300,500,800,1000,1500\}$. 
+Training examples are presented in minibatches of size 20. A constant learning
+rate was chosen among $\{0.001, 0.01, 0.025, 0.075, 0.1, 0.5\}$.
+%through preliminary experiments (measuring performance on a validation set),
+%and $0.1$ (which was found to work best) was then selected for optimizing on
+%the whole training sets.
+\vspace*{-1mm}
+
+
+{\bf Stacked Denoising Auto-Encoders (SDA).}
+Various auto-encoder variants and Restricted Boltzmann Machines (RBMs)
+can be used to initialize the weights of each layer of a deep MLP (with many hidden 
+layers)~\citep{Hinton06,ranzato-07-small,Bengio-nips-2006}, 
+apparently setting parameters in the
+basin of attraction of supervised gradient descent yielding better 
+generalization~\citep{Erhan+al-2010}.  This initial {\em unsupervised
+pre-training phase} uses all of the training images but not the training labels.
+Each layer is trained in turn to produce a new representation of its input
+(starting from the raw pixels).
+It is hypothesized that the
+advantage brought by this procedure stems from a better prior,
+on the one hand taking advantage of the link between the input
+distribution $P(x)$ and the conditional distribution of interest
+$P(y|x)$ (like in semi-supervised learning), and on the other hand
+taking advantage of the expressive power and bias implicit in the
+deep architecture (whereby complex concepts are expressed as
+compositions of simpler ones through a deep hierarchy).
+
+\begin{figure}[ht]
+\vspace*{-2mm}
+\centerline{\resizebox{0.8\textwidth}{!}{\includegraphics{images/denoising_autoencoder_small.pdf}}}
+\vspace*{-2mm}
+\caption{Illustration of the computations and training criterion for the denoising
+auto-encoder used to pre-train each layer of the deep architecture. Input $x$ of
+the layer (i.e. raw input or output of previous layer)
+is corrupted into $\tilde{x}$ and encoded into code $y$ by the encoder $f_\theta(\cdot)$.
+The decoder $g_{\theta'}(\cdot)$ maps $y$ to reconstruction $z$, which
+is compared to the uncorrupted input $x$ through the loss function
+$L_H(x,z)$, whose expected value is approximately minimized during training
+by tuning $\theta$ and $\theta'$.}
+\label{fig:da}
+\vspace*{-2mm}
+\end{figure}
+
+Here we chose to use the Denoising
+Auto-encoder~\citep{VincentPLarochelleH2008} as the building block for
+these deep hierarchies of features, as it is simple to train and
+explain (see Figure~\ref{fig:da}, as well as the
+tutorial and code at {\tt http://deeplearning.net/tutorial}),
+provides efficient inference, and yielded results
+comparable to or better than RBMs in a series of experiments
+\citep{VincentPLarochelleH2008}. During training, a Denoising
+Auto-encoder is presented with a stochastically corrupted version
+of the input and trained to reconstruct the uncorrupted input,
+forcing the hidden units to represent the leading regularities in
+the data. Here we use the random binary masking corruption
+(which sets to 0 a random subset of the inputs).
+ Once it is trained, in a purely unsupervised way, 
+its hidden units' activations can
+be used as inputs for training a second one, etc.
+After this unsupervised pre-training stage, the parameters
+are used to initialize a deep MLP, which is fine-tuned by
+the same standard procedure used to train MLPs (see previous section).
+The SDA hyper-parameters are the same as for the MLP, with the addition of the
+amount of corruption noise (we used the masking noise process, whereby a
+fixed proportion of the input values, randomly selected, are zeroed), and a
+separate learning rate for the unsupervised pre-training stage (selected
+from the same above set). The fraction of inputs corrupted was selected
+among $\{10\%, 20\%, 50\%\}$. Another hyper-parameter is the number
+of hidden layers but it was fixed to 3 based on previous work with
+SDAs on MNIST~\citep{VincentPLarochelleH2008}. The size of the hidden
+layers was kept constant across hidden layers, and the best results
+were obtained with the largest values that we could experiment
+with given our patience, with 1000 hidden units.
+
+\vspace*{-1mm}
+
+\begin{figure}[ht]
+%\vspace*{-2mm}
+\centerline{\resizebox{.99\textwidth}{!}{\includegraphics{images/error_rates_charts.pdf}}}
+%\vspace*{-3mm}
+\caption{SDAx are the {\bf deep} models. Error bars indicate a 95\% confidence interval. 0 indicates that the model was trained
+on NIST, 1 on NISTP, and 2 on P07. Left: overall results
+of all models, on NIST and NISTP test sets.
+Right: error rates on NIST test digits only, along with the previous results from 
+literature~\citep{Granger+al-2007,Cortes+al-2000,Oliveira+al-2002-short,Milgram+al-2005}
+respectively based on ART, nearest neighbors, MLPs, and SVMs.}
+\label{fig:error-rates-charts}
+\vspace*{-2mm}
+\end{figure}
+
+
+\begin{figure}[ht]
+\vspace*{-3mm}
+\centerline{\resizebox{.99\textwidth}{!}{\includegraphics{images/improvements_charts.pdf}}}
+\vspace*{-3mm}
+\caption{Relative improvement in error rate due to self-taught learning. 
+Left: Improvement (or loss, when negative)
+induced by out-of-distribution examples (perturbed data). 
+Right: Improvement (or loss, when negative) induced by multi-task 
+learning (training on all classes and testing only on either digits,
+upper case, or lower-case). The deep learner (SDA) benefits more from
+both self-taught learning scenarios, compared to the shallow MLP.}
+\label{fig:improvements-charts}
+\vspace*{-2mm}
+\end{figure}
+
+\section{Experimental Results}
+\vspace*{-2mm}
+
+%%\vspace*{-1mm}
+%\subsection{SDA vs MLP vs Humans}
+%%\vspace*{-1mm}
+The models are either trained on NIST (MLP0 and SDA0), 
+NISTP (MLP1 and SDA1), or P07 (MLP2 and SDA2), and tested
+on either NIST, NISTP or P07, either on the 62-class task
+or on the 10-digits task. Training (including about half
+for unsupervised pre-training, for DAs) on the larger
+datasets takes around one day on a GPU-285.
+Figure~\ref{fig:error-rates-charts} summarizes the results obtained,
+comparing humans, the three MLPs (MLP0, MLP1, MLP2) and the three SDAs (SDA0, SDA1,
+SDA2), along with the previous results on the digits NIST special database
+19 test set from the literature, respectively based on ARTMAP neural
+networks ~\citep{Granger+al-2007}, fast nearest-neighbor search
+~\citep{Cortes+al-2000}, MLPs ~\citep{Oliveira+al-2002-short}, and SVMs
+~\citep{Milgram+al-2005}.%  More detailed and complete numerical results
+%(figures and tables, including standard errors on the error rates) can be
+%found in Appendix.
+The deep learner not only outperformed the shallow ones and
+previously published performance (in a statistically and qualitatively
+significant way) but when trained with perturbed data
+reaches human performance on both the 62-class task
+and the 10-class (digits) task. 
+17\% error (SDA1) or 18\% error (humans) may seem large but a large
+majority of the errors from humans and from SDA1 are from out-of-context
+confusions (e.g. a vertical bar can be a ``1'', an ``l'' or an ``L'', and a
+``c'' and a ``C'' are often indistinguishable).
+
+In addition, as shown in the left of
+Figure~\ref{fig:improvements-charts}, the relative improvement in error
+rate brought by self-taught learning is greater for the SDA, and these
+differences with the MLP are statistically and qualitatively
+significant. 
+The left side of the figure shows the improvement to the clean
+NIST test set error brought by the use of out-of-distribution examples
+(i.e. the perturbed examples from NISTP or P07).
+Relative percent change is measured by taking
+$100 \% \times$ (original model's error / perturbed-data model's error - 1).
+The right side of
+Figure~\ref{fig:improvements-charts} shows the relative improvement
+brought by the use of a multi-task setting, in which the same model is
+trained for more classes than the target classes of interest (i.e. training
+with all 62 classes when the target classes are respectively the digits,
+lower-case, or upper-case characters). Again, whereas the gain from the
+multi-task setting is marginal or negative for the MLP, it is substantial
+for the SDA.  Note that to simplify these multi-task experiments, only the original
+NIST dataset is used. For example, the MLP-digits bar shows the relative
+percent improvement in MLP error rate on the NIST digits test set,
+computed as $100\% \times$ (single-task
+model's error / multi-task model's error - 1).  The single-task model is
+trained with only 10 outputs (one per digit), seeing only digit examples,
+whereas the multi-task model is trained with 62 outputs, with all 62
+character classes as examples.  Hence the hidden units are shared across
+all tasks.  For the multi-task model, the digit error rate is measured by
+comparing the correct digit class with the output class associated with the
+maximum conditional probability among only the digit classes outputs.  The
+setting is similar for the other two target classes (lower case characters
+and upper case characters).
+%%\vspace*{-1mm}
+%\subsection{Perturbed Training Data More Helpful for SDA}
+%%\vspace*{-1mm}
+
+%%\vspace*{-1mm}
+%\subsection{Multi-Task Learning Effects}
+%%\vspace*{-1mm}
+
+\iffalse
+As previously seen, the SDA is better able to benefit from the
+transformations applied to the data than the MLP. In this experiment we
+define three tasks: recognizing digits (knowing that the input is a digit),
+recognizing upper case characters (knowing that the input is one), and
+recognizing lower case characters (knowing that the input is one).  We
+consider the digit classification task as the target task and we want to
+evaluate whether training with the other tasks can help or hurt, and
+whether the effect is different for MLPs versus SDAs.  The goal is to find
+out if deep learning can benefit more (or less) from multiple related tasks
+(i.e. the multi-task setting) compared to a corresponding purely supervised
+shallow learner.
+
+We use a single hidden layer MLP with 1000 hidden units, and a SDA
+with 3 hidden layers (1000 hidden units per layer), pre-trained and
+fine-tuned on NIST.
+
+Our results show that the MLP benefits marginally from the multi-task setting
+in the case of digits (5\% relative improvement) but is actually hurt in the case
+of characters (respectively 3\% and 4\% worse for lower and upper case characters).
+On the other hand the SDA benefited from the multi-task setting, with relative
+error rate improvements of 27\%, 15\% and 13\% respectively for digits,
+lower and upper case characters, as shown in Table~\ref{tab:multi-task}.
+\fi
+
+
+\vspace*{-2mm}
+\section{Conclusions and Discussion}
+\vspace*{-2mm}
+
+We have found that the self-taught learning framework is more beneficial
+to a deep learner than to a traditional shallow and purely
+supervised learner. More precisely, 
+the answers are positive for all the questions asked in the introduction.
+%\begin{itemize}
+
+$\bullet$ %\item 
+{\bf Do the good results previously obtained with deep architectures on the
+MNIST digits generalize to a much larger and richer (but similar)
+dataset, the NIST special database 19, with 62 classes and around 800k examples}?
+Yes, the SDA {\em systematically outperformed the MLP and all the previously
+published results on this dataset} (the ones that we are aware of), {\em in fact reaching human-level
+performance} at around 17\% error on the 62-class task and 1.4\% on the digits,
+and beating previously published results on the same data.
+
+$\bullet$ %\item 
+{\bf To what extent do self-taught learning scenarios help deep learners,
+and do they help them more than shallow supervised ones}?
+We found that distorted training examples not only made the resulting
+classifier better on similarly perturbed images but also on
+the {\em original clean examples}, and more importantly and more novel,
+that deep architectures benefit more from such {\em out-of-distribution}
+examples. MLPs were helped by perturbed training examples when tested on perturbed input 
+images (65\% relative improvement on NISTP) 
+but only marginally helped (5\% relative improvement on all classes) 
+or even hurt (10\% relative loss on digits)
+with respect to clean examples. On the other hand, the deep SDAs
+were significantly boosted by these out-of-distribution examples.
+Similarly, whereas the improvement due to the multi-task setting was marginal or
+negative for the MLP (from +5.6\% to -3.6\% relative change), 
+it was quite significant for the SDA (from +13\% to +27\% relative change),
+which may be explained by the arguments below.
+%\end{itemize}
+
+In the original self-taught learning framework~\citep{RainaR2007}, the
+out-of-sample examples were used as a source of unsupervised data, and
+experiments showed its positive effects in a \emph{limited labeled data}
+scenario. However, many of the results by \citet{RainaR2007} (who used a
+shallow, sparse coding approach) suggest that the {\em relative gain of self-taught
+learning vs ordinary supervised learning} diminishes as the number of labeled examples increases.
+We note instead that, for deep
+architectures, our experiments show that such a positive effect is accomplished
+even in a scenario with a \emph{large number of labeled examples},
+i.e., here, the relative gain of self-taught learning is probably preserved
+in the asymptotic regime.
+
+{\bf Why would deep learners benefit more from the self-taught learning framework}?
+The key idea is that the lower layers of the predictor compute a hierarchy
+of features that can be shared across tasks or across variants of the
+input distribution. A theoretical analysis of generalization improvements
+due to sharing of intermediate features across tasks already points
+towards that explanation~\citep{baxter95a}.
+Intermediate features that can be used in different
+contexts can be estimated in a way that allows sharing of statistical 
+strength. Features extracted through many levels are more likely to
+be more abstract and more invariant to some of the factors of variation
+in the underlying distribution (as the experiments in~\citet{Goodfellow2009} suggest),
+increasing the likelihood that they would be useful for a larger array
+of tasks and input conditions.
+Therefore, we hypothesize that both depth and unsupervised
+pre-training play a part in explaining the advantages observed here, and future
+experiments could attempt to tease apart these factors.
+And why would deep learners benefit from the self-taught learning
+scenarios even when the number of labeled examples is very large?
+We hypothesize that this is related to the hypotheses studied
+in~\citet{Erhan+al-2010}. In~\citet{Erhan+al-2010}
+it was found that online learning on a huge dataset did not make the
+advantage of the deep learning bias vanish, and a similar phenomenon
+may be happening here. We hypothesize that unsupervised pre-training
+of a deep hierarchy with self-taught learning initializes the
+model in the basin of attraction of supervised gradient descent
+that corresponds to better generalization. Furthermore, such good
+basins of attraction are not discovered by pure supervised learning
+(with or without self-taught settings) from random initialization, and more labeled examples
+do not allow the shallow or purely supervised models to discover
+the kind of better basins associated
+with deep learning and self-taught learning.
+ 
+A Flash demo of the recognizer (where both the MLP and the SDA can be compared) 
+can be executed on-line at {\tt http://deep.host22.com}.
+
+\iffalse
+\section*{Appendix I: Detailed Numerical Results}
+
+These tables correspond to Figures 2 and 3 and contain the raw error rates for each model and dataset considered.
+They also contain additional data such as test errors on P07 and standard errors.
+
+\begin{table}[ht]
+\caption{Overall comparison of error rates ($\pm$ std.err.) on 62 character classes (10 digits +
+26 lower + 26 upper), except for the last column (digits only), between a deep architecture with pre-training
+(SDA=Stacked Denoising Autoencoder) and an ordinary shallow architecture 
+(MLP=Multi-Layer Perceptron). The models shown are all trained using perturbed data (NISTP or P07)
+and using a validation set to select hyper-parameters and other training choices. 
+\{SDA,MLP\}0 are trained on NIST,
+\{SDA,MLP\}1 are trained on NISTP, and \{SDA,MLP\}2 are trained on P07.
+The human error rate on digits is a lower bound because it does not count digits that were
+recognized as letters. For comparison, the results found in the literature
+on NIST digits classification using the same test set are included.}
+\label{tab:sda-vs-mlp-vs-humans}
+\begin{center}
+\begin{tabular}{|l|r|r|r|r|} \hline
+      & NIST test          & NISTP test       & P07 test       & NIST test digits   \\ \hline
+Humans&   18.2\% $\pm$.1\%   &  39.4\%$\pm$.1\%   &  46.9\%$\pm$.1\%  &  $1.4\%$ \\ \hline 
+SDA0   &  23.7\% $\pm$.14\%  &  65.2\%$\pm$.34\%  & 97.45\%$\pm$.06\%  & 2.7\% $\pm$.14\%\\ \hline 
+SDA1   &  17.1\% $\pm$.13\%  &  29.7\%$\pm$.3\%  & 29.7\%$\pm$.3\%  & 1.4\% $\pm$.1\%\\ \hline 
+SDA2   &  18.7\% $\pm$.13\%  &  33.6\%$\pm$.3\%  & 39.9\%$\pm$.17\%  & 1.7\% $\pm$.1\%\\ \hline 
+MLP0   &  24.2\% $\pm$.15\%  & 68.8\%$\pm$.33\%  & 78.70\%$\pm$.14\%  & 3.45\% $\pm$.15\% \\ \hline 
+MLP1   &  23.0\% $\pm$.15\%  &  41.8\%$\pm$.35\%  & 90.4\%$\pm$.1\%  & 3.85\% $\pm$.16\% \\ \hline 
+MLP2   &  24.3\% $\pm$.15\%  &  46.0\%$\pm$.35\%  & 54.7\%$\pm$.17\%  & 4.85\% $\pm$.18\% \\ \hline 
+\citep{Granger+al-2007} &     &                    &                   & 4.95\% $\pm$.18\% \\ \hline
+\citep{Cortes+al-2000} &      &                    &                   & 3.71\% $\pm$.16\% \\ \hline
+\citep{Oliveira+al-2002} &    &                    &                   & 2.4\% $\pm$.13\% \\ \hline
+\citep{Milgram+al-2005} &      &                    &                   & 2.1\% $\pm$.12\% \\ \hline
+\end{tabular}
+\end{center}
+\end{table}
+
+\begin{table}[ht]
+\caption{Relative change in error rates due to the use of perturbed training data,
+either using NISTP, for the MLP1/SDA1 models, or using P07, for the MLP2/SDA2 models.
+A positive value indicates that training on the perturbed data helped for the
+given test set (the first 3 columns on the 62-class tasks and the last one is
+on the clean 10-class digits). Clearly, the deep learning models did benefit more
+from perturbed training data, even when testing on clean data, whereas the MLP
+trained on perturbed data performed worse on the clean digits and about the same
+on the clean characters. }
+\label{tab:perturbation-effect}
+\begin{center}
+\begin{tabular}{|l|r|r|r|r|} \hline
+      & NIST test          & NISTP test      & P07 test       & NIST test digits   \\ \hline
+SDA0/SDA1-1   &  38\%      &  84\%           & 228\%          &  93\% \\ \hline 
+SDA0/SDA2-1   &  27\%      &  94\%           & 144\%          &  59\% \\ \hline 
+MLP0/MLP1-1   &  5.2\%     &  65\%           & -13\%          & -10\%  \\ \hline 
+MLP0/MLP2-1   &  -0.4\%    &  49\%           & 44\%           & -29\% \\ \hline 
+\end{tabular}
+\end{center}
+\end{table}
+
+\begin{table}[ht]
+\caption{Test error rates and relative change in error rates due to the use of
+a multi-task setting, i.e., training on each task in isolation vs training
+for all three tasks together, for MLPs vs SDAs. The SDA benefits much
+more from the multi-task setting. All experiments are only on the
+unperturbed NIST data, using validation error for model selection.
+Relative improvement is 1 - single-task error / multi-task error.}
+\label{tab:multi-task}
+\begin{center}
+\begin{tabular}{|l|r|r|r|} \hline
+             & single-task  & multi-task  & relative \\ 
+             & setting      & setting     & improvement \\ \hline
+MLP-digits   &  3.77\%      &  3.99\%     & 5.6\%   \\ \hline 
+MLP-lower   &  17.4\%      &  16.8\%     &  -4.1\%    \\ \hline 
+MLP-upper   &  7.84\%     &  7.54\%      & -3.6\%    \\ \hline 
+SDA-digits   &  2.6\%      &  3.56\%     & 27\%    \\ \hline 
+SDA-lower   &  12.3\%      &  14.4\%    & 15\%    \\ \hline 
+SDA-upper   &  5.93\%     &  6.78\%      & 13\%    \\ \hline 
+\end{tabular}
+\end{center}
+\end{table}
+
+\fi
+
+%\afterpage{\clearpage}
+%\clearpage
+{
+%\bibliographystyle{spbasic}      % basic style, author-year citations
+\bibliographystyle{plainnat}
+\bibliography{strings,strings-short,strings-shorter,ift6266_ml,specials,aigaion-shorter}
+%\bibliographystyle{unsrtnat}
+%\bibliographystyle{apalike}
+}
+
+
+\end{document}
--- a/writeup/strings-shorter.bib	Sat Mar 19 22:49:33 2011 -0400
+++ b/writeup/strings-shorter.bib	Sat Mar 19 22:51:40 2011 -0400
@@ -81,20 +81,20 @@
 @String{ICDAR03 =  "Proc. {ICDAR}'03"}
 @String{ICDAR07 =  "Proc. {ICDAR}'07"}
 
-@String{ICML96 = "{ICML} 1996"}
-@String{ICML97 = "{ICML} 1997"}
-@String{ICML98 = "{ICML} 1998"}
-@String{ICML99 = "{ICML} 1999"}
-@String{ICML00 = "{ICML} 2000"}
-@String{ICML01 = "{ICML} 2001"}
-@String{ICML02 = "{ICML} 2002"}
-@String{ICML03 = "{ICML} 2003"}
-@String{ICML04 = "{ICML} 2004"}
-@String{ICML05 = "{ICML} 2005"}
-@String{ICML06 = "{ICML} 2006"}
-@String{ICML07 = "{ICML} 2007"}
-@String{ICML08 = "{ICML} 2008"}
-@String{ICML09 = "{ICML} 2009"}
+@String{ICML96 = "{ICML}"}
+@String{ICML97 = "{ICML}"}
+@String{ICML98 = "{ICML}"}
+@String{ICML99 = "{ICML}"}
+@String{ICML00 = "{ICML}"}
+@String{ICML01 = "{ICML}"}
+@String{ICML02 = "{ICML}"}
+@String{ICML03 = "{ICML}"}
+@String{ICML04 = "{ICML}"}
+@String{ICML05 = "{ICML}"}
+@String{ICML06 = "{ICML}"}
+@String{ICML07 = "{ICML}"}
+@String{ICML08 = "{ICML}"}
+@String{ICML09 = "{ICML}"}
 @string{icml09loc = {}}
 @STRING{aistats05 = "AISTATS'2005"}
 @STRING{aistats07 = "AISTATS'2007"}