# HG changeset patch # User Olivier Delalleau # Date 1275585413 14400 # Node ID 08709b62e5748afe09fa1441da13c5e884ddb697 # Parent b9b811e886ae236955c6d7c15e8cf994c16f2e99# Parent 608c06e4008b4ea68d2c5a46934e3025b6a670bf Merged diff -r 608c06e4008b -r 08709b62e574 writeup/ift6266_ml.bib --- a/writeup/ift6266_ml.bib Thu Jun 03 13:12:47 2010 -0400 +++ b/writeup/ift6266_ml.bib Thu Jun 03 13:16:53 2010 -0400 @@ -25816,7 +25816,7 @@ @inproceedings{Grother-1995, AUTHOR = "Grother, P.J.", - TITLE = "Handprinted Forms and Character Database, NIST Special Database 19", + TITLE = "Handprinted Forms and Character Database, {NIST} Special Database 19", BOOKTITLE = "National Institute of Standards and Technology (NIST) Intelligent Systems Division (NISTIR)", YEAR = "1995", BIBSOURCE = "http://www.visionbib.com/bibliography/char1015.html#TT105853"} @@ -25836,4 +25836,4 @@ Training Data for Deep Architectures", institution = "University X.", year = 2010, -} \ No newline at end of file +} diff -r 608c06e4008b -r 08709b62e574 writeup/nips2010_submission.tex --- a/writeup/nips2010_submission.tex Thu Jun 03 13:12:47 2010 -0400 +++ b/writeup/nips2010_submission.tex Thu Jun 03 13:16:53 2010 -0400 @@ -510,9 +510,10 @@ To provide a baseline of error rate comparison we also estimate human performance on both the 62-class task and the 10-class digits task. -We compare the best MLPs against -the best SDAs (both models' hyper-parameters are selected to minimize the validation set error), -along with a comparison against a precise estimate +We compare the best Multi-Layer Perceptrons (MLP) against +the best Stacked Denoising Auto-encoders (SDA), when +both models' hyper-parameters are selected to minimize the validation set error. +We also provide a comparison against a precise estimate of human performance obtained via Amazon's Mechanical Turk (AMT) service (http://mturk.com). AMT users are paid small amounts @@ -552,7 +553,7 @@ useful to estimate the effect of a multi-task setting. The distribution of the classes in the NIST training and test sets differs substantially, with relatively many more digits in the test set, and a more uniform distribution -of letters in the test set (where the letters are distributed +of letters in the test set (whereas in the training set they are distributed more like in natural text). \vspace*{-1mm} @@ -623,8 +624,8 @@ \subsection{Models and their Hyperparameters} \vspace*{-2mm} -The experiments are performed with Multi-Layer Perceptrons (MLP) with a single -hidden layer and with Stacked Denoising Auto-Encoders (SDA). +The experiments are performed using MLPs (with a single +hidden layer) and SDAs. \emph{Hyper-parameters are selected based on the {\bf NISTP} validation set error.} {\bf Multi-Layer Perceptrons (MLP).} @@ -638,7 +639,8 @@ Training examples are presented in minibatches of size 20. A constant learning rate was chosen among $\{0.001, 0.01, 0.025, 0.075, 0.1, 0.5\}$ through preliminary experiments (measuring performance on a validation set), -and $0.1$ was then selected for optimizing on the whole training sets. +and $0.1$ (which was found to work best) was then selected for optimizing on +the whole training sets. \vspace*{-1mm} @@ -674,14 +676,14 @@ \end{figure} Here we chose to use the Denoising -Auto-Encoder~\citep{VincentPLarochelleH2008} as the building block for +Auto-encoder~\citep{VincentPLarochelleH2008} as the building block for these deep hierarchies of features, as it is very simple to train and explain (see Figure~\ref{fig:da}, as well as tutorial and code there: {\tt http://deeplearning.net/tutorial}), -provides immediate and efficient inference, and yielded results +provides efficient inference, and yielded results comparable or better than RBMs in series of experiments \citep{VincentPLarochelleH2008}. During training, a Denoising -Auto-Encoder is presented with a stochastically corrupted version +Auto-encoder is presented with a stochastically corrupted version of the input and trained to reconstruct the uncorrupted input, forcing the hidden units to represent the leading regularities in the data. Once it is trained, in a purely unsupervised way, @@ -744,7 +746,7 @@ Figure~\ref{fig:error-rates-charts} summarizes the results obtained, comparing humans, the three MLPs (MLP0, MLP1, MLP2) and the three SDAs (SDA0, SDA1, SDA2), along with the previous results on the digits NIST special database -19 test set from the literature respectively based on ARTMAP neural +19 test set from the literature, respectively based on ARTMAP neural networks ~\citep{Granger+al-2007}, fast nearest-neighbor search ~\citep{Cortes+al-2000}, MLPs ~\citep{Oliveira+al-2002-short}, and SVMs ~\citep{Milgram+al-2005}. More detailed and complete numerical results @@ -780,8 +782,8 @@ for the SDA. Note that to simplify these multi-task experiments, only the original NIST dataset is used. For example, the MLP-digits bar shows the relative percent improvement in MLP error rate on the NIST digits test set -is $100\% \times$ (1 - single-task -model's error / multi-task model's error). The single-task model is +is $100\% \times$ (single-task +model's error / multi-task model's error - 1). The single-task model is trained with only 10 outputs (one per digit), seeing only digit examples, whereas the multi-task model is trained with 62 outputs, with all 62 character classes as examples. Hence the hidden units are shared across