# HG changeset patch
# User Yoshua Bengio
# Date 1275357000 14400
# Node ID 6beaf3328521c8f262bd1e5ef8e1b26efdb88b88
# Parent  9a757d565e468e2d52b6535e6ce84f08f51155c7
tables removed

diff -r 9a757d565e46 -r 6beaf3328521 writeup/nips2010_submission.tex
--- a/writeup/nips2010_submission.tex Mon May 31 20:42:22 2010 -0400
+++ b/writeup/nips2010_submission.tex Mon May 31 21:50:00 2010 -0400
@@ -461,56 +461,58 @@
 \vspace*{-1mm}
 \section{Experimental Results}
-\vspace*{-1mm}
-\subsection{SDA vs MLP vs Humans}
-\vspace*{-1mm}
+%\vspace*{-1mm}
+%\subsection{SDA vs MLP vs Humans}
+%\vspace*{-1mm}
 
-We compare here the best MLP (according to validation set error) that we found against
+We compare the best MLP (according to validation set error) that we found against
 the best SDA (again according to validation set error), along with
 a precise estimate
 of human performance obtained via Amazon's Mechanical Turk (AMT)
-service\footnote{http://mturk.com}. AMT users are paid small amounts
-of money to perform tasks for which human intelligence is required.
-Mechanical Turk has been used extensively in natural language
-processing \citep{SnowEtAl2008} and vision
-\citep{SorokinAndForsyth2008,whitehill09}. AMT users where presented
+service\footnote{http://mturk.com}.
+%AMT users are paid small amounts
+%of money to perform tasks for which human intelligence is required.
+%Mechanical Turk has been used extensively in natural language
+%processing \citep{SnowEtAl2008} and vision
+%\citep{SorokinAndForsyth2008,whitehill09}.
+AMT users were presented
 with 10 character images and asked to type 10 corresponding ascii
 characters. They were forced to make a hard choice among the
 62 or 10 character classes (all classes or digits only).
 Three users classified each image, allowing
 to estimate inter-human variability (shown as +/- in parenthesis below).
-Figure~\ref{fig:error-rates-charts} summarizes the results obtained.
-More detailed results and tables can be found in the appendix.
+Figure~\ref{fig:error-rates-charts} summarizes the results obtained,
+comparing Humans, three MLPs (MLP0, MLP1, MLP2) and three SDAs (SDA0, SDA1,
+SDA2), along with previous results from the literature on the NIST special
+database 19 digits test set, respectively based on
+ARTMAP neural networks~\citep{Granger+al-2007},
+fast nearest-neighbor search~\citep{Cortes+al-2000},
+MLPs~\citep{Oliveira+al-2002},
+and SVMs~\citep{Milgram+al-2005}.
+More detailed and complete numerical results (figures and tables)
+can be found in the appendix. The three kinds of models differ in the
+training sets used: NIST only (MLP0, SDA0), NISTP (MLP1, SDA1),
+or P07 (MLP2, SDA2). The deep learner not only outperformed
+the shallow ones and previously published results
+but also reached human performance on both the 62-class
+task and the 10-class (digits) task. In addition, as shown
+on the left of Figure~\ref{fig:improvements-charts},
+the relative improvement in error rate brought by
+self-taught learning is greater for the SDA. The left
+side shows the improvement in the clean NIST test set error
+brought by the use of out-of-distribution
+examples (i.e., the perturbed examples from NISTP
+or P07). The right side of Figure~\ref{fig:improvements-charts}
+shows the relative improvement brought by the use
+of a multi-task setting, in which the same model is trained
+for more classes than the target classes of interest
+(i.e., training with all 62 classes when the target classes
+are respectively the digits, lower-case, or upper-case
+characters). Again, whereas the gain is marginal
+or negative for the MLP, it is substantial for the SDA.
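+For instance, writing $\mathrm{err}(\cdot)$ for a model's test error rate,
+the relative gain of SDA1 over SDA0 on the clean NIST test set, reported as
+the SDA0/SDA1-1 entry of Table~\ref{tab:perturbation-effect} in the appendix,
+corresponds to
+\[
+ \frac{\mathrm{err}(\mathrm{SDA0})}{\mathrm{err}(\mathrm{SDA1})} - 1
+ = \frac{23.7\%}{17.1\%} - 1 \approx 38\%.
+\]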
 
-\begin{table}
-\caption{Overall comparison of error rates ($\pm$ std.err.) on 62 character classes (10 digits +
-26 lower + 26 upper), except for last columns -- digits only, between deep architecture with pre-training
-(SDA=Stacked Denoising Autoencoder) and ordinary shallow architecture
-(MLP=Multi-Layer Perceptron). The models shown are all trained using perturbed data (NISTP or P07)
-and using a validation set to select hyper-parameters and other training choices.
-\{SDA,MLP\}0 are trained on NIST,
-\{SDA,MLP\}1 are trained on NISTP, and \{SDA,MLP\}2 are trained on P07.
-The human error rate on digits is a lower bound because it does not count digits that were
-recognized as letters. For comparison, the results found in the literature
-on NIST digits classification using the same test set are included.}
-\label{tab:sda-vs-mlp-vs-humans}
-\begin{center}
-\begin{tabular}{|l|r|r|r|r|} \hline
- & NIST test & NISTP test & P07 test & NIST test digits \\ \hline
-Humans& 18.2\% $\pm$.1\% & 39.4\%$\pm$.1\% & 46.9\%$\pm$.1\% & $1.4\%$ \\ \hline
-SDA0 & 23.7\% $\pm$.14\% & 65.2\%$\pm$.34\% & 97.45\%$\pm$.06\% & 2.7\% $\pm$.14\%\\ \hline
-SDA1 & 17.1\% $\pm$.13\% & 29.7\%$\pm$.3\% & 29.7\%$\pm$.3\% & 1.4\% $\pm$.1\%\\ \hline
-SDA2 & 18.7\% $\pm$.13\% & 33.6\%$\pm$.3\% & 39.9\%$\pm$.17\% & 1.7\% $\pm$.1\%\\ \hline
-MLP0 & 24.2\% $\pm$.15\% & 68.8\%$\pm$.33\% & 78.70\%$\pm$.14\% & 3.45\% $\pm$.15\% \\ \hline
-MLP1 & 23.0\% $\pm$.15\% & 41.8\%$\pm$.35\% & 90.4\%$\pm$.1\% & 3.85\% $\pm$.16\% \\ \hline
-MLP2 & 24.3\% $\pm$.15\% & 46.0\%$\pm$.35\% & 54.7\%$\pm$.17\% & 4.85\% $\pm$.18\% \\ \hline
-\citep{Granger+al-2007} & & & & 4.95\% $\pm$.18\% \\ \hline
-\citep{Cortes+al-2000} & & & & 3.71\% $\pm$.16\% \\ \hline
-\citep{Oliveira+al-2002} & & & & 2.4\% $\pm$.13\% \\ \hline
-\citep{Milgram+al-2005} & & & & 2.1\% $\pm$.12\% \\ \hline
-\end{tabular}
-\end{center}
-\end{table}
 
 \begin{figure}[h]
 \resizebox{.99\textwidth}{!}{\includegraphics{images/error_rates_charts.pdf}}\\
@@ -518,35 +520,15 @@
 \label{fig:error-rates-charts}
 \end{figure}
 
-\vspace*{-1mm}
-\subsection{Perturbed Training Data More Helpful for SDAE}
-\vspace*{-1mm}
+%\vspace*{-1mm}
+%\subsection{Perturbed Training Data More Helpful for SDAE}
+%\vspace*{-1mm}
 
-\begin{table}
-\caption{Relative change in error rates due to the use of perturbed training data,
-either using NISTP, for the MLP1/SDA1 models, or using P07, for the MLP2/SDA2 models.
-A positive value indicates that training on the perturbed data helped for the
-given test set (the first 3 columns on the 62-class tasks and the last one is
-on the clean 10-class digits). Clearly, the deep learning models did benefit more
-from perturbed training data, even when testing on clean data, whereas the MLP
-trained on perturbed data performed worse on the clean digits and about the same
-on the clean characters. }
-\label{tab:perturbation-effect}
-\begin{center}
-\begin{tabular}{|l|r|r|r|r|} \hline
- & NIST test & NISTP test & P07 test & NIST test digits \\ \hline
-SDA0/SDA1-1 & 38\% & 84\% & 228\% & 93\% \\ \hline
-SDA0/SDA2-1 & 27\% & 94\% & 144\% & 59\% \\ \hline
-MLP0/MLP1-1 & 5.2\% & 65\% & -13\% & -10\% \\ \hline
-MLP0/MLP2-1 & -0.4\% & 49\% & 44\% & -29\% \\ \hline
-\end{tabular}
-\end{center}
-\end{table}
+%\vspace*{-1mm}
+%\subsection{Multi-Task Learning Effects}
+%\vspace*{-1mm}
 
-\vspace*{-1mm}
-\subsection{Multi-Task Learning Effects}
-\vspace*{-1mm}
-
+\iffalse
 As previously seen, the SDA is better able to benefit from the
 transformations applied to the data than the MLP. In this experiment we
 define three tasks: recognizing digits (knowing that the input is a digit),
@@ -569,28 +551,7 @@
 On the other hand the SDA benefitted from the multi-task setting, with
 relative error rate improvements of 27\%, 15\% and 13\% respectively for
 digits, lower and upper case characters, as shown in Table~\ref{tab:multi-task}.
-
-\begin{table}
-\caption{Test error rates and relative change in error rates due to the use of
-a multi-task setting, i.e., training on each task in isolation vs training
-for all three tasks together, for MLPs vs SDAs. The SDA benefits much
-more from the multi-task setting. All experiments on only on the
-unperturbed NIST data, using validation error for model selection.
-Relative improvement is 1 - single-task error / multi-task error.}
-\label{tab:multi-task}
-\begin{center}
-\begin{tabular}{|l|r|r|r|} \hline
- & single-task & multi-task & relative \\
 & setting & setting & improvement \\ \hline
-MLP-digits & 3.77\% & 3.99\% & 5.6\% \\ \hline
-MLP-lower & 17.4\% & 16.8\% & -4.1\% \\ \hline
-MLP-upper & 7.84\% & 7.54\% & -3.6\% \\ \hline
-SDA-digits & 2.6\% & 3.56\% & 27\% \\ \hline
-SDA-lower & 12.3\% & 14.4\% & 15\% \\ \hline
-SDA-upper & 5.93\% & 6.78\% & 13\% \\ \hline
-\end{tabular}
-\end{center}
-\end{table}
+\fi
 
 
 \begin{figure}[h]
@@ -641,4 +602,84 @@
 %\bibliographystyle{apalike}
 }
+\newpage
+
+\centerline{APPENDIX FOR {\bf Deep Self-Taught Learning for Handwritten Character Recognition}}
+
+\vspace*{1cm}
+
+\begin{table}[h]
+\caption{Overall comparison of error rates ($\pm$ std.err.) on 62 character classes (10 digits +
+26 lower + 26 upper), except for the last column -- digits only, between a deep architecture with pre-training
+(SDA=Stacked Denoising Autoencoder) and an ordinary shallow architecture
+(MLP=Multi-Layer Perceptron). The models shown are all trained using perturbed data (NISTP or P07)
+and using a validation set to select hyper-parameters and other training choices.
+\{SDA,MLP\}0 are trained on NIST,
+\{SDA,MLP\}1 are trained on NISTP, and \{SDA,MLP\}2 are trained on P07.
+The human error rate on digits is a lower bound because it does not count digits that were
+recognized as letters. For comparison, the results found in the literature
+on NIST digits classification using the same test set are included.}
+\label{tab:sda-vs-mlp-vs-humans}
+\begin{center}
+\begin{tabular}{|l|r|r|r|r|} \hline
+ & NIST test & NISTP test & P07 test & NIST test digits \\ \hline
+Humans& 18.2\% $\pm$.1\% & 39.4\%$\pm$.1\% & 46.9\%$\pm$.1\% & $1.4\%$ \\ \hline
+SDA0 & 23.7\% $\pm$.14\% & 65.2\%$\pm$.34\% & 97.45\%$\pm$.06\% & 2.7\% $\pm$.14\%\\ \hline
+SDA1 & 17.1\% $\pm$.13\% & 29.7\%$\pm$.3\% & 29.7\%$\pm$.3\% & 1.4\% $\pm$.1\%\\ \hline
+SDA2 & 18.7\% $\pm$.13\% & 33.6\%$\pm$.3\% & 39.9\%$\pm$.17\% & 1.7\% $\pm$.1\%\\ \hline
+MLP0 & 24.2\% $\pm$.15\% & 68.8\%$\pm$.33\% & 78.70\%$\pm$.14\% & 3.45\% $\pm$.15\% \\ \hline
+MLP1 & 23.0\% $\pm$.15\% & 41.8\%$\pm$.35\% & 90.4\%$\pm$.1\% & 3.85\% $\pm$.16\% \\ \hline
+MLP2 & 24.3\% $\pm$.15\% & 46.0\%$\pm$.35\% & 54.7\%$\pm$.17\% & 4.85\% $\pm$.18\% \\ \hline
+\citep{Granger+al-2007} & & & & 4.95\% $\pm$.18\% \\ \hline
+\citep{Cortes+al-2000} & & & & 3.71\% $\pm$.16\% \\ \hline
+\citep{Oliveira+al-2002} & & & & 2.4\% $\pm$.13\% \\ \hline
+\citep{Milgram+al-2005} & & & & 2.1\% $\pm$.12\% \\ \hline
+\end{tabular}
+\end{center}
+\end{table}
+
+\begin{table}[h]
+\caption{Relative change in error rates due to the use of perturbed training data,
+either using NISTP, for the MLP1/SDA1 models, or using P07, for the MLP2/SDA2 models.
+A positive value indicates that training on the perturbed data helped for the
+given test set (the first 3 columns are for the 62-class task and the last one
+for the clean 10-class digits). Clearly, the deep learning models did benefit more
+from perturbed training data, even when testing on clean data, whereas the MLP
+trained on perturbed data performed worse on the clean digits and about the same
+on the clean characters.}
+\label{tab:perturbation-effect}
+\begin{center}
+\begin{tabular}{|l|r|r|r|r|} \hline
+ & NIST test & NISTP test & P07 test & NIST test digits \\ \hline
+SDA0/SDA1-1 & 38\% & 84\% & 228\% & 93\% \\ \hline
+SDA0/SDA2-1 & 27\% & 94\% & 144\% & 59\% \\ \hline
+MLP0/MLP1-1 & 5.2\% & 65\% & -13\% & -10\% \\ \hline
+MLP0/MLP2-1 & -0.4\% & 49\% & 44\% & -29\% \\ \hline
+\end{tabular}
+\end{center}
+\end{table}
+
+\begin{table}[h]
+\caption{Test error rates and relative change in error rates due to the use of
+a multi-task setting, i.e., training on each task in isolation vs. training
+for all three tasks together, for MLPs vs. SDAs. The SDA benefits much
+more from the multi-task setting. All experiments are on the
+unperturbed NIST data only, using validation error for model selection.
+Relative improvement is 1 - single-task error / multi-task error.}
+\label{tab:multi-task}
+\begin{center}
+\begin{tabular}{|l|r|r|r|} \hline
+ & single-task & multi-task & relative \\
+ & setting & setting & improvement \\ \hline
+MLP-digits & 3.77\% & 3.99\% & 5.6\% \\ \hline
+MLP-lower & 17.4\% & 16.8\% & -4.1\% \\ \hline
+MLP-upper & 7.84\% & 7.54\% & -3.6\% \\ \hline
+SDA-digits & 2.6\% & 3.56\% & 27\% \\ \hline
+SDA-lower & 12.3\% & 14.4\% & 15\% \\ \hline
+SDA-upper & 5.93\% & 6.78\% & 13\% \\ \hline
+\end{tabular}
+\end{center}
+\end{table}
+
+
 \end{document}