diff writeup/nips2010_submission.tex @ 485:6beaf3328521

tables removed
author Yoshua Bengio <bengioy@iro.umontreal.ca>
date Mon, 31 May 2010 21:50:00 -0400
parents 9a757d565e46
children 877af97ee193 6c9ff48e15cd
line wrap: on
line diff
--- a/writeup/nips2010_submission.tex	Mon May 31 20:42:22 2010 -0400
+++ b/writeup/nips2010_submission.tex	Mon May 31 21:50:00 2010 -0400
@@ -461,56 +461,58 @@
 \vspace*{-1mm}
 \section{Experimental Results}
 
-\vspace*{-1mm}
-\subsection{SDA vs MLP vs Humans}
-\vspace*{-1mm}
+%\vspace*{-1mm}
+%\subsection{SDA vs MLP vs Humans}
+%\vspace*{-1mm}
 
-We compare here the best MLP (according to validation set error) that we found against
+We compare the best MLP (according to validation set error) that we found against
 the best SDA (again according to validation set error), along with a precise estimate
 of human performance obtained via Amazon's Mechanical Turk (AMT)
-service\footnote{http://mturk.com}. AMT users are paid small amounts
-of money to perform tasks for which human intelligence is required.
-Mechanical Turk has been used extensively in natural language
-processing \citep{SnowEtAl2008} and vision
-\citep{SorokinAndForsyth2008,whitehill09}. AMT users where presented
+service\footnote{http://mturk.com}. 
+%AMT users are paid small amounts
+%of money to perform tasks for which human intelligence is required.
+%Mechanical Turk has been used extensively in natural language
+%processing \citep{SnowEtAl2008} and vision
+%\citep{SorokinAndForsyth2008,whitehill09}. 
+AMT users were presented
 with 10 character images and asked to type the 10 corresponding ASCII
 characters. They were forced to make a hard choice among the
 62 or 10 character classes (all classes or digits only). 
 Three users classified each image, allowing
 us to estimate inter-human variability (shown as +/- in parentheses below).
 
-Figure~\ref{fig:error-rates-charts} summarizes the results obtained.
-More detailed results and tables can be found in the appendix.
+Figure~\ref{fig:error-rates-charts} summarizes the results obtained,
+comparing Humans, three MLPs (MLP0, MLP1, MLP2) and three SDAs (SDA0, SDA1,
+SDA2), along with previously published results on the NIST special database 19
+digits test set, based respectively on ARTMAP neural
+networks~\citep{Granger+al-2007}, fast nearest-neighbor
+search~\citep{Cortes+al-2000}, MLPs~\citep{Oliveira+al-2002},
+and SVMs~\citep{Milgram+al-2005}.
+More detailed and complete numerical results (figures and tables)
+can be found in the appendix. The three kinds of models differ in the
+training sets used: NIST only (MLP0, SDA0), NISTP (MLP1, SDA1),
+or P07 (MLP2, SDA2). The deep learner not only outperformed
+the shallow ones and the previously published results,
+but also reached human performance on both the 62-class
+task and the 10-class (digits) task. In addition, as shown
+in the left panel of Figure~\ref{fig:improvements-charts},
+the relative improvement in error rate brought by
+self-taught learning is greater for the SDA. The left
+side shows the improvement in the clean NIST test set error
+brought by the use of out-of-distribution
+examples (i.e., the perturbed examples from NISTP
+or P07). The right side of Figure~\ref{fig:improvements-charts}
+shows the relative improvement brought by the use
+of a multi-task setting, in which the same model is trained
+on more classes than the target classes of interest
+(i.e., training with all 62 classes when the target classes
+are respectively the digits, lower-case, or upper-case
+characters). Again, whereas the gain is marginal
+or negative for the MLP, it is substantial for the SDA.
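+
+As a concrete illustration (assuming, as the row labels of
+Table~\ref{tab:perturbation-effect} in the appendix suggest, that the
+relative improvement is the ratio of error rates minus one), the gain
+brought by self-taught learning to the SDA on the clean NIST test set is
+\[
+\frac{\mathrm{err}(\mathrm{SDA0})}{\mathrm{err}(\mathrm{SDA1})} - 1
+ = \frac{23.7\%}{17.1\%} - 1 \approx 38\%.
+\]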
 
-\begin{table}
-\caption{Overall comparison of error rates ($\pm$ std.err.) on 62 character classes (10 digits +
-26 lower + 26 upper), except for last columns -- digits only, between deep architecture with pre-training
-(SDA=Stacked Denoising Autoencoder) and ordinary shallow architecture 
-(MLP=Multi-Layer Perceptron). The models shown are all trained using perturbed data (NISTP or P07)
-and using a validation set to select hyper-parameters and other training choices. 
-\{SDA,MLP\}0 are trained on NIST,
-\{SDA,MLP\}1 are trained on NISTP, and \{SDA,MLP\}2 are trained on P07.
-The human error rate on digits is a lower bound because it does not count digits that were
-recognized as letters. For comparison, the results found in the literature
-on NIST digits classification using the same test set are included.}
-\label{tab:sda-vs-mlp-vs-humans}
-\begin{center}
-\begin{tabular}{|l|r|r|r|r|} \hline
-      & NIST test          & NISTP test       & P07 test       & NIST test digits   \\ \hline
-Humans&   18.2\% $\pm$.1\%   &  39.4\%$\pm$.1\%   &  46.9\%$\pm$.1\%  &  $1.4\%$ \\ \hline 
-SDA0   &  23.7\% $\pm$.14\%  &  65.2\%$\pm$.34\%  & 97.45\%$\pm$.06\%  & 2.7\% $\pm$.14\%\\ \hline 
-SDA1   &  17.1\% $\pm$.13\%  &  29.7\%$\pm$.3\%  & 29.7\%$\pm$.3\%  & 1.4\% $\pm$.1\%\\ \hline 
-SDA2   &  18.7\% $\pm$.13\%  &  33.6\%$\pm$.3\%  & 39.9\%$\pm$.17\%  & 1.7\% $\pm$.1\%\\ \hline 
-MLP0   &  24.2\% $\pm$.15\%  & 68.8\%$\pm$.33\%  & 78.70\%$\pm$.14\%  & 3.45\% $\pm$.15\% \\ \hline 
-MLP1   &  23.0\% $\pm$.15\%  &  41.8\%$\pm$.35\%  & 90.4\%$\pm$.1\%  & 3.85\% $\pm$.16\% \\ \hline 
-MLP2   &  24.3\% $\pm$.15\%  &  46.0\%$\pm$.35\%  & 54.7\%$\pm$.17\%  & 4.85\% $\pm$.18\% \\ \hline 
-\citep{Granger+al-2007} &     &                    &                   & 4.95\% $\pm$.18\% \\ \hline
-\citep{Cortes+al-2000} &      &                    &                   & 3.71\% $\pm$.16\% \\ \hline
-\citep{Oliveira+al-2002} &    &                    &                   & 2.4\% $\pm$.13\% \\ \hline
-\citep{Milgram+al-2005} &      &                    &                   & 2.1\% $\pm$.12\% \\ \hline
-\end{tabular}
-\end{center}
-\end{table}
 
 \begin{figure}[h]
 \resizebox{.99\textwidth}{!}{\includegraphics{images/error_rates_charts.pdf}}\\
@@ -518,35 +520,15 @@
 \label{fig:error-rates-charts}
 \end{figure}
 
-\vspace*{-1mm}
-\subsection{Perturbed Training Data More Helpful for SDAE}
-\vspace*{-1mm}
+%\vspace*{-1mm}
+%\subsection{Perturbed Training Data More Helpful for SDAE}
+%\vspace*{-1mm}
 
-\begin{table}
-\caption{Relative change in error rates due to the use of perturbed training data,
-either using NISTP, for the MLP1/SDA1 models, or using P07, for the MLP2/SDA2 models.
-A positive value indicates that training on the perturbed data helped for the
-given test set (the first 3 columns on the 62-class tasks and the last one is
-on the clean 10-class digits). Clearly, the deep learning models did benefit more
-from perturbed training data, even when testing on clean data, whereas the MLP
-trained on perturbed data performed worse on the clean digits and about the same
-on the clean characters. }
-\label{tab:perturbation-effect}
-\begin{center}
-\begin{tabular}{|l|r|r|r|r|} \hline
-      & NIST test          & NISTP test      & P07 test       & NIST test digits   \\ \hline
-SDA0/SDA1-1   &  38\%      &  84\%           & 228\%          &  93\% \\ \hline 
-SDA0/SDA2-1   &  27\%      &  94\%           & 144\%          &  59\% \\ \hline 
-MLP0/MLP1-1   &  5.2\%     &  65\%           & -13\%          & -10\%  \\ \hline 
-MLP0/MLP2-1   &  -0.4\%    &  49\%           & 44\%           & -29\% \\ \hline 
-\end{tabular}
-\end{center}
-\end{table}
+%\vspace*{-1mm}
+%\subsection{Multi-Task Learning Effects}
+%\vspace*{-1mm}
 
-\vspace*{-1mm}
-\subsection{Multi-Task Learning Effects}
-\vspace*{-1mm}
-
+\iffalse
 As previously seen, the SDA is better able to benefit from the
 transformations applied to the data than the MLP. In this experiment we
 define three tasks: recognizing digits (knowing that the input is a digit),
@@ -569,28 +551,7 @@
 On the other hand the SDA benefitted from the multi-task setting, with relative
 error rate improvements of 27\%, 15\% and 13\% respectively for digits,
 lower and upper case characters, as shown in Table~\ref{tab:multi-task}.
-
-\begin{table}
-\caption{Test error rates and relative change in error rates due to the use of
-a multi-task setting, i.e., training on each task in isolation vs training
-for all three tasks together, for MLPs vs SDAs. The SDA benefits much
-more from the multi-task setting. All experiments on only on the
-unperturbed NIST data, using validation error for model selection.
-Relative improvement is 1 - single-task error / multi-task error.}
-\label{tab:multi-task}
-\begin{center}
-\begin{tabular}{|l|r|r|r|} \hline
-             & single-task  & multi-task  & relative \\ 
-             & setting      & setting     & improvement \\ \hline
-MLP-digits   &  3.77\%      &  3.99\%     & 5.6\%   \\ \hline 
-MLP-lower   &  17.4\%      &  16.8\%     &  -4.1\%    \\ \hline 
-MLP-upper   &  7.84\%     &  7.54\%      & -3.6\%    \\ \hline 
-SDA-digits   &  2.6\%      &  3.56\%     & 27\%    \\ \hline 
-SDA-lower   &  12.3\%      &  14.4\%    & 15\%    \\ \hline 
-SDA-upper   &  5.93\%     &  6.78\%      & 13\%    \\ \hline 
-\end{tabular}
-\end{center}
-\end{table}
+\fi
 
 
 \begin{figure}[h]
@@ -641,4 +602,84 @@
 %\bibliographystyle{apalike}
 }
 
+\newpage
+
+\centerline{APPENDIX FOR {\bf Deep Self-Taught Learning for Handwritten Character Recognition}}
+
+\vspace*{1cm}
+
+\begin{table}[h]
+\caption{Overall comparison of error rates ($\pm$ std.err.) on 62 character classes (10 digits +
+26 lower + 26 upper), except for the last column -- digits only, between a deep architecture with pre-training
+(SDA=Stacked Denoising Autoencoder) and an ordinary shallow architecture 
+(MLP=Multi-Layer Perceptron). The models shown were trained on either unperturbed (NIST) or perturbed data (NISTP or P07),
+using a validation set to select hyper-parameters and other training choices. 
+\{SDA,MLP\}0 are trained on NIST,
+\{SDA,MLP\}1 are trained on NISTP, and \{SDA,MLP\}2 are trained on P07.
+The human error rate on digits is a lower bound because it does not count digits that were
+recognized as letters. For comparison, the results found in the literature
+on NIST digits classification using the same test set are included.}
+\label{tab:sda-vs-mlp-vs-humans}
+\begin{center}
+\begin{tabular}{|l|r|r|r|r|} \hline
+      & NIST test          & NISTP test       & P07 test       & NIST test digits   \\ \hline
+Humans&   18.2\% $\pm$.1\%   &  39.4\%$\pm$.1\%   &  46.9\%$\pm$.1\%  &  $1.4\%$ \\ \hline 
+SDA0   &  23.7\% $\pm$.14\%  &  65.2\%$\pm$.34\%  & 97.45\%$\pm$.06\%  & 2.7\% $\pm$.14\%\\ \hline 
+SDA1   &  17.1\% $\pm$.13\%  &  29.7\%$\pm$.3\%  & 29.7\%$\pm$.3\%  & 1.4\% $\pm$.1\%\\ \hline 
+SDA2   &  18.7\% $\pm$.13\%  &  33.6\%$\pm$.3\%  & 39.9\%$\pm$.17\%  & 1.7\% $\pm$.1\%\\ \hline 
+MLP0   &  24.2\% $\pm$.15\%  & 68.8\%$\pm$.33\%  & 78.70\%$\pm$.14\%  & 3.45\% $\pm$.15\% \\ \hline 
+MLP1   &  23.0\% $\pm$.15\%  &  41.8\%$\pm$.35\%  & 90.4\%$\pm$.1\%  & 3.85\% $\pm$.16\% \\ \hline 
+MLP2   &  24.3\% $\pm$.15\%  &  46.0\%$\pm$.35\%  & 54.7\%$\pm$.17\%  & 4.85\% $\pm$.18\% \\ \hline 
+\citep{Granger+al-2007} &     &                    &                   & 4.95\% $\pm$.18\% \\ \hline
+\citep{Cortes+al-2000} &      &                    &                   & 3.71\% $\pm$.16\% \\ \hline
+\citep{Oliveira+al-2002} &    &                    &                   & 2.4\% $\pm$.13\% \\ \hline
+\citep{Milgram+al-2005} &      &                    &                   & 2.1\% $\pm$.12\% \\ \hline
+\end{tabular}
+\end{center}
+\end{table}
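+
+Assuming the $\pm$ std.err. values above are the usual binomial standard
+errors for an error rate $\hat{p}$ measured on a test set of $n$ examples
+(an assumption about how they were computed, not stated in the table itself),
+they would be obtained as
+\[
+\mathrm{SE}(\hat{p}) = \sqrt{\frac{\hat{p}\,(1-\hat{p})}{n}}.
+\]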
+
+\begin{table}[h]
+\caption{Relative change in error rates due to the use of perturbed training data,
+either using NISTP, for the MLP1/SDA1 models, or using P07, for the MLP2/SDA2 models.
+A positive value indicates that training on the perturbed data helped for the
+given test set (the first 3 columns are for the 62-class task and the last one
+for the clean 10-class digits). Clearly, the deep learning models did benefit more
+from perturbed training data, even when testing on clean data, whereas the MLP
+trained on perturbed data performed worse on the clean digits and about the same
+on the clean characters. }
+\label{tab:perturbation-effect}
+\begin{center}
+\begin{tabular}{|l|r|r|r|r|} \hline
+      & NIST test          & NISTP test      & P07 test       & NIST test digits   \\ \hline
+SDA0/SDA1-1   &  38\%      &  84\%           & 228\%          &  93\% \\ \hline 
+SDA0/SDA2-1   &  27\%      &  94\%           & 144\%          &  59\% \\ \hline 
+MLP0/MLP1-1   &  5.2\%     &  65\%           & -13\%          & -10\%  \\ \hline 
+MLP0/MLP2-1   &  -0.4\%    &  49\%           & 44\%           & -29\% \\ \hline 
+\end{tabular}
+\end{center}
+\end{table}
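+
+For instance, the MLP0/MLP2$-$1 entry on the clean NIST test set corresponds
+to the error rates of Table~\ref{tab:sda-vs-mlp-vs-humans}:
+\[
+\frac{\mathrm{err}(\mathrm{MLP0})}{\mathrm{err}(\mathrm{MLP2})} - 1
+ = \frac{24.2\%}{24.3\%} - 1 \approx -0.4\%,
+\]
+i.e., training the MLP on P07 left its clean NIST performance essentially
+unchanged.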
+
+\begin{table}[h]
+\caption{Test error rates and relative change in error rates due to the use of
+a multi-task setting, i.e., training on each task in isolation vs training
+for all three tasks together, for MLPs vs SDAs. The SDA benefits much
+more from the multi-task setting. All experiments are on the
+unperturbed NIST data only, using validation error for model selection.
+Relative improvement is 1 - single-task error / multi-task error.}
+\label{tab:multi-task}
+\begin{center}
+\begin{tabular}{|l|r|r|r|} \hline
+             & single-task  & multi-task  & relative \\ 
+             & setting      & setting     & improvement \\ \hline
+MLP-digits   &  3.77\%      &  3.99\%     & 5.6\%   \\ \hline 
+MLP-lower   &  17.4\%      &  16.8\%     &  -4.1\%    \\ \hline 
+MLP-upper   &  7.84\%     &  7.54\%      & -3.6\%    \\ \hline 
+SDA-digits   &  2.6\%      &  3.56\%     & 27\%    \\ \hline 
+SDA-lower   &  12.3\%      &  14.4\%    & 15\%    \\ \hline 
+SDA-upper   &  5.93\%     &  6.78\%      & 13\%    \\ \hline 
+\end{tabular}
+\end{center}
+\end{table}
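+
+As a worked example of the relative improvement defined in the caption, the
+SDA-digits entry is obtained as
+\[
+1 - \frac{2.6\%}{3.56\%} \approx 27\%.
+\]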
+
+
 \end{document}