% view writeup/nips2010_submission_supplementary.tex @ 493:a194ce5a4249
%
% difference stat. sign.
% author Yoshua Bengio <bengioy@iro.umontreal.ca>
% date Tue, 01 Jun 2010 07:55:38 -0400
% parents 6c9ff48e15cd
% children bf481414ba9c
% line wrap: on
% line source

\documentclass{article} % For LaTeX2e
\usepackage{nips10submit_e,times}

\usepackage{amsthm,amsmath,amssymb,bbold,bbm} 
\usepackage{algorithm,algorithmic}
\usepackage[utf8]{inputenc}
\usepackage{graphicx,subfigure}
\usepackage[numbers]{natbib}

\title{Deep Self-Taught Learning for Handwritten Character Recognition\\
\emph{Supplementary Material}}

\begin{document}

\maketitle

These tables correspond to Figures~3 and~4 of the main paper and contain the raw error rates for each model and dataset considered.

\begin{table}[h]
\caption{Overall comparison of error rates ($\pm$ std.err.) on 62 character classes (10 digits +
26 lower + 26 upper), except for the last column (digits only), between a deep architecture with pre-training
(SDA=Stacked Denoising Autoencoder) and an ordinary shallow architecture 
(MLP=Multi-Layer Perceptron). The models shown are trained either on the clean data (NIST) or on
perturbed data (NISTP or P07), using a validation set to select hyper-parameters and other training choices. 
\{SDA,MLP\}0 are trained on NIST,
\{SDA,MLP\}1 are trained on NISTP, and \{SDA,MLP\}2 are trained on P07.
The human error rate on digits is a lower bound because it does not count digits that were
recognized as letters. For comparison, the results found in the literature
on NIST digits classification using the same test set are included.}
\label{tab:sda-vs-mlp-vs-humans}
\begin{center}
\begin{tabular}{|l|r|r|r|r|} \hline
      & NIST test          & NISTP test       & P07 test       & NIST test digits   \\ \hline
Humans&   18.2\% $\pm$.1\%   &  39.4\%$\pm$.1\%   &  46.9\%$\pm$.1\%  &  1.4\% \\ \hline 
SDA0   &  23.7\% $\pm$.14\%  &  65.2\%$\pm$.34\%  & 97.45\%$\pm$.06\%  & 2.7\% $\pm$.14\%\\ \hline 
SDA1   &  17.1\% $\pm$.13\%  &  29.7\%$\pm$.3\%  & 29.7\%$\pm$.3\%  & 1.4\% $\pm$.1\%\\ \hline 
SDA2   &  18.7\% $\pm$.13\%  &  33.6\%$\pm$.3\%  & 39.9\%$\pm$.17\%  & 1.7\% $\pm$.1\%\\ \hline 
MLP0   &  24.2\% $\pm$.15\%  & 68.8\%$\pm$.33\%  & 78.70\%$\pm$.14\%  & 3.45\% $\pm$.15\% \\ \hline 
MLP1   &  23.0\% $\pm$.15\%  &  41.8\%$\pm$.35\%  & 90.4\%$\pm$.1\%  & 3.85\% $\pm$.16\% \\ \hline 
MLP2   &  24.3\% $\pm$.15\%  &  46.0\%$\pm$.35\%  & 54.7\%$\pm$.17\%  & 4.85\% $\pm$.18\% \\ \hline 
\citep{Granger+al-2007} &     &                    &                   & 4.95\% $\pm$.18\% \\ \hline
\citep{Cortes+al-2000} &      &                    &                   & 3.71\% $\pm$.16\% \\ \hline
\citep{Oliveira+al-2002} &    &                    &                   & 2.4\% $\pm$.13\% \\ \hline
\citep{Milgram+al-2005} &      &                    &                   & 2.1\% $\pm$.12\% \\ \hline
\end{tabular}
\end{center}
\end{table}

\begin{table}[h]
\caption{Relative change in error rates due to the use of perturbed training data,
either using NISTP, for the MLP1/SDA1 models, or using P07, for the MLP2/SDA2 models.
A positive value indicates that training on the perturbed data helped for the
given test set (the first 3 columns are on the 62-class tasks and the last one is
on the clean 10-class digits). Clearly, the deep learning models did benefit more
from perturbed training data, even when testing on clean data, whereas the MLP
trained on perturbed data performed worse on the clean digits and about the same
on the clean characters.}
\label{tab:perturbation-effect}
\begin{center}
\begin{tabular}{|l|r|r|r|r|} \hline
      & NIST test          & NISTP test      & P07 test       & NIST test digits   \\ \hline
SDA0/SDA1 $-$ 1   &  38\%      &  84\%           & 228\%          &  93\% \\ \hline 
SDA0/SDA2 $-$ 1   &  27\%      &  94\%           & 144\%          &  59\% \\ \hline 
MLP0/MLP1 $-$ 1   &  5.2\%     &  65\%           & $-$13\%        & $-$10\%  \\ \hline 
MLP0/MLP2 $-$ 1   &  $-$0.4\%  &  49\%           & 44\%           & $-$29\% \\ \hline 
\end{tabular}
\end{center}
\end{table}

\begin{table}[h]
\caption{Test error rates and relative change in error rates due to the use of
a multi-task setting, i.e., training on each task in isolation vs training
for all three tasks together, for MLPs vs SDAs. The SDA benefits much
more from the multi-task setting. All experiments are on the
unperturbed NIST data only, using validation error for model selection.
Relative improvement is $1 - \text{single-task error} / \text{multi-task error}$.}
\label{tab:multi-task}
\begin{center}
\begin{tabular}{|l|r|r|r|} \hline
             & single-task  & multi-task  & relative \\ 
             & setting      & setting     & improvement \\ \hline
MLP-digits   &  3.77\%      &  3.99\%     & 5.6\%   \\ \hline 
MLP-lower   &  17.4\%      &  16.8\%     &  $-$4.1\%    \\ \hline 
MLP-upper   &  7.84\%     &  7.54\%      & $-$3.6\%    \\ \hline 
SDA-digits   &  2.6\%      &  3.56\%     & 27\%    \\ \hline 
SDA-lower   &  12.3\%      &  14.4\%    & 15\%    \\ \hline 
SDA-upper   &  5.93\%     &  6.78\%      & 13\%    \\ \hline 
\end{tabular}
\end{center}
\end{table}

{\small
\bibliography{strings,ml,aigaion,specials}
%\bibliographystyle{plainnat}
\bibliographystyle{unsrtnat}
%\bibliographystyle{apalike}
}

\end{document}