Mercurial > ift6266

\documentclass{article} % For LaTeX2e
\usepackage{nips10submit_e,times}

\usepackage{amsthm,amsmath,amssymb,bbold,bbm}
\usepackage{algorithm,algorithmic}
\usepackage[utf8]{inputenc}
\usepackage{graphicx,subfigure}
\usepackage[numbers]{natbib}

\title{Deep Self-Taught Learning for Handwritten Character Recognition\\
\emph{Supplementary Material}}

\begin{document}

\maketitle

\section*{Appendix I: Full results}

These tables correspond to Figures 2 and 3 and contain the raw error rates for each model and dataset considered.
They also contain additional data such as test errors on P07 and standard errors.

\begin{table}[ht]
\caption{Overall comparison of error rates ($\pm$ std.err.) on 62 character classes (10 digits +
26 lower + 26 upper), except for last columns -- digits only, between deep architecture with pre-training
(SDA=Stacked Denoising Autoencoder) and ordinary shallow architecture
(MLP=Multi-Layer Perceptron). The models shown are all trained using perturbed data (NISTP or P07)
and using a validation set to select hyper-parameters and other training choices.
\{SDA,MLP\}0 are trained on NIST,
\{SDA,MLP\}1 are trained on NISTP, and \{SDA,MLP\}2 are trained on P07.
The human error rate on digits is a lower bound because it does not count digits that were
recognized as letters. For comparison, the results found in the literature
on NIST digits classification using the same test set are included.}
\label{tab:sda-vs-mlp-vs-humans}
\begin{center}
\begin{tabular}{|l|r|r|r|r|} \hline
      & NIST test          & NISTP test       & P07 test       & NIST test digits   \\ \hline
Humans&   18.2\% $\pm$.1\%   &  39.4\%$\pm$.1\%   &  46.9\%$\pm$.1\%  &  $1.4\%$ \\ \hline
SDA0   &  23.7\% $\pm$.14\%  &  65.2\%$\pm$.34\%  & 97.45\%$\pm$.06\%  & 2.7\% $\pm$.14\%\\ \hline
SDA1   &  17.1\% $\pm$.13\%  &  29.7\%$\pm$.3\%  & 29.7\%$\pm$.3\%  & 1.4\% $\pm$.1\%\\ \hline
SDA2   &  18.7\% $\pm$.13\%  &  33.6\%$\pm$.3\%  & 39.9\%$\pm$.17\%  & 1.7\% $\pm$.1\%\\ \hline
MLP0   &  24.2\% $\pm$.15\%  & 68.8\%$\pm$.33\%  & 78.70\%$\pm$.14\%  & 3.45\% $\pm$.15\% \\ \hline
MLP1   &  23.0\% $\pm$.15\%  &  41.8\%$\pm$.35\%  & 90.4\%$\pm$.1\%  & 3.85\% $\pm$.16\% \\ \hline
MLP2   &  24.3\% $\pm$.15\%  &  46.0\%$\pm$.35\%  & 54.7\%$\pm$.17\%  & 4.85\% $\pm$.18\% \\ \hline
\citep{Granger+al-2007} &     &                    &                   & 4.95\% $\pm$.18\% \\ \hline
\citep{Cortes+al-2000} &      &                    &                   & 3.71\% $\pm$.16\% \\ \hline
\citep{Oliveira+al-2002} &    &                    &                   & 2.4\% $\pm$.13\% \\ \hline
\citep{Milgram+al-2005} &      &                    &                   & 2.1\% $\pm$.12\% \\ \hline
\end{tabular}
\end{center}
\end{table}

\begin{table}[ht]
\caption{Relative change in error rates due to the use of perturbed training data,
either using NISTP, for the MLP1/SDA1 models, or using P07, for the MLP2/SDA2 models.
A positive value indicates that training on the perturbed data helped for the
given test set (the first 3 columns on the 62-class tasks and the last one is
on the clean 10-class digits). Clearly, the deep learning models did benefit more
from perturbed training data, even when testing on clean data, whereas the MLP
trained on perturbed data performed worse on the clean digits and about the same
on the clean characters. }
\label{tab:perturbation-effect}
\begin{center}
\begin{tabular}{|l|r|r|r|r|} \hline
      & NIST test          & NISTP test      & P07 test       & NIST test digits   \\ \hline
SDA0/SDA1-1   &  38\%      &  84\%           & 228\%          &  93\% \\ \hline
SDA0/SDA2-1   &  27\%      &  94\%           & 144\%          &  59\% \\ \hline
MLP0/MLP1-1   &  5.2\%     &  65\%           & -13\%          & -10\%  \\ \hline
MLP0/MLP2-1   &  -0.4\%    &  49\%           & 44\%           & -29\% \\ \hline
\end{tabular}
\end{center}
\end{table}

\begin{table}[ht]
\caption{Test error rates and relative change in error rates due to the use of
a multi-task setting, i.e., training on each task in isolation vs training
for all three tasks together, for MLPs vs SDAs. The SDA benefits much
more from the multi-task setting. All experiments on only on the
unperturbed NIST data, using validation error for model selection.
Relative improvement is 1 - single-task error / multi-task error.}
\label{tab:multi-task}
\begin{center}
\begin{tabular}{|l|r|r|r|} \hline
             & single-task  & multi-task  & relative \\
             & setting      & setting     & improvement \\ \hline
MLP-digits   &  3.77\%      &  3.99\%     & 5.6\%   \\ \hline
MLP-lower   &  17.4\%      &  16.8\%     &  -4.1\%    \\ \hline
MLP-upper   &  7.84\%     &  7.54\%      & -3.6\%    \\ \hline
SDA-digits   &  2.6\%      &  3.56\%     & 27\%    \\ \hline
SDA-lower   &  12.3\%      &  14.4\%    & 15\%    \\ \hline
SDA-upper   &  5.93\%     &  6.78\%      & 13\%    \\ \hline
\end{tabular}
\end{center}
\end{table}


\newpage

\vspace*{10mm}
%{\small
\bibliography{strings,strings-short,strings-shorter,ift6266_ml,aigaion-shorter,specials}
%\bibliographystyle{plainnat}
\bibliographystyle{unsrtnat}
%\bibliographystyle{apalike}
%}

\end{document}
author	Yoshua Bengio <bengioy@iro.umontreal.ca>
date	Mon, 20 Dec 2010 11:54:35 -0500
parents	a7193b092b0a
children