Mercurial > ift6266
view writeup/nips2010_submission_supplementary.tex @ 612:21d53fd07f6e
reviews AISTATS
author | Yoshua Bengio <bengioy@iro.umontreal.ca> |
---|---|
date | Mon, 20 Dec 2010 11:54:35 -0500 |
parents | a7193b092b0a |
children |
line wrap: on
line source
\documentclass{article} % For LaTeX2e
% Supplementary material: full result tables (raw error rates per model and
% dataset) for "Deep Self-Taught Learning for Handwritten Character
% Recognition".
%
% NOTE(review): this source was recovered from a copy in which line breaks had
% been collapsed; the extent of each `%' comment below (in particular around
% the bibliography commands at the end) was reconstructed -- confirm against
% the repository version.
\usepackage{nips10submit_e,times}
\usepackage{amsthm,amsmath,amssymb,bbold,bbm}
\usepackage{algorithm,algorithmic}
\usepackage[utf8]{inputenc}
\usepackage{graphicx,subfigure}
\usepackage[numbers]{natbib}

\title{Deep Self-Taught Learning for Handwritten Character Recognition\\
\emph{Supplementary Material}}

\begin{document}

\maketitle

\section*{Appendix I: Full results}

These tables correspond to Figures 2 and 3 and contain the raw error rates for
each model and dataset considered.
They also contain additional data such as test errors on P07 and standard
errors.

% ---------------------------------------------------------------------------
% Table 1: absolute error rates, SDA vs. MLP vs. humans vs. literature.
% Columns: NIST / NISTP / P07 test sets on all 62 classes, then NIST digits.
% NOTE(review): the caption says all models are trained on perturbed data
% (NISTP or P07) but then lists SDA0/MLP0 as trained on NIST -- confirm the
% intended wording.
% ---------------------------------------------------------------------------
\begin{table}[ht]
\caption{Overall comparison of error rates ($\pm$ std.err.) on 62 character
classes (10 digits + 26 lower + 26 upper), except for the last column --
digits only, between deep architecture with pre-training (SDA=Stacked
Denoising Autoencoder) and ordinary shallow architecture (MLP=Multi-Layer
Perceptron). The models shown are all trained using perturbed data (NISTP or
P07) and using a validation set to select hyper-parameters and other training
choices. \{SDA,MLP\}0 are trained on NIST, \{SDA,MLP\}1 are trained on NISTP,
and \{SDA,MLP\}2 are trained on P07. The human error rate on digits is a lower
bound because it does not count digits that were recognized as letters.
For comparison, the results found in the literature on NIST digits
classification using the same test set are included.}
\label{tab:sda-vs-mlp-vs-humans}
\begin{center}
\begin{tabular}{|l|r|r|r|r|} \hline
 & NIST test & NISTP test & P07 test & NIST test digits \\ \hline
Humans & 18.2\% $\pm$ .1\% & 39.4\% $\pm$ .1\% & 46.9\% $\pm$ .1\% & 1.4\% \\ \hline
SDA0 & 23.7\% $\pm$ .14\% & 65.2\% $\pm$ .34\% & 97.45\% $\pm$ .06\% & 2.7\% $\pm$ .14\% \\ \hline
% NOTE(review): SDA1 has identical NISTP and P07 entries (29.7% +- .3%), yet
% the relative-change table reports 84% vs 228% for these two columns; only
% the P07 value is consistent with 97.45/29.7 - 1. Possibly a copy/paste of
% the P07 cell into the NISTP cell -- verify against the raw results.
SDA1 & 17.1\% $\pm$ .13\% & 29.7\% $\pm$ .3\% & 29.7\% $\pm$ .3\% & 1.4\% $\pm$ .1\% \\ \hline
SDA2 & 18.7\% $\pm$ .13\% & 33.6\% $\pm$ .3\% & 39.9\% $\pm$ .17\% & 1.7\% $\pm$ .1\% \\ \hline
MLP0 & 24.2\% $\pm$ .15\% & 68.8\% $\pm$ .33\% & 78.70\% $\pm$ .14\% & 3.45\% $\pm$ .15\% \\ \hline
MLP1 & 23.0\% $\pm$ .15\% & 41.8\% $\pm$ .35\% & 90.4\% $\pm$ .1\% & 3.85\% $\pm$ .16\% \\ \hline
MLP2 & 24.3\% $\pm$ .15\% & 46.0\% $\pm$ .35\% & 54.7\% $\pm$ .17\% & 4.85\% $\pm$ .18\% \\ \hline
\citep{Granger+al-2007} & & & & 4.95\% $\pm$ .18\% \\ \hline
\citep{Cortes+al-2000} & & & & 3.71\% $\pm$ .16\% \\ \hline
\citep{Oliveira+al-2002} & & & & 2.4\% $\pm$ .13\% \\ \hline
\citep{Milgram+al-2005} & & & & 2.1\% $\pm$ .12\% \\ \hline
\end{tabular}
\end{center}
\end{table}

% ---------------------------------------------------------------------------
% Table 2: relative change in error when training on perturbed data.
% Each row label is the ratio used, e.g. SDA0/SDA1 - 1 = (error of the model
% trained on NIST) / (error of the model trained on NISTP) - 1.
% ---------------------------------------------------------------------------
\begin{table}[ht]
\caption{Relative change in error rates due to the use of perturbed training
data, either using NISTP, for the MLP1/SDA1 models, or using P07, for the
MLP2/SDA2 models. A positive value indicates that training on the perturbed
data helped for the given test set (the first 3 columns are on the 62-class
tasks and the last one is on the clean 10-class digits). Clearly, the deep
learning models did benefit more from perturbed training data, even when
testing on clean data, whereas the MLP trained on perturbed data performed
worse on the clean digits and about the same on the clean characters.
}
\label{tab:perturbation-effect}
\begin{center}
\begin{tabular}{|l|r|r|r|r|} \hline
 & NIST test & NISTP test & P07 test & NIST test digits \\ \hline
SDA0/SDA1$-$1 & 38\% & 84\% & 228\% & 93\% \\ \hline
SDA0/SDA2$-$1 & 27\% & 94\% & 144\% & 59\% \\ \hline
MLP0/MLP1$-$1 & 5.2\% & 65\% & $-$13\% & $-$10\% \\ \hline
MLP0/MLP2$-$1 & $-$0.4\% & 49\% & 44\% & $-$29\% \\ \hline
\end{tabular}
\end{center}
\end{table}

% ---------------------------------------------------------------------------
% Table 3: single-task vs. multi-task training, MLP vs. SDA.
% NOTE(review): as labeled, the multi-task column holds the higher error for
% every SDA row, which seems at odds with "the SDA benefits much more from the
% multi-task setting"; the two column headers may be swapped -- confirm.
% NOTE(review): applying the caption's formula gives about -3.6% for MLP-lower
% (1 - 17.4/16.8) and about -4.0% for MLP-upper (1 - 7.84/7.54); the listed
% -4.1% / -3.6% may be swapped between those two rows -- confirm.
% ---------------------------------------------------------------------------
\begin{table}[ht]
\caption{Test error rates and relative change in error rates due to the use of
a multi-task setting, i.e., training on each task in isolation vs training for
all three tasks together, for MLPs vs SDAs. The SDA benefits much more from
the multi-task setting. All experiments are only on the unperturbed NIST data,
using validation error for model selection. Relative improvement is
1 $-$ single-task error / multi-task error.}
\label{tab:multi-task}
\begin{center}
\begin{tabular}{|l|r|r|r|} \hline
 & single-task & multi-task & relative \\
 & setting & setting & improvement \\ \hline
MLP-digits & 3.77\% & 3.99\% & 5.6\% \\ \hline
MLP-lower & 17.4\% & 16.8\% & $-$4.1\% \\ \hline
MLP-upper & 7.84\% & 7.54\% & $-$3.6\% \\ \hline
SDA-digits & 2.6\% & 3.56\% & 27\% \\ \hline
SDA-lower & 12.3\% & 14.4\% & 15\% \\ \hline
SDA-upper & 5.93\% & 6.78\% & 13\% \\ \hline
\end{tabular}
\end{center}
\end{table}

\newpage

\vspace*{10mm}
% The \bibliography and \bibliographystyle{unsrtnat} commands must stay active:
% the \citep entries in the first table need them to resolve. The alternative
% styles and the {\small ... } wrapper are left disabled.
%{\small
\bibliography{strings,strings-short,strings-shorter,ift6266_ml,aigaion-shorter,specials}
%\bibliographystyle{plainnat}
\bibliographystyle{unsrtnat}
%\bibliographystyle{apalike}
%}

\end{document}