# HG changeset patch # User Yoshua Bengio # Date 1275501361 14400 # Node ID ef172f4a322ab5231d7fce3992bcccea447bbce9 # Parent 34cb28249de081c060edcf2934414c6253c1e694 ca fitte diff -r 34cb28249de0 -r ef172f4a322a writeup/nips2010_submission.tex --- a/writeup/nips2010_submission.tex Wed Jun 02 13:30:35 2010 -0400 +++ b/writeup/nips2010_submission.tex Wed Jun 02 13:56:01 2010 -0400 @@ -392,6 +392,7 @@ substantially, with relatively many more digits in the test set, and more uniform distribution of letters in the test set, compared to the training set (in the latter, the letters are distributed more like the natural distribution of letters in text). +\vspace*{-1mm} %\item {\bf Fonts.} @@ -401,6 +402,7 @@ Including the operating system's (Windows 7) fonts, there is a total of $9817$ different fonts that we can choose uniformly from. The chosen {\tt ttf} file is either used as input of the Captcha generator (see next item) or, by producing a corresponding image, directly as input to our models. +\vspace*{-1mm} %\item {\bf Captchas.} @@ -411,6 +413,7 @@ Transformations (slant, distortions, rotation, translation) are applied to each randomly generated character with a complexity depending on the value of the complexity parameter provided by the user of the data source. %Two levels of complexity are allowed and can be controlled via an easy to use facade class. %TODO: what's a facade class? +\vspace*{-1mm} %\item {\bf OCR data.} @@ -429,10 +432,12 @@ All data sets contain 32$\times$32 grey-level images (values in $[0,1]$) associated with a label from one of the 62 character classes. %\begin{itemize} +\vspace*{-1mm} %\item {\bf NIST.} This is the raw NIST special database 19~\citep{Grother-1995}. It has \{651668 / 80000 / 82587\} \{training / validation / test\} examples. +\vspace*{-1mm} %\item {\bf P07.} This dataset is obtained by taking raw characters from all four of the above sources @@ -441,6 +446,7 @@ $25\%$ from the captchas, $25\%$ from the OCR data and $40\%$ from NIST. We apply all the transformations in the order given above, and for each of them we sample uniformly a \emph{complexity} in the range $[0,0.7]$. It has \{81920000 / 80000 / 20000\} \{training / validation / test\} examples. +\vspace*{-1mm} %\item {\bf NISTP.} This one is equivalent to P07 (complexity parameter of $0.7$ with the same proportions of data sources) @@ -471,6 +477,7 @@ rate was chosen among $\{0.001, 0.01, 0.025, 0.075, 0.1, 0.5\}$ through preliminary experiments (measuring performance on a validation set), and $0.1$ was then selected for optimizing on the whole training sets. +\vspace*{-1mm} {\bf Stacked Denoising Auto-Encoders (SDA).} @@ -491,6 +498,7 @@ \begin{figure}[ht] \vspace*{-2mm} \centerline{\resizebox{0.8\textwidth}{!}{\includegraphics{images/denoising_autoencoder_small.pdf}}} +\vspace*{-2mm} \caption{Illustration of the computations and training criterion for the denoising auto-encoder used to pre-train each layer of the deep architecture. Input $x$ of the layer (i.e. raw input or output of previous layer) @@ -534,19 +542,20 @@ \begin{figure}[ht] \vspace*{-2mm} \centerline{\resizebox{.99\textwidth}{!}{\includegraphics{images/error_rates_charts.pdf}}} +\vspace*{-3mm} \caption{SDAx are the {\bf deep} models. Error bars indicate a 95\% confidence interval. 0 indicates that the model was trained on NIST, 1 on NISTP, and 2 on P07. Left: overall results of all models, on NIST and NISTP test sets. Right: error rates on NIST test digits only, along with the previous results from literature~\citep{Granger+al-2007,Cortes+al-2000,Oliveira+al-2002-short,Milgram+al-2005} respectively based on ART, nearest neighbors, MLPs, and SVMs.} - \label{fig:error-rates-charts} \vspace*{-2mm} \end{figure} \section{Experimental Results} +\vspace*{-2mm} %\vspace*{-1mm} %\subsection{SDA vs MLP vs Humans} @@ -572,8 +581,9 @@ and the 10-class (digits) task. \begin{figure}[ht] -\vspace*{-2mm} +\vspace*{-3mm} \centerline{\resizebox{.99\textwidth}{!}{\includegraphics{images/improvements_charts.pdf}}} +\vspace*{-3mm} \caption{Relative improvement in error rate due to self-taught learning. Left: Improvement (or loss, when negative) induced by out-of-distribution examples (perturbed data). @@ -650,9 +660,9 @@ \fi -\vspace*{-1mm} +\vspace*{-2mm} \section{Conclusions and Discussion} -\vspace*{-1mm} +\vspace*{-2mm} We have found that the self-taught learning framework is more beneficial to a deep learner than to a traditional shallow and purely @@ -662,7 +672,7 @@ $\bullet$ %\item {\bf Do the good results previously obtained with deep architectures on the -MNIST digits generalize to the setting of a much larger and richer (but similar) +MNIST digits generalize to a much larger and richer (but similar) dataset, the NIST special database 19, with 62 classes and around 800k examples}? Yes, the SDA {\bf systematically outperformed the MLP and all the previously published results on this dataset} (the ones that we are aware of), {\bf in fact reaching human-level