# HG changeset patch # User fsavard # Date 1285883847 14400 # Node ID 8bd4ff0c5c055cf01997ac7779a27497a28f2ea1 # Parent 4672fb6b4385d948ace6a19bd0aadf6c5968ef58# Parent 1538412ee69dd1ce78d11e99219ee381ea3a15ca Correction de merge diff -r 1538412ee69d -r 8bd4ff0c5c05 writeup/mlj_submission.tex --- a/writeup/mlj_submission.tex Thu Sep 30 17:54:23 2010 -0400 +++ b/writeup/mlj_submission.tex Thu Sep 30 17:57:27 2010 -0400 @@ -77,7 +77,7 @@ \end{abstract} %\vspace*{-3mm} -Keywords: self-taught learning, multi-task learning, out-of-distribution examples, deep learning, handwriting recognition. +\keywords{self-taught learning \and multi-task learning \and out-of-distribution examples \and deep learning \and handwriting recognition} \section{Introduction} %\vspace*{-1mm} @@ -129,7 +129,7 @@ learning, often in an greedy layer-wise ``unsupervised pre-training'' stage~\citep{Bengio-2009}. One of these layer initialization techniques, applied here, is the Denoising -Auto-encoder~(DA)~\citep{VincentPLarochelleH2008} (see Figure~\ref{fig:da}), +Auto-encoder~(DA)~\citep{VincentPLarochelleH2008-very-small} (see Figure~\ref{fig:da}), which performed similarly or better than previously proposed Restricted Boltzmann Machines in terms of unsupervised extraction of a hierarchy of features @@ -203,7 +203,7 @@ %\begin{minipage}[b]{0.14\linewidth} %\vspace*{-5mm} \begin{center} -\includegraphics[scale=.4]{images/Original.png}\\ +\includegraphics[scale=.4]{Original.png}\\ {\bf Original} \end{center} \end{wrapfigure} @@ -240,7 +240,7 @@ %\centering \begin{center} \vspace*{-5mm} -\includegraphics[scale=.4]{images/Thick_only.png}\\ +\includegraphics[scale=.4]{Thick_only.png}\\ %{\bf Thickness} \end{center} \vspace{.6cm} @@ -268,7 +268,7 @@ \begin{minipage}[b]{0.14\linewidth} \centering -\includegraphics[scale=.4]{images/Slant_only.png}\\ +\includegraphics[scale=.4]{Slant_only.png}\\ %{\bf Slant} \end{minipage}% \hspace{0.3cm} @@ -290,7 +290,7 @@ %\centering %\begin{wrapfigure}[8]{l}{0.15\textwidth} \begin{center} -\includegraphics[scale=.4]{images/Affine_only.png} +\includegraphics[scale=.4]{Affine_only.png} \vspace*{6mm} %{\small {\bf Affine \mbox{Transformation}}} \end{center} @@ -320,7 +320,7 @@ %\centering \begin{center} \vspace*{5mm} -\includegraphics[scale=.4]{images/Localelasticdistorsions_only.png} +\includegraphics[scale=.4]{Localelasticdistorsions_only.png} %{\bf Local Elastic Deformation} \end{center} %\end{wrapfigure} @@ -347,7 +347,7 @@ %\begin{wrapfigure}[7]{l}{0.15\textwidth} %\vspace*{-5mm} \begin{center} -\includegraphics[scale=.4]{images/Pinch_only.png}\\ +\includegraphics[scale=.4]{Pinch_only.png}\\ \vspace*{15mm} %{\bf Pinch} \end{center} @@ -384,7 +384,7 @@ \begin{minipage}[t]{0.14\linewidth} \centering \vspace*{0mm} -\includegraphics[scale=.4]{images/Motionblur_only.png} +\includegraphics[scale=.4]{Motionblur_only.png} %{\bf Motion Blur} \end{minipage}% \hspace{0.3cm}\begin{minipage}[t]{0.83\linewidth} @@ -405,7 +405,7 @@ \begin{minipage}[t]{0.14\linewidth} \centering \vspace*{3mm} -\includegraphics[scale=.4]{images/occlusion_only.png}\\ +\includegraphics[scale=.4]{occlusion_only.png}\\ %{\bf Occlusion} %%\vspace{.5cm} \end{minipage}% @@ -432,7 +432,7 @@ \begin{center} %\centering \vspace*{6mm} -\includegraphics[scale=.4]{images/Bruitgauss_only.png} +\includegraphics[scale=.4]{Bruitgauss_only.png} %{\bf Gaussian Smoothing} \end{center} %\end{wrapfigure} @@ -468,7 +468,7 @@ %\vspace*{-5mm} \begin{center} \vspace*{1mm} -\includegraphics[scale=.4]{images/Permutpixel_only.png} +\includegraphics[scale=.4]{Permutpixel_only.png} %{\small\bf Permute Pixels} \end{center} %\end{wrapfigure} @@ -495,7 +495,7 @@ %\hspace*{-3mm}\begin{minipage}[t]{0.18\linewidth} %\centering \vspace*{0mm} -\includegraphics[scale=.4]{images/Distorsiongauss_only.png} +\includegraphics[scale=.4]{Distorsiongauss_only.png} %{\small \bf Gauss. Noise} \end{center} %\end{wrapfigure} @@ -517,7 +517,7 @@ \begin{minipage}[t]{0.14\linewidth} \centering \vspace*{0mm} -\includegraphics[scale=.4]{images/background_other_only.png} +\includegraphics[scale=.4]{background_other_only.png} %{\small \bf Bg Image} \end{minipage}% \hspace{0.3cm}\begin{minipage}[t]{0.83\linewidth} @@ -536,7 +536,7 @@ \begin{minipage}[t]{0.14\linewidth} \centering \vspace*{0mm} -\includegraphics[scale=.4]{images/Poivresel_only.png} +\includegraphics[scale=.4]{Poivresel_only.png} %{\small \bf Salt \& Pepper} \end{minipage}% \hspace{0.3cm}\begin{minipage}[t]{0.83\linewidth} @@ -558,7 +558,7 @@ \begin{center} \vspace*{4mm} %\hspace*{-1mm} -\includegraphics[scale=.4]{images/Rature_only.png}\\ +\includegraphics[scale=.4]{Rature_only.png}\\ %{\bf Scratches} \end{center} \end{minipage}% @@ -584,7 +584,7 @@ \begin{minipage}[t]{0.15\linewidth} \centering \vspace*{0mm} -\includegraphics[scale=.4]{images/Contrast_only.png} +\includegraphics[scale=.4]{Contrast_only.png} %{\bf Grey Level \& Contrast} \end{minipage}% \hspace{3mm}\begin{minipage}[t]{0.85\linewidth} @@ -791,7 +791,7 @@ \begin{figure}[ht] %\vspace*{-2mm} -\centerline{\resizebox{0.8\textwidth}{!}{\includegraphics{images/denoising_autoencoder_small.pdf}}} +\centerline{\resizebox{0.8\textwidth}{!}{\includegraphics{denoising_autoencoder_small.pdf}}} %\vspace*{-2mm} \caption{Illustration of the computations and training criterion for the denoising auto-encoder used to pre-train each layer of the deep architecture. Input $x$ of @@ -840,7 +840,7 @@ \begin{figure}[ht] %\vspace*{-2mm} -\centerline{\resizebox{.99\textwidth}{!}{\includegraphics{images/error_rates_charts.pdf}}} +\centerline{\resizebox{.99\textwidth}{!}{\includegraphics{error_rates_charts.pdf}}} %\vspace*{-3mm} \caption{SDAx are the {\bf deep} models. Error bars indicate a 95\% confidence interval. 0 indicates that the model was trained on NIST, 1 on NISTP, and 2 on P07. Left: overall results @@ -855,7 +855,7 @@ \begin{figure}[ht] %\vspace*{-3mm} -\centerline{\resizebox{.99\textwidth}{!}{\includegraphics{images/improvements_charts.pdf}}} +\centerline{\resizebox{.99\textwidth}{!}{\includegraphics{improvements_charts.pdf}}} %\vspace*{-3mm} \caption{Relative improvement in error rate due to self-taught learning. Left: Improvement (or loss, when negative) diff -r 1538412ee69d -r 8bd4ff0c5c05 writeup/mlj_submission/mlj_submission.tex --- a/writeup/mlj_submission/mlj_submission.tex Thu Sep 30 17:54:23 2010 -0400 +++ b/writeup/mlj_submission/mlj_submission.tex Thu Sep 30 17:57:27 2010 -0400 @@ -73,11 +73,11 @@ %\vspace*{-2mm} \begin{abstract} - Recent theoretical and empirical work in statistical machine learning has demonstrated the importance of learning algorithms for deep architectures, i.e., function classes obtained by composing multiple non-linear transformations. Self-taught learning (exploiting unlabeled examples or examples from other distributions) has already been applied to deep learners, but mostly to show the advantage of unlabeled examples. Here we explore the advantage brought by {\em out-of-distribution examples}. For this purpose we developed a powerful generator of stochastic variations and noise processes for character images, including not only affine transformations but also slant, local elastic deformations, changes in thickness, background images, grey level changes, contrast, occlusion, and various types of noise. The out-of-distribution examples are obtained from these highly distorted images or by including examples of object classes different from those in the target test set. We show that {\em deep learners benefit more from out-of-distribution examples than a corresponding shallow learner}, at least in the area of handwritten character recognition. In fact, we show that they beat previously published results and reach human-level performance on both handwritten digit classification and 62-class handwritten character recognition. + Recent theoretical and empirical work in statistical machine learning has demonstrated the potential of learning algorithms for deep architectures, i.e., function classes obtained by composing multiple levels of representation. Self-taught learning (exploiting unlabeled examples or examples from other distributions) has already been applied to deep learners, but mostly to show the advantage of unlabeled examples. Here we explore the advantage brought by {\em out-of-distribution examples}. For this purpose we developed a powerful generator of stochastic variations and noise processes for character images, including not only affine transformations but also slant, local elastic deformations, changes in thickness, background images, grey level changes, contrast, occlusion, and various types of noise. The out-of-distribution examples are obtained from these highly distorted images or by including examples of object classes different from those in the target test set. We show that {\em deep learners benefit more from out-of-distribution examples than a corresponding shallow learner}, at least in the area of handwritten character recognition. In fact, we show that they beat previously published results and reach human-level performance on both handwritten digit classification and 62-class handwritten character recognition. \end{abstract} %\vspace*{-3mm} -Keywords: self-taught learning, multi-task learning, out-of-distribution examples, deep learning, handwriting recognition. +\keywords{self-taught learning \and multi-task learning \and out-of-distribution examples \and deep learning \and handwriting recognition} \section{Introduction} %\vspace*{-1mm}