# HG changeset patch
# User Yoshua Bengio
# Date 1275415502 14400
# Node ID 66a905508e34fc01f1e249d33ea5a5c39e1f8230
# Parent  6f042a71be239e9b4dac9e2b1e2816fb5e2b967a
# Parent  8c2ab4f246b19be8d62c02fa4b2ff50a9f35242e
resolved merge conflict

diff -r 6f042a71be23 -r 66a905508e34 writeup/ift6266_ml.bib
--- a/writeup/ift6266_ml.bib	Tue Jun 01 14:02:04 2010 -0400
+++ b/writeup/ift6266_ml.bib	Tue Jun 01 14:05:02 2010 -0400
@@ -267,14 +267,6 @@
     mixture that has a dominant tail",
 }

-@techreport{ift6266-tr-anonymous,
-  author = "Anonymous authors",
-  title = "Generating and Exploiting Perturbed and Multi-Task Handwritten
-Training Data for Deep Architectures",
-  institution = "University X.",
-  year = 2010,
-}
-
 @TechReport{Abdallah+Plumbley-06,
   author = "Samer Abdallah and Mark Plumbley",
   title = "Geometry Dependency Analysis",
diff -r 6f042a71be23 -r 66a905508e34 writeup/nips2010_submission.tex
--- a/writeup/nips2010_submission.tex	Tue Jun 01 14:02:04 2010 -0400
+++ b/writeup/nips2010_submission.tex	Tue Jun 01 14:05:02 2010 -0400
@@ -20,7 +20,7 @@
   Recent theoretical and empirical work in statistical machine learning has
   demonstrated the importance of learning algorithms for deep architectures,
   i.e., function classes obtained by composing multiple
-  non-linear transformations. Self-taught learning (exploiting unlabeled
+  non-linear transformations. The self-taught learning (exploiting unlabeled
   examples or examples from other distributions) has already been applied
   to deep learners, but mostly to show the advantage of unlabeled
   examples. Here we explore the advantage brought by {\em out-of-distribution
@@ -74,8 +74,8 @@
 performed similarly or better than previously proposed Restricted Boltzmann
 Machines in terms of unsupervised extraction of a hierarchy of features
 useful for classification. The principle is that each layer starting from
-the bottom is trained to encode its input (the output of the previous
-layer) and to reconstruct it from a corrupted version of it. After this
+the bottom is trained to encode their input (the output of the previous
+layer) and try to reconstruct it from a corrupted version of it. After this
 unsupervised initialization, the stack of denoising auto-encoders can be
 converted into a deep supervised feedforward neural network and fine-tuned by
 stochastic gradient descent.
@@ -119,7 +119,7 @@
 a corresponding shallow and purely supervised architecture?
 %\end{enumerate}

-Our experimental results provide evidence to support positive answers to all of these questions.
+The experimental results presented here provide positive evidence towards all of these questions.

 \vspace*{-1mm}
 \section{Perturbation and Transformation of Character Images}
@@ -204,7 +204,7 @@
 {\bf Pinch.}
 This GIMP filter is named "Whirl and pinch", but whirl was set to 0.
 A pinch is ``similar to projecting the image onto an elastic
-surface and pressing or pulling on the center of the surface'' (GIMP documentation manual).
+surface and pressing or pulling on the center of the surface''~\citep{GIMP-manual}.
 For a square input image, think of drawing a circle of radius $r$ around
 a center point $C$. Any point (pixel) $P$ belonging to that disk (region
 inside circle) will have its value recalculated by taking
@@ -338,10 +338,9 @@
 service\footnote{http://mturk.com}.
 AMT users are paid small amounts of money
 to perform tasks for which human intelligence is required.
-Mechanical Turk has been used extensively in natural language processing and vision.
-%processing \citep{SnowEtAl2008} and vision
-%\citep{SorokinAndForsyth2008,whitehill09}.
-%\citep{SorokinAndForsyth2008,whitehill09}.
+Mechanical Turk has been used extensively in natural language
+processing \citep{SnowEtAl2008} and vision
+\citep{SorokinAndForsyth2008,whitehill09}.
 AMT users where presented
 with 10 character images and asked to type 10 corresponding ASCII characters.
 They were forced to make a hard choice among the
@@ -587,7 +586,13 @@

 \begin{figure}[h]
 \resizebox{.99\textwidth}{!}{\includegraphics{images/improvements_charts.pdf}}\\
-\caption{Charts corresponding to tables 2 (left) and 3 (right), from Appendix I.}
+\caption{Relative improvement in error rate due to self-taught learning.
+Left: Improvement (or loss, when negative)
+induced by out-of-distribution examples (perturbed data).
+Right: Improvement (or loss, when negative) induced by multi-task
+learning (training on all classes and testing only on either digits,
+upper case, or lower-case). The deep learner (SDA) benefits more from
+both self-taught learning scenarios, compared to the shallow MLP.}
 \label{fig:improvements-charts}
 \end{figure}
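
The second hunk of nips2010_submission.tex describes the stacked denoising
auto-encoder (SDA) procedure: each layer encodes the output of the layer
below and is trained to reconstruct it from a corrupted version, after which
the stack is fine-tuned by supervised stochastic gradient descent. The sketch
below is a minimal NumPy illustration of that greedy layer-wise idea, not the
code used for the paper; the sigmoid units, tied weights, masking noise,
layer sizes, corruption level and learning rate are all assumptions made for
the example.

# Minimal stacked denoising auto-encoder sketch (illustrative only).
import numpy as np

rng = np.random.RandomState(0)

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

class DenoisingAutoencoder:
    def __init__(self, n_in, n_hidden):
        self.W = rng.uniform(-0.1, 0.1, size=(n_in, n_hidden))  # tied weights
        self.b_hid = np.zeros(n_hidden)
        self.b_vis = np.zeros(n_in)

    def encode(self, x):
        return sigmoid(x @ self.W + self.b_hid)

    def decode(self, h):
        return sigmoid(h @ self.W.T + self.b_vis)

    def pretrain_step(self, x, corruption=0.25, lr=0.1):
        # Corrupt the input by zeroing a random subset of components, then
        # encode the corrupted input and reconstruct the clean one.
        mask = rng.binomial(1, 1.0 - corruption, size=x.shape)
        x_tilde = x * mask
        h = self.encode(x_tilde)
        z = self.decode(h)
        # Gradients of the cross-entropy reconstruction cost (sigmoid output).
        dz = z - x                          # (batch, n_in)
        dh = (dz @ self.W) * h * (1 - h)    # (batch, n_hidden)
        self.W -= lr * (x_tilde.T @ dh + dz.T @ h) / x.shape[0]
        self.b_hid -= lr * dh.mean(axis=0)
        self.b_vis -= lr * dz.mean(axis=0)

# Greedy layer-wise pre-training: each layer is trained on the output of the
# previously trained layers.
X = rng.rand(100, 32 * 32)                  # stand-in for character images
layers = [DenoisingAutoencoder(32 * 32, 500), DenoisingAutoencoder(500, 300)]
inp = X
for dae in layers:
    for _ in range(10):
        dae.pretrain_step(inp)
    inp = dae.encode(inp)
# After pre-training, the encoders would be stacked under a supervised output
# layer and the whole network fine-tuned by stochastic gradient descent.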
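
The fourth hunk cites the GIMP manual for the "Whirl and pinch" filter (with
whirl set to 0): pixels inside a disk of radius r around a center C are
recomputed from a source pixel found along the line through C and P. The hunk
is cut off at "by taking", so the sketch below uses an assumed radial mapping
(scaling the source distance by sin(pi/2 * d/r) ** -pinch); it is in the
spirit of a pinch distortion but is not taken from GIMP's source, and the
function name and parameters are made up for illustration.

# Rough sketch of a radial "pinch" distortion on a square grey-scale image.
import numpy as np

def pinch(image, pinch_amount=0.5):
    n = image.shape[0]                      # assume a square image
    c = (n - 1) / 2.0                       # center point C
    r = n / 2.0                             # radius of the affected disk
    out = image.copy()
    for y in range(n):
        for x in range(n):
            dx, dy = x - c, y - c
            d = np.hypot(dx, dy)            # distance between P and C
            if 0 < d < r:
                # Pull the source pixel from another distance along the line
                # through C and P (nearest-neighbour sampling), assumed form.
                factor = np.sin(np.pi / 2 * d / r) ** (-pinch_amount)
                sx = int(round(c + dx * factor))
                sy = int(round(c + dy * factor))
                if 0 <= sx < n and 0 <= sy < n:
                    out[y, x] = image[sy, sx]
    return out

img = np.random.rand(32, 32)                # stand-in for a character image
pinched = pinch(img, pinch_amount=0.5)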