# HG changeset patch
# User Yoshua Bengio
# Date 1272376578 14400
# Node ID 5f8fffd7347fbc93903d4e1a8fdfbef5dea739e6
# Parent  d76c85ba12d69f4fa55540f0fc2e0cd6b690587d
possible image for illustrating perturbations

diff -r d76c85ba12d6 -r 5f8fffd7347f writeup/images/example_t.png
Binary file writeup/images/example_t.png has changed
diff -r d76c85ba12d6 -r 5f8fffd7347f writeup/ml.bib
--- a/writeup/ml.bib	Tue Apr 27 08:55:30 2010 -0400
+++ b/writeup/ml.bib	Tue Apr 27 09:56:18 2010 -0400
@@ -2420,7 +2420,9 @@
   title =   "Learning Deep Architectures for {AI}",
   journal = {Foundations \& Trends in Mach. Learn.},
   year =    "2009",
-  volume =  {to appear},
+  volume =  2,
+  number =  1,
+  pages =   {1--127},
 }
 
 @TechReport{Bengio-TR1312-small,
diff -r d76c85ba12d6 -r 5f8fffd7347f writeup/techreport.tex
--- a/writeup/techreport.tex	Tue Apr 27 08:55:30 2010 -0400
+++ b/writeup/techreport.tex	Tue Apr 27 09:56:18 2010 -0400
@@ -12,11 +12,64 @@
 \maketitle
 
 \begin{abstract}
-
+Recent theoretical and empirical work in statistical machine learning has
+demonstrated the importance of learning algorithms for deep
+architectures, i.e., function classes obtained by composing multiple
+non-linear transformations. In the area of handwriting recognition,
+deep learning algorithms have so far been evaluated only on rather small
+datasets with a few tens of thousands of examples. Here we propose a
+powerful generator of variations of character-image examples, based on a
+pipeline of stochastic transformations that includes not only the usual
+affine transformations but also the addition of slant, local elastic
+deformations, changes in thickness, background images, color, contrast,
+occlusion, and various types of pixel and spatially correlated noise.
+We evaluate a deep learning algorithm (Stacked Denoising Auto-encoders)
+on the task of learning to classify digits and letters transformed
+with this pipeline, training on hundreds of millions of generated examples
+and testing on the full NIST test set.
+We find that the SDA outperforms its
+shallow counterpart, an ordinary Multi-Layer Perceptron,
+and that it is better able to take advantage of the additional
+generated data.
 \end{abstract}
 
 \section{Introduction}
 
+Deep Learning has emerged as a promising new area of research in
+statistical machine learning (see~\emcite{Bengio-2009} for a review).
+Learning algorithms for deep architectures are centered on learning
+useful representations of data, better suited to the task at hand.
+This is in large part inspired by observations of the mammalian visual
+cortex, which consists of a chain of processing elements, each of which
+is associated with a different representation. Indeed,
+it was recently found that the features learned in deep architectures
+resemble those observed in the first two of these stages (in areas V1
+and V2 of visual cortex)~\cite{HonglakL2008}.
+Processing images typically involves transforming the raw pixel data into
+new {\bf representations} that can be used for analysis or classification.
+For example, a principal component analysis representation linearly projects
+the input image into a lower-dimensional feature space.
+Why learn a representation? Current practice in the computer vision
+literature converts the raw pixels into a hand-crafted representation
+(e.g.\ SIFT features~\cite{Lowe04}), but deep learning algorithms
+tend to discover similar features in their first few
+levels~\cite{HonglakL2008,ranzato-08,Koray-08,VincentPLarochelleH2008-very-small}.
+Learning increases the
+ease and practicality of developing representations that are at once
+tailored to specific tasks, yet able to borrow statistical strength
+from other related tasks (e.g., modeling different kinds of objects).
+Finally, learning the feature representation can lead to higher-level
+(more abstract, more general) features that are more robust to
+unanticipated sources of variance present in real data.
+
+Whereas a deep architecture can in principle be more powerful than a
+shallow one in terms of representation, depth appears to make the
+training problem more difficult in terms of optimization and local minima.
+It is only recently that successful algorithms have been proposed to
+overcome some of these difficulties.
+
 \section{Perturbation and Transformation of Character Images}
 
 \subsection{Affine Transformations}
@@ -60,7 +113,16 @@
 
 \section{Experimental Results}
 
-\subsection{SDAE vs MLP}
+\subsection{SDA vs MLP}
+
+\begin{center}
+\begin{tabular}{lcc}
+     & train w/ & train w/    \\
+     & NIST     & P07 + NIST  \\ \hline
+SDA  &          &             \\ \hline
+MLP  &          &             \\ \hline
+\end{tabular}
+\end{center}
 
 \subsection{Perturbed Training Data More Helpful for SDAE}
 
@@ -68,7 +130,7 @@
 
 \section{Conclusions}
 
-\bibliography{strings,ml}
+\bibliography{strings,ml,aigaion}
 \bibliographystyle{mlapa}
 
 \end{document}
\ No newline at end of file
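The abstract added above describes a pipeline of stochastic transformations (slant, elastic deformations, thickness changes, occlusion, various kinds of noise). As a rough illustration of what one stage of such a pipeline might look like, here is a minimal Python sketch that applies a random slant, additive Gaussian pixel noise, and salt-and-pepper occlusion to a grayscale image; the function name, parameter names, and parameter ranges are illustrative assumptions, not the paper's actual implementation.

```python
import numpy as np

def perturb(image, rng, shear_max=0.3, noise_std=0.05, occlusion_p=0.1):
    """Illustrative perturbation stage (NOT the paper's pipeline):
    random horizontal slant, additive Gaussian pixel noise, and
    salt-and-pepper occlusion on a 2D grayscale image in [0, 1]."""
    h, w = image.shape
    # Random slant: shift each row horizontally in proportion to its
    # distance from the vertical center of the image.
    shear = rng.uniform(-shear_max, shear_max)
    out = np.zeros_like(image)
    for y in range(h):
        shift = int(round(shear * (y - h / 2)))
        src = np.clip(np.arange(w) - shift, 0, w - 1)
        out[y] = image[y, src]
    # Additive Gaussian pixel noise.
    out = out + rng.normal(0.0, noise_std, size=out.shape)
    # Salt-and-pepper occlusion: force a random subset of pixels to 0 or 1.
    mask = rng.random(out.shape) < occlusion_p
    out[mask] = rng.integers(0, 2, size=mask.sum())
    return np.clip(out, 0.0, 1.0)
```

A full generator of the kind described in the abstract would chain many such stages (elastic deformation, thickness, background, contrast, ...) with randomly drawn parameters per example.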