diff writeup/nips2010_submission.tex @ 541:8aad1c6ec39a

space reduction
author Yoshua Bengio <bengioy@iro.umontreal.ca>
date Wed, 02 Jun 2010 10:23:33 -0400
parents 84f42fe05594
children 1cdfc17e890f
--- a/writeup/nips2010_submission.tex	Wed Jun 02 01:34:49 2010 -0400
+++ b/writeup/nips2010_submission.tex	Wed Jun 02 10:23:33 2010 -0400
@@ -143,6 +143,7 @@
 part, from blur to contrast, adds different kinds of noise.
 
 \begin{figure}[ht]
+\vspace*{-2mm}
 \centerline{\resizebox{.9\textwidth}{!}{\includegraphics{images/transfo.png}}}
 % TODO: PUT THE NAME OF THE TRANSFORMATION NEXT TO EACH IMAGE
 \caption{Illustration of each transformation applied alone to the same image
@@ -153,21 +154,18 @@
 background image, salt and pepper noise, spatially Gaussian noise, scratches,
 grey level and contrast changes.}
 \label{fig:transfo}
+\vspace*{-2mm}
 \end{figure}
 
 {\large\bf Transformations}
 
-\vspace*{2mm}
+\vspace*{0.5mm}
 
 {\bf Slant.} 
-We mimic slant by shifting each row of the image
+Each row of the image is shifted
 proportionally to its height: $shift = round(slant \times height)$.  
-The $slant$ coefficient can be negative or positive with equal probability
-and its value is randomly sampled according to the complexity level:
-$slant \sim U[0,complexity]$, so the
-maximum displacement for the lowest or highest pixel line is of
-$round(complexity \times 32)$.
-\vspace*{0mm}
+$slant \sim U[-complexity,complexity]$.
+\vspace*{-1mm}
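
As a minimal illustration of the slant transformation in this hunk, the sketch below shifts each row by round(slant * row) with slant ~ U[-complexity, complexity]; the 2D NumPy array, the zero background value and the row-index-as-height convention are assumptions, not the authors' generator code.

import numpy as np

def slant_image(image, complexity):
    # shift each row in proportion to its vertical position:
    # shift = round(slant * row), slant ~ U[-complexity, complexity]
    height, width = image.shape
    slant = np.random.uniform(-complexity, complexity)
    out = np.zeros_like(image)               # 0 assumed to be background
    for row in range(height):
        shift = int(round(slant * row))
        for col in range(width):
            src = col - shift
            if 0 <= src < width:
                out[row, col] = image[row, src]
    return out
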
 
 {\bf Thickness.}
 Morphological operators of dilation and erosion~\citep{Haralick87,Serra82}
@@ -177,46 +175,38 @@
 matrix, respectively for dilation or erosion. Ten different structural elements with 
 increasing dimensions (largest is $5\times5$) were used.  For each image, 
 the operator type (dilation or erosion) is sampled with equal probability, along with one structural
-element from a subset of the $n$ smallest structuring elements where $n$ is
-$round(10 \times complexity)$ for dilation and $round(6 \times complexity)$
-for erosion.  A neutral element is always present in the set, and if it is
-chosen no transformation is applied.  Erosion allows only the six
-smallest structural elements because when the character is too thin it may
-be completely erased.
-\vspace*{0mm}
+element from a subset of the $n=round(m \times complexity)$ smallest structuring elements
+where $m=10$ for dilation and $m=6$ for erosion (to avoid completely erasing thin characters).  
+A neutral element (no transformation)
+is always present in the set.
+\vspace*{-1mm}
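
A hedged sketch of the thickness transformation: it assumes `structuring_elements` is a list of ten boolean footprints sorted by increasing size (largest 5x5) and places the neutral element at index 0 of the sampled subset; the actual element shapes are not part of this diff.

import numpy as np
from scipy import ndimage

def thickness(image, complexity, structuring_elements):
    dilate = np.random.rand() < 0.5          # dilation or erosion, equal probability
    m = 10 if dilate else 6                  # erosion limited to the 6 smallest elements
    n = int(round(m * complexity))
    # the sampled subset always contains a neutral "identity" element (index 0 here)
    idx = np.random.randint(n + 1)
    if idx == 0:
        return image                         # neutral element chosen: no transformation
    element = structuring_elements[idx - 1]
    op = ndimage.grey_dilation if dilate else ndimage.grey_erosion
    return op(image, footprint=element)
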
 
 {\bf Affine Transformations.}
 A $2 \times 3$ affine transform matrix (with
 6 parameters $(a,b,c,d,e,f)$) is sampled according to the $complexity$ level.
-Each pixel $(x,y)$ of the output image takes the value of the pixel
-nearest to $(ax+by+c,dx+ey+f)$ in the input image.  This 
-produces scaling, translation, rotation and shearing.
+Output pixel $(x,y)$ takes the value of input pixel
+nearest to $(ax+by+c,dx+ey+f)$,
+producing scaling, translation, rotation and shearing.
 The marginal distributions of $(a,b,c,d,e,f)$ have been tuned by hand to
 forbid large rotations (which could confuse classes) but to give good
 variability of the transformation: $a$ and $d$ $\sim U[1-3 \times
 complexity,1+3 \times complexity]$, $b$ and $e$ $\sim U[-3 \times complexity,3
 \times complexity]$ and $c$ and $f$ $\sim U[-4 \times complexity, 4 \times
 complexity]$.
-\vspace*{0mm}
+\vspace*{-1mm}
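
A small sketch of the affine transformation with the parameter ranges quoted above; the nearest-neighbour lookup and the (x horizontal, y vertical) axis convention are assumptions.

import numpy as np

def affine_transform(image, complexity):
    # sample the 6 affine parameters as described in the text
    a, d = np.random.uniform(1 - 3*complexity, 1 + 3*complexity, size=2)
    b, e = np.random.uniform(-3*complexity, 3*complexity, size=2)
    c, f = np.random.uniform(-4*complexity, 4*complexity, size=2)
    height, width = image.shape
    out = np.zeros_like(image)
    for y in range(height):
        for x in range(width):
            # output pixel (x,y) takes the input pixel nearest to (ax+by+c, dx+ey+f)
            sx = int(round(a*x + b*y + c))
            sy = int(round(d*x + e*y + f))
            if 0 <= sx < width and 0 <= sy < height:
                out[y, x] = image[sy, sx]
    return out
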
 
 {\bf Local Elastic Deformations.}
 This filter induces a ``wiggly'' effect in the image, following~\citet{SimardSP03-short},
 which provides more details. 
-Two ``displacements'' fields are generated and applied, for horizontal
-and vertical displacements of pixels. 
-To generate a pixel in either field, first a value between -1 and 1 is
-chosen from a uniform distribution. Then all the pixels, in both fields, are
-multiplied by a constant $\alpha$ which controls the intensity of the
-displacements (larger $\alpha$ translates into larger wiggles).
-Each field is convoluted with a Gaussian 2D kernel of
-standard deviation $\sigma$. Visually, this results in a blur.
-$\alpha = \sqrt[3]{complexity} \times 10.0$ and $\sigma = 10 - 7 \times
-\sqrt[3]{complexity}$.
-\vspace*{0mm}
+The two (horizontal and vertical) pixel displacement fields have intensity
+$\alpha = \sqrt[3]{complexity} \times 10.0$ and are
+convolved with a 2D Gaussian kernel (resulting in a blur) of
+standard deviation $\sigma = 10 - 7 \times \sqrt[3]{complexity}$.
+\vspace*{-1mm}
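
A sketch of the local elastic deformation using the $\alpha$ and $\sigma$ above, following the field construction of Simard et al. (uniform fields in [-1,1], scaled by alpha, Gaussian-smoothed with sigma); the use of scipy and nearest-neighbour resampling are assumptions.

import numpy as np
from scipy import ndimage

def elastic_deform(image, complexity):
    alpha = complexity ** (1.0 / 3.0) * 10.0
    sigma = 10.0 - 7.0 * complexity ** (1.0 / 3.0)
    height, width = image.shape
    # random displacement fields in [-1,1], scaled by alpha, then Gaussian-smoothed
    dx = ndimage.gaussian_filter(np.random.uniform(-1, 1, (height, width)) * alpha, sigma)
    dy = ndimage.gaussian_filter(np.random.uniform(-1, 1, (height, width)) * alpha, sigma)
    rows, cols = np.meshgrid(np.arange(height), np.arange(width), indexing='ij')
    coords = np.array([rows + dy, cols + dx])
    # nearest-neighbour resampling at the displaced coordinates (order=0)
    return ndimage.map_coordinates(image, coords, order=0, mode='constant')
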
 
 {\bf Pinch.}
-This is a GIMP filter called ``Whirl and
-pinch'', but whirl was set to 0. A pinch is ``similar to projecting the image onto an elastic
+This is the ``Whirl and pinch'' GIMP filter, with whirl set to 0. 
+A pinch is ``similar to projecting the image onto an elastic
 surface and pressing or pulling on the center of the surface'' (GIMP documentation manual).
 For a square input image, this is akin to drawing a circle of
 radius $r$ around a center point $C$. Any point (pixel) $P$ belonging to
@@ -230,11 +220,11 @@
 around the (non-integer) source position thus found.
 Here $pinch \sim U[-complexity, 0.7 \times complexity]$.
 
-\vspace*{1mm}
+\vspace*{0.5mm}
 
 {\large\bf Injecting Noise}
 
-\vspace*{1mm}
+\vspace*{0.5mm}
 
 {\bf Motion Blur.}
 This is a ``linear motion blur'' in GIMP
@@ -242,7 +232,7 @@
 a pixel in the final image is approximately the mean value of the first $length$ pixels
 found by moving in the $angle$ direction. 
 Here $angle \sim U[0,360]$ degrees, and $length \sim {\rm Normal}(0,(3 \times complexity)^2)$.
-\vspace*{0mm}
+\vspace*{-1mm}
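
A naive sketch of the linear motion blur described above (mean of the first $length$ pixels along the sampled angle); taking the absolute value of the Normal draw for $length$ is an assumption, since the text does not say how negative draws are handled.

import numpy as np

def motion_blur(image, complexity):
    angle = np.random.uniform(0.0, 360.0)                       # degrees
    length = int(round(abs(np.random.normal(0.0, 3.0 * complexity))))
    if length <= 1:
        return image                                            # too short to blur
    height, width = image.shape
    dx, dy = np.cos(np.radians(angle)), np.sin(np.radians(angle))
    out = np.zeros_like(image, dtype=float)
    for y in range(height):
        for x in range(width):
            # average the first `length` pixels along the blur direction
            vals = []
            for k in range(length):
                sx, sy = int(round(x + k * dx)), int(round(y + k * dy))
                if 0 <= sx < width and 0 <= sy < height:
                    vals.append(image[sy, sx])
            out[y, x] = np.mean(vals) if vals else image[y, x]
    return out
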
 
 {\bf Occlusion.}
 Selects a random rectangle from an {\em occluder} character
@@ -253,7 +243,7 @@
 The destination position in the occluded image is also sampled
 according to a normal distribution (see more details in~\citet{ift6266-tr-anonymous}).
 This filter has a probability of 60\% of not being applied.
-\vspace*{0mm}
+\vspace*{-1mm}
 
 {\bf Pixel Permutation.}
 This filter permutes neighbouring pixels. It selects first
@@ -263,13 +253,13 @@
 from more than 1 if the number of selected pixels is not a multiple of 4.
 % TODO: The previous sentence is hard to parse
 This filter has a probability of 80\% of not being applied.
-\vspace*{0mm}
+\vspace*{-1mm}
 
 {\bf Gaussian Noise.}
 This filter simply adds, to each pixel of the image independently, a
 noise $\sim {\rm Normal}(0,(\frac{complexity}{10})^2)$.
 It has a probability of 70\% of not being applied.
-\vspace*{0mm}
+\vspace*{-1mm}
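
A sketch of the additive Gaussian noise filter; clipping to [0,1] is an assumption about the pixel range.

import numpy as np

def gaussian_noise(image, complexity):
    if np.random.rand() < 0.7:
        return image                               # 70% chance of not being applied
    noise = np.random.normal(0.0, complexity / 10.0, image.shape)
    return np.clip(image + noise, 0.0, 1.0)        # assumes pixel values in [0,1]
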
 
 {\bf Background Images.}
 Following~\citet{Larochelle-jmlr-2009}, this transformation adds a random
@@ -284,13 +274,13 @@
 Each background pixel value is multiplied by $\frac{\max(maximage -
   contrast, 0)}{maxbg}$ (higher contrast yields a darker
 background). The output image pixels are $\max(background, original)$.
-\vspace*{0mm}
+\vspace*{-1mm}
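
A sketch of the background blending formula quoted above; how $contrast$ is sampled and where the background patches come from lies outside this hunk, so both are left as inputs.

import numpy as np

def add_background(image, background, contrast):
    # rescale the background so that higher contrast yields a darker background
    maximage = image.max()
    maxbg = max(background.max(), 1e-8)
    scaled_bg = background * max(maximage - contrast, 0.0) / maxbg
    # keep the brighter of character and background at every pixel
    return np.maximum(scaled_bg, image)
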
 
 {\bf Salt and Pepper Noise.}
 This filter adds noise $\sim U[0,1]$ to random subsets of pixels.
 The fraction of selected pixels is $0.2 \times complexity$.
 This filter is not applied at all with probability 75\%.
-\vspace*{0mm}
+\vspace*{-1mm}
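
A sketch of the salt and pepper filter, reading $0.2 \times complexity$ as the fraction of affected pixels and replacing (rather than adding to) the selected pixels with U[0,1] values; both readings are assumptions.

import numpy as np

def salt_and_pepper(image, complexity):
    if np.random.rand() < 0.75:
        return image                               # 75% chance of not being applied
    out = image.copy()
    mask = np.random.rand(*image.shape) < 0.2 * complexity   # selected pixels
    out[mask] = np.random.rand(int(mask.sum()))    # U[0,1] noise on selected pixels
    return out
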
 
 {\bf Spatially Gaussian Noise.}
 Different regions of the image are spatially smoothed.
@@ -306,7 +296,7 @@
 computed from the following element-wise operation: $\frac{image + filtered\,image
   \times mask}{mask+1}$.
 This filter is not applied at all with probability 75\%.
-\vspace*{0mm}
+\vspace*{-1mm}
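
A loose sketch of the spatially localized smoothing, blended with the element-wise formula above; the Gaussian-bump mask, the number of centres and the blur widths are assumptions, since the mask construction is outside this hunk.

import numpy as np
from scipy import ndimage

def spatially_gaussian_noise(image, complexity, n_centres=3):
    if np.random.rand() < 0.75:
        return image                               # 75% chance of not being applied
    height, width = image.shape
    filtered = ndimage.gaussian_filter(image, sigma=1.0 + 2.0 * complexity)
    # mask of Gaussian bumps at random centres (assumed construction, width 5 pixels)
    ys, xs = np.mgrid[0:height, 0:width]
    mask = np.zeros_like(image, dtype=float)
    for _ in range(n_centres):
        cy, cx = np.random.randint(height), np.random.randint(width)
        mask += np.exp(-((ys - cy) ** 2 + (xs - cx) ** 2) / (2.0 * 5.0 ** 2))
    # element-wise blend as in the text: (image + filtered*mask) / (mask + 1)
    return (image + filtered * mask) / (mask + 1.0)
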
 
 {\bf Scratches.}
 The scratches module places line-like white patches on the image.  The
@@ -322,7 +312,7 @@
 cases, two patches are generated, and otherwise three patches are
 generated. The patch is applied by taking the maximal value on any given
 patch or the original image, for each of the $32 \times 32$ pixel locations.
-\vspace*{0mm}
+\vspace*{-1mm}
 
 {\bf Grey Level and Contrast Changes.}
 This filter changes the contrast and may invert the image polarity (white
@@ -487,6 +477,7 @@
 and $0.1$ was then selected for optimizing on the whole training sets.
 
 \begin{figure}[ht]
+\vspace*{-2mm}
 \centerline{\resizebox{0.8\textwidth}{!}{\includegraphics{images/denoising_autoencoder_small.pdf}}}
 \caption{Illustration of the computations and training criterion for the denoising
 auto-encoder used to pre-train each layer of the deep architecture. Input $x$ of
@@ -497,6 +488,7 @@
 $L_H(x,z)$, whose expected value is approximately minimized during training
 by tuning $\theta$ and $\theta'$.}
 \label{fig:da}
+\vspace*{-2mm}
 \end{figure}
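
A compact sketch of the computation and criterion the figure caption describes for one denoising auto-encoder layer: corrupt $x$, encode with $\theta=(W,b)$, decode with $\theta'=(W',b')$, and measure the cross-entropy reconstruction loss $L_H(x,z)$; masking corruption and untied weights are assumptions.

import numpy as np

def sigmoid(a):
    return 1.0 / (1.0 + np.exp(-a))

def dae_loss(x, W, b, W_prime, b_prime, corruption_level, rng=np.random):
    # corrupt the input (masking noise assumed: randomly zero out entries)
    x_tilde = x * (rng.rand(*x.shape) > corruption_level)
    h = sigmoid(x_tilde @ W + b)               # hidden code, parameters theta
    z = sigmoid(h @ W_prime + b_prime)         # reconstruction, parameters theta'
    # cross-entropy reconstruction loss L_H(x, z)
    return -np.sum(x * np.log(z) + (1 - x) * np.log(1 - z))
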
 
 {\bf Stacked Denoising Auto-Encoders (SDA).}
@@ -543,6 +535,7 @@
 \vspace*{-1mm}
 
 \begin{figure}[ht]
+\vspace*{-2mm}
 \centerline{\resizebox{.99\textwidth}{!}{\includegraphics{images/error_rates_charts.pdf}}}
 \caption{Error bars indicate a 95\% confidence interval. 0 indicates that the model was trained
 on NIST, 1 on NISTP, and 2 on P07. Left: overall results
@@ -552,7 +545,7 @@
 respectively based on ART, nearest neighbors, MLPs, and SVMs.}
 
 \label{fig:error-rates-charts}
-\vspace*{-1mm}
+\vspace*{-2mm}
 \end{figure}