changeset 438:a6d339033d03

added AMT
author Yoshua Bengio <bengioy@iro.umontreal.ca>
date Mon, 03 May 2010 07:46:18 -0400
parents 479f2f518fc9
children 5ca2936f2062
files writeup/ml.bib writeup/techreport.tex
diffstat 2 files changed, 54 insertions(+), 7 deletions(-)
--- a/writeup/ml.bib	Mon May 03 06:17:54 2010 -0400
+++ b/writeup/ml.bib	Mon May 03 07:46:18 2010 -0400
@@ -25727,3 +25727,27 @@
 }
 
 
+@inproceedings{SnowEtAl2008,
+    author = {Snow, R. and O'Connor, B. and Jurafsky, D. and Ng, A.},
+    booktitle = {Proc. Empirical Methods in NLP},
+    pages = {254--263},
+    title = {Cheap and Fast -- But is it Good? Evaluating Non-Expert Annotations for Natural Language Tasks},
+    year = {2008}
+}
+
+
+@inproceedings{SorokinAndForsyth2008,
+    author = {Sorokin, A. and Forsyth, D.},
+    booktitle = {CVPR Workshops},
+    pages = {1--8},
+    title = {Utility data annotation with Amazon Mechanical Turk},
+    year = {2008}
+}
+
+@inproceedings{whitehill09,
+    author = {Whitehill, J. and Ruvolo, P. and Wu, T. and Bergsma, J. and Movellan, J.},
+    booktitle = {NIPS 22},
+    pages = {2035--2043},
+    title = {Whose Vote Should Count More: Optimal Integration of Labels from Labelers of Unknown Expertise},
+    year = {2009}
+}
--- a/writeup/techreport.tex	Mon May 03 06:17:54 2010 -0400
+++ b/writeup/techreport.tex	Mon May 03 07:46:18 2010 -0400
@@ -31,7 +31,10 @@
 We find that the SDA outperforms its
 shallow counterpart, an ordinary Multi-Layer Perceptron,
 and that it is better able to take advantage of the additional
-generated data.
+generated data, and better able to exploit training on more
+character classes than those of ultimate interest.
+In fact, we find that the SDA reaches human-level performance,
+as estimated via Amazon Mechanical Turk, on the NIST test characters.
 \end{abstract}
 
 \section{Introduction}
@@ -325,16 +328,36 @@
 
 \section{Experimental Results}
 
-\subsection{SDA vs MLP}
+\subsection{SDA vs MLP vs Humans}
 
+Here we compare the best MLP we found (selected according to validation
+set error) against the best SDA (selected the same way), along with an
+estimate of human performance obtained via Amazon's Mechanical Turk (AMT)
+service\footnote{http://mturk.com}. AMT users are paid small amounts
+of money to perform tasks for which human intelligence is required.
+Mechanical Turk has been used extensively in natural language
+processing \cite{SnowEtAl2008} and vision
+\cite{SorokinAndForsyth2008,whitehill09}. AMT users were presented
+with 10 character images and asked to type the 10 corresponding ASCII
+characters; hence they were forced to make a hard choice among the
+62 character classes. Three users classified each image, allowing us
+to estimate inter-human variability (shown as $\pm$ in parentheses below).
+
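+To make this estimate concrete, the following sketch (illustrative only:
+the toy data, variable names, and the per-labeler averaging are assumptions,
+not the exact scripts used for the experiments) shows how a mean human error
+rate and the $\pm$ spread across the three labelers could be computed:
+\begin{verbatim}
+import numpy as np
+
+# Toy data, for illustration only: the true class of each image and
+# the three characters typed by the three AMT workers for that image.
+labels  = ['a', 'B', '3', 'q', '7']
+answers = [['a', 'a', 'o'],
+           ['B', '8', 'B'],
+           ['3', '3', '3'],
+           ['q', 'g', 'q'],
+           ['7', '7', '1']]
+
+def human_error_estimate(labels, answers):
+    """Mean error rate over the three labelers and its spread
+    (the +/- reported in the table)."""
+    n_workers = len(answers[0])
+    per_labeler = [np.mean([ans[w] != lab
+                            for lab, ans in zip(labels, answers)])
+                   for w in range(n_workers)]
+    return np.mean(per_labeler), np.std(per_labeler)
+
+mean_err, spread = human_error_estimate(labels, answers)
+print("human error: %.1f%% +/- %.1f%%" % (100*mean_err, 100*spread))
+\end{verbatim}
+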
+\begin{table}
+\caption{Overall comparison of error rates on the 62 character classes (10 digits +
+26 lower-case + 26 upper-case letters), except for the last column, which is on digits only,
+between the deep architecture with pre-training (SDA = Stacked Denoising Autoencoder),
+the ordinary shallow architecture (MLP = Multi-Layer Perceptron), and humans (via AMT).}
+\label{tab:sda-vs-mlp-vs-humans}
 \begin{center}
-\begin{tabular}{lcc}
-      & train w/   & train w/    \\
-      & NIST       & P07 + NIST  \\ \hline 
-SDA   &            &             \\ \hline 
-MLP   &            &             \\ \hline 
+\begin{tabular}{|l|r|r|r|r|} \hline
+      & NIST test & NISTP test & P07 test  & NIST test digits   \\ \hline
+Humans&            &           &   & \\ \hline 
+SDA   &            &           &  &\\ \hline 
+MLP   &            &           &  & \\ \hline 
 \end{tabular}
 \end{center}
+\end{table}
 
 \subsection{Perturbed Training Data More Helpful for SDAE}