# HG changeset patch
# User Dumitru Erhan
# Date 1275416137 25200
# Node ID 460a4e78c9a411e09c5784d387364f258ab3cd5e
# Parent  0a5945249f2b02fe9bf307540170b857f29963d0
# Parent  092dae9a50405283f19a965cc4fecebd5180317c
merging is fun, merging is fun, merging is fun

diff -r 0a5945249f2b -r 460a4e78c9a4 writeup/Makefile
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/writeup/Makefile	Tue Jun 01 11:15:37 2010 -0700
@@ -0,0 +1,9 @@
+all: nips2010_submission.pdf
+
+nips2010_submission.pdf: nips2010_submission.tex
+	pdflatex nips2010_submission.tex
+	pdflatex nips2010_submission.tex
+	bibtex -min-crossrefs=999 nips2010_submission
+	pdflatex nips2010_submission.tex
+	pdflatex nips2010_submission.tex
+	bibtex -min-crossrefs=999 nips2010_submission
\ No newline at end of file
diff -r 0a5945249f2b -r 460a4e78c9a4 writeup/ift6266_ml.bib
--- a/writeup/ift6266_ml.bib	Tue Jun 01 11:14:48 2010 -0700
+++ b/writeup/ift6266_ml.bib	Tue Jun 01 11:15:37 2010 -0700
@@ -17853,7 +17853,7 @@
 @InProceedings{ranzato-07-small,
   author =    "M. Ranzato and C. Poultney and S. Chopra and Y. {LeCun}",
-  booktitle = "NIPS 19",
+  booktitle = "NIPS'06",
   title =     "Efficient Learning of Sparse Representations with an Energy-Based Model",
   year =      "2007",
@@ -25724,6 +25724,16 @@
   issn = "0162-8828",
 }
 
+@Article{Oliveira+al-2002-short,
+  author =  "Oliveira, L.S. and Sabourin, R. and Bortolozzi, F. and Suen, C.Y.",
+  title =   "Automatic recognition of handwritten numerical strings: a recognition and verification strategy",
+  journal = ieeetpami,
+  volume =  "24",
+  number =  "11",
+  pages =   "1438--1454",
+  year =    "2002",
+}
+
 @inproceedings{SimardSP03,
   author = {Patrice Simard and
             David Steinkraus and
diff -r 0a5945249f2b -r 460a4e78c9a4 writeup/nips2010_submission.tex
--- a/writeup/nips2010_submission.tex	Tue Jun 01 11:14:48 2010 -0700
+++ b/writeup/nips2010_submission.tex	Tue Jun 01 11:15:37 2010 -0700
@@ -90,6 +90,10 @@
 of {\em out-of-distribution} examples and of the multi-task setting
 (but see~\citep{CollobertR2008}). In particular, the {\em relative
 advantage} of deep learning for this setting has not been evaluated.
+The hypothesis explored here is that a deep hierarchy of features
+may be better able to provide sharing of statistical strength
+between different regions in input space or different tasks,
+as discussed in the conclusion.
 % TODO: why we care to evaluate this relative advantage
@@ -320,7 +324,7 @@
 \vspace*{-1mm}
 Whereas much previous work on deep learning algorithms had been performed on
-the MNIST digits classification task~\citep{Hinton06,ranzato-07,Bengio-nips-2006,Salakhutdinov+Hinton-2009},
+the MNIST digits classification task~\citep{Hinton06,ranzato-07-small,Bengio-nips-2006,Salakhutdinov+Hinton-2009},
 with 60~000 examples, and variants involving 10~000
 examples~\citep{Larochelle-jmlr-toappear-2008,VincentPLarochelleH2008}, we want
 to focus here on the case of much larger training sets, from 10 times to
@@ -356,12 +360,12 @@
 {\bf NIST.}
 Our main source of characters is the NIST Special Database 19~\citep{Grother-1995},
 widely used for training and testing character
-recognition systems~\citep{Granger+al-2007,Cortes+al-2000,Oliveira+al-2002,Milgram+al-2005}.
+recognition systems~\citep{Granger+al-2007,Cortes+al-2000,Oliveira+al-2002-short,Milgram+al-2005}.
 The dataset is composed of 814255 digits and characters (upper and lower
 case), with hand-checked classifications, extracted from handwritten sample
 forms of 3600 writers.
 The characters are labelled by one of the 62 classes corresponding to
 "0"-"9", "A"-"Z" and "a"-"z". The dataset contains 8 series of different
 complexity. The fourth series, $hsf_4$, experimentally recognized to be
 the most difficult one, is recommended
-by NIST as testing set and is used in our work and some previous work~\citep{Granger+al-2007,Cortes+al-2000,Oliveira+al-2002,Milgram+al-2005}
+by NIST as testing set and is used in our work and some previous work~\citep{Granger+al-2007,Cortes+al-2000,Oliveira+al-2002-short,Milgram+al-2005}
 for that purpose. We randomly split the remainder into a training set and
 a validation set for model selection. The sizes of these data sets are:
 651668 for training, 80000 for validation, and 82587 for testing.
@@ -450,7 +454,7 @@
 {\bf Stacked Denoising Auto-Encoders (SDA).}
 Various auto-encoder variants and Restricted Boltzmann Machines (RBMs)
 can be used to initialize the weights of each layer of a deep MLP (with many hidden
-layers)~\citep{Hinton06,ranzato-07,Bengio-nips-2006}
+layers)~\citep{Hinton06,ranzato-07-small,Bengio-nips-2006}
 enabling better generalization, apparently by setting parameters in the
 basin of attraction of supervised gradient descent yielding better
 generalization~\citep{Erhan+al-2010}. It is hypothesized that the
@@ -498,7 +502,7 @@
 SDA2), along with the previous results on the digits NIST special database
 19 test set from the literature, respectively based on ARTMAP neural
 networks ~\citep{Granger+al-2007}, fast nearest-neighbor search
-~\citep{Cortes+al-2000}, MLPs ~\citep{Oliveira+al-2002}, and SVMs
+~\citep{Cortes+al-2000}, MLPs ~\citep{Oliveira+al-2002-short}, and SVMs
 ~\citep{Milgram+al-2005}. More detailed and complete numerical results
 (figures and tables, including standard errors on the error rates) can be
 found in the supplementary material. The 3 kinds of model differ in the
@@ -543,7 +547,7 @@
 of all models, on 3 different test sets corresponding to the three
 datasets.
 Right: error rates on NIST test digits only, along with the previous results from
-literature~\citep{Granger+al-2007,Cortes+al-2000,Oliveira+al-2002,Milgram+al-2005}
+literature~\citep{Granger+al-2007,Cortes+al-2000,Oliveira+al-2002-short,Milgram+al-2005}
 respectively based on ART, nearest neighbors, MLPs, and SVMs.}
 \label{fig:error-rates-charts}
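
As a quick consistency check on the NIST split quoted above, the three subsets account for the whole database: 651668 + 80000 + 82587 = 814255 examples.

The SDA paragraph in the patch describes greedy layer-wise pre-training: each layer of a deep MLP is first trained as a denoising auto-encoder (DAE) on the representations produced by the layer below, and the learned weights then initialize the supervised network for fine-tuning by gradient descent. The sketch below illustrates that scheme in plain NumPy; it is not the paper's implementation, and the masking-noise corruption, tied weights, sigmoid units, cross-entropy reconstruction loss, and every size and hyper-parameter are illustrative assumptions.

# Minimal sketch of greedy layer-wise DAE pre-training, as described in
# the SDA paragraph of the patch.  All choices (masking noise, tied
# weights, sigmoid units, cross-entropy loss, sizes, learning rate,
# epochs) are illustrative assumptions, not the authors' settings.
import numpy as np

rng = np.random.default_rng(0)

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def pretrain_dae_layer(X, n_hidden, corruption=0.3, lr=0.1, epochs=5):
    """Train one DAE layer on inputs X; return (W, b) and the clean codes."""
    n_in = X.shape[1]
    W = rng.normal(0.0, 0.01, size=(n_in, n_hidden))  # encoder weights
    b = np.zeros(n_hidden)                            # hidden-unit biases
    c = np.zeros(n_in)                                # reconstruction biases
    for _ in range(epochs):
        # Corrupt the input by zeroing a random fraction of its entries.
        Xc = X * (rng.random(X.shape) > corruption)
        H = sigmoid(Xc @ W + b)        # encode the corrupted input
        R = sigmoid(H @ W.T + c)       # decode with tied weights
        # Backprop the cross-entropy reconstruction loss against the
        # *uncorrupted* input X (the defining trait of a DAE).
        dR = (R - X) / len(X)          # gradient at decoder pre-activation
        dH = (dR @ W) * H * (1.0 - H)  # gradient at encoder pre-activation
        W -= lr * (Xc.T @ dH + dR.T @ H)  # W appears in encoder and decoder
        b -= lr * dH.sum(axis=0)
        c -= lr * dR.sum(axis=0)
    return W, b, sigmoid(X @ W + b)    # clean codes feed the next layer

# Stack greedily: each layer trains on the codes of the layer below.
X = rng.random((200, 32 * 32))         # stand-in for 32x32 character images
stacked, inp = [], X
for n_hidden in (500, 500, 500):       # illustrative 3-layer stack
    W, b, inp = pretrain_dae_layer(inp, n_hidden)
    stacked.append((W, b))

In the full pipeline the collected (W, b) pairs would initialize the hidden layers of a deep MLP, which is then fine-tuned with supervised gradient descent on the 62-class labels, matching the description in the quoted hunks.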