comparison writeup/mlj_submission.tex @ 587:b1be957dd1be

Added mlj_submission to group every file needed for the MLJ submission.
author fsavard
date Thu, 30 Sep 2010 17:51:02 -0400
parents 4933077b8676
children 9a6abcf143e8
comparing 585:4933077b8676 with 587:b1be957dd1be
(lines removed in 587 are prefixed with "-", lines added with "+"; unchanged context is shown unprefixed; "..." marks lines elided from the view)
-\documentclass{article} % For LaTeX2e
+\RequirePackage{fix-cm} % from template
+
+%\documentclass{article} % For LaTeX2e
+\documentclass[smallcondensed]{svjour3} % onecolumn (ditto)
+
\usepackage{times}
\usepackage{wrapfig}
-\usepackage{amsthm,amsmath,bbm}
+%\usepackage{amsthm} % not to be used with springer tools
+\usepackage{amsmath}
+\usepackage{bbm}
\usepackage[psamsfonts]{amssymb}
-\usepackage{algorithm,algorithmic}
+%\usepackage{algorithm,algorithmic} % not used after all
\usepackage[utf8]{inputenc}
\usepackage{graphicx,subfigure}
-\usepackage[numbers]{natbib}
+\usepackage{natbib} % was [numbers]{natbib}

\addtolength{\textwidth}{10mm}
\addtolength{\evensidemargin}{-5mm}
\addtolength{\oddsidemargin}{-5mm}

%\setlength\parindent{0mm}

\title{Deep Self-Taught Learning for Handwritten Character Recognition}
\author{
+Yoshua Bengio \and
Frédéric Bastien \and
-Yoshua Bengio \and
Arnaud Bergeron \and
Nicolas Boulanger-Lewandowski \and
Thomas Breuel \and
Youssouf Chherawala \and
Moustapha Cisse \and
...
Salah Rifai \and
Francois Savard \and
Guillaume Sicard
}
\date{September 30th, submission to MLJ special issue on learning from multi-label data}
+\journalname{Machine Learning Journal}
+\institute{Frédéric Bastien \and \\
+Yoshua Bengio \and \\
+Arnaud Bergeron \and \\
+Nicolas Boulanger-Lewandowski \and \\
+Youssouf Chherawala \and \\
+Moustapha Cisse \and \\
+Myriam Côté \and \\
+Dumitru Erhan \and \\
+Jeremy Eustache \and \\
+Xavier Glorot \and \\
+Xavier Muller \and \\
+Sylvain Pannetier-Lebeuf \and \\
+Razvan Pascanu \and \\
+Salah Rifai \and \\
+Francois Savard \and \\
+Guillaume Sicard \at
+Dept. IRO, Universite de Montreal, C.P. 6128, Montreal, QC, H3C 3J7, Canada\\
+\email{yoshua.bengio@umontreal.ca}
+\and
+Thomas Breuel \at
+Department of Computer Science, University of Kaiserslautern, Postfach 3049, 67653 Kaiserslautern, Germany
+}
+

\begin{document}

%\makeanontitle
\maketitle
...
%\vspace*{-2mm}
\begin{abstract}
Recent theoretical and empirical work in statistical machine learning has demonstrated the importance of learning algorithms for deep architectures, i.e., function classes obtained by composing multiple non-linear transformations. Self-taught learning (exploiting unlabeled examples or examples from other distributions) has already been applied to deep learners, but mostly to show the advantage of unlabeled examples. Here we explore the advantage brought by {\em out-of-distribution examples}. For this purpose we developed a powerful generator of stochastic variations and noise processes for character images, including not only affine transformations but also slant, local elastic deformations, changes in thickness, background images, grey level changes, contrast, occlusion, and various types of noise. The out-of-distribution examples are obtained from these highly distorted images or by including examples of object classes different from those in the target test set. We show that {\em deep learners benefit more from out-of-distribution examples than a corresponding shallow learner}, at least in the area of handwritten character recognition. In fact, we show that they beat previously published results and reach human-level performance on both handwritten digit classification and 62-class handwritten character recognition.
\end{abstract}
%\vspace*{-3mm}

Keywords: self-taught learning, multi-task learning, out-of-distribution examples, deep learning, handwriting recognition.

\section{Introduction}
%\vspace*{-1mm}

{\bf Deep Learning} has emerged as a promising new area of research in
-statistical machine learning (see~\citet{Bengio-2009} for a review).
+statistical machine learning (see \citet{Bengio-2009} for a review).
Learning algorithms for deep architectures are centered on the learning
of useful representations of data, which are better suited to the task at hand,
and are organized in a hierarchy with multiple levels.
This is in part inspired by observations of the mammalian visual cortex,
which consists of a chain of processing elements, each of which is associated with a
different representation of the raw visual input. In fact,
it was found recently that the features learnt in deep architectures resemble
those observed in the first two of these stages (in areas V1 and V2
-of visual cortex)~\citep{HonglakL2008}, and that they become more and
+of visual cortex) \citep{HonglakL2008}, and that they become more and
more invariant to factors of variation (such as camera movement) in
higher layers~\citep{Goodfellow2009}.
Learning a hierarchy of features increases the
ease and practicality of developing representations that are at once
tailored to specific tasks, yet are able to borrow statistical strength
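Outside the line-by-line view, the preamble edits above amount to adopting Springer's svjour3 front matter. The skeleton below is a minimal, hypothetical illustration of that pattern, with placeholder title, authors, and addresses; it assumes svjour3.cls from Springer's LaTeX template is available, and it compresses the submission's \institute block (which lists every Université de Montréal author before a single \at address) down to two authors.

% Minimal svjour3 skeleton illustrating the front matter adopted in this changeset.
% Placeholder names and addresses; assumes svjour3.cls (Springer template) is installed.
\RequirePackage{fix-cm}                 % load before the class, as the template requires
\documentclass[smallcondensed]{svjour3} % one-column Springer journal layout
\usepackage[utf8]{inputenc}

\journalname{Machine Learning Journal}
\title{Example Title}
\author{First Author \and Second Author}
\institute{First Author \at
           Example University, Example City, Country \\
           \email{first.author@example.org}
           \and
           Second Author \at
           Another Institute, Another City, Country}
\date{Received: date / Accepted: date}

\begin{document}
\maketitle
\begin{abstract}
Placeholder abstract.
\end{abstract}
\end{document}

Note that fix-cm is loaded with \RequirePackage before \documentclass, as in the template, and that amsthm is commented out because, per the changeset's own comment, it is not to be used with the Springer tools (the algorithm packages are dropped simply because they were no longer used).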
...
basins of attraction are not discovered by pure supervised learning
(with or without self-taught settings), and more labeled examples
does not allow the model to go from the poorer basins of attraction discovered
by the purely supervised shallow models to the kind of better basins associated
with deep learning and self-taught learning.

A Flash demo of the recognizer (where both the MLP and the SDA can be compared)
can be executed on-line at {\tt http://deep.host22.com}.


\section*{Appendix I: Detailed Numerical Results}
...
\end{table}

%\afterpage{\clearpage}
\clearpage
{
+\bibliographystyle{spbasic} % basic style, author-year citations
\bibliography{strings,strings-short,strings-shorter,ift6266_ml,specials,aigaion-shorter}
%\bibliographystyle{plainnat}
-\bibliographystyle{unsrtnat}
+%\bibliographystyle{unsrtnat}
%\bibliographystyle{apalike}
}


\end{document}
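The other substantive change is the switch to author-year citations: natbib is now loaded without the [numbers] option and \bibliographystyle{spbasic} (Springer's basic author-year style) replaces unsrtnat. The fragment below is only a sketch of how the existing \citet and \citep calls resolve under that setup; the rendered form shown is approximate and assumes spbasic.bst is installed and that Bengio-2009 resolves to an entry in the listed .bib files.

% Author-year citations with natbib + spbasic, as configured in this changeset.
% The rendered form in the comment is approximate; assumes spbasic.bst is installed.
\usepackage{natbib}          % without [numbers], natbib stays in author-year mode

% In the text (e.g., the introduction above):
%   see \citet{Bengio-2009} for a review   ->   roughly "see Bengio (2009) for a review"

% At the end of the document:
\bibliographystyle{spbasic}
\bibliography{strings,strings-short,strings-shorter,ift6266_ml,specials,aigaion-shorter}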