comparison writeup/aigaion-shorter.bib @ 498:7ff00c27c976

add missing file for bibtex and make it smaller.
author Frederic Bastien <nouiz@nouiz.org>
date Tue, 01 Jun 2010 11:36:41 -0400
parents
children ae77edb9df67
comparison
equal deleted inserted replaced
497:109ebd3e67c0 498:7ff00c27c976
1 %Aigaion2 BibTeX export from LISA - Publications
2 %Tuesday 01 June 2010 10:46:52 AM
3 @INPROCEEDINGS{Attardi+al-2009,
4 author = {Attardi, Giuseppe and Dell'Orletta, Felice and Simi, Maria and Turian, Joseph},
5 keywords = {classifier, dependency parsing, natural language, parser, perceptron},
6 title = {Accurate Dependency Parsing with a Stacked Multilayer Perceptron},
7 booktitle = {Proceedings of Evalita 2009},
8 series = {LNCS},
9 year = {2009},
10 publisher = {Springer},
11 abstract = {Abstract. DeSR is a statistical transition-based dependency parser which learns from annotated corpora which actions to perform for building parse trees while scanning a sentence. We describe recent improvements to the parser, in particular stacked parsing, exploiting a beam search strategy and using a Multilayer Perceptron classifier. For the Evalita 2009 Dependency Parsing task DeSR was configured to use a combination of stacked parsers. The stacked combination achieved the best accuracy scores in both the main and pilot subtasks. The contribution to the result of various choices is analyzed, in particular for taking advantage of the peculiar features of the TUT Treebank.}
12 }
13
14 @INPROCEEDINGS{Bengio+al-2009,
15 author = {Bengio, Yoshua and Louradour, Jerome and Collobert, Ronan and Weston, Jason},
16 title = {Curriculum Learning},
17 year = {2009},
18 crossref = {ICML09-shorter},
19 abstract = {Humans and animals learn much better when the examples are not randomly presented but organized in a meaningful order which illustrates gradually more concepts, and more complex ones. Here, we formalize such training strategies in the context of machine learning, and call them 'curriculum learning'. In the context of recent research studying the difficulty of training in the presence of non-convex training criteria (for deep deterministic and stochastic neural networks), we explore curriculum learning in various set-ups. The experiments show that significant improvements in generalization can be achieved by using a particular curriculum, i.e., the selection and order of training examples. We hypothesize that curriculum learning has both an effect on the speed of convergence of the training process to a minimum and, in the case of non-convex criteria, on the quality of the local minima obtained: curriculum learning can be seen as a particular form of continuation method (a general strategy for global optimization of non-convex functions).}
20 }
21
22 @TECHREPORT{Bengio+al-2009-TR,
23 author = {Bengio, Yoshua and Louradour, Jerome and Collobert, Ronan and Weston, Jason},
24 title = {Curriculum Learning},
25 number = {1330},
26 year = {2009},
27 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
28 abstract = {Humans and animals learn much better when the examples are not randomly presented but organized in a meaningful order which illustrates gradually more concepts, and gradually more complex ones. Here, we formalize such training strategies in the context of machine learning, and call them 'curriculum learning'. In the context of recent research studying the difficulty of training in the presence of non-convex training criteria (for deep deterministic and stochastic neural networks), we explore curriculum learning in various set-ups. The experiments show that significant improvements in generalization can be achieved. We hypothesize that curriculum learning has both an effect on the speed of convergence of the training process to a minimum and, in the case of non-convex criteria, on the quality of the local minima obtained: curriculum learning can be seen as a particular form of continuation method (a general strategy for global optimization of non-convex functions).}
29 }
30
31 @MISC{Bengio+al-patent-2000,
32 author = {Bengio, Yoshua and Bottou, {L{\'{e}}on} and {LeCun}, Yann},
33 title = {Module for constructing trainable modular network in which each module outputs and inputs data structured as a graph},
34 year = {2000},
35 howpublished = {U.S. Patent 6,128,606, October 3}
36 }
37
38 @MISC{Bengio+al-patent-2001,
39 author = {Bengio, Yoshua and Bottou, {L{\'{e}}on} and Howard, Paul G.},
40 title = {Z-Coder : a fast adaptive binary arithmetic coder},
41 year = {2001},
42 howpublished = {U.S. Patent 6,188,334, February 13, 2001, along with patents 6,225,925, 6,281,817, and 6,476,740}
43 }
44
45 @MISC{Bengio+al-patent-94,
46 author = {Bengio, Yoshua and {LeCun}, Yann and Nohl, Craig and Burges, Chris},
47 title = {Visitor Registration System Using Automatic Handwriting Recognition},
48 year = {1994},
49 howpublished = {Patent submitted in the U.S.A. in October 1994, submission number 1-16-18-1}
50 }
51
52 @INCOLLECTION{Bengio+al-spectral-2006,
53 author = {Bengio, Yoshua and Delalleau, Olivier and Le Roux, Nicolas and Paiement, Jean-Fran{\c c}ois and Vincent, Pascal and Ouimet, Marie},
54 editor = {Guyon, Isabelle and Gunn, Steve and Nikravesh, Masoud and Zadeh, Lotfi},
55 title = {Spectral Dimensionality Reduction},
56 booktitle = {Feature Extraction, Foundations and Applications},
57 year = {2006},
58 publisher = {Springer},
59 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/eigenfn_chapter.pdf},
60 abstract = {In this chapter, we study and put under a common framework a number
61 of non-linear dimensionality reduction methods, such as Locally Linear Embedding,
62 Isomap, Laplacian eigenmaps and kernel {PCA}, which are based
63 on performing an eigen-decomposition (hence the name "spectral"). That
64 framework also includes classical methods such as {PCA} and metric multidimensional
65 scaling ({MDS}). It also includes the data transformation step used
66 in spectral clustering. We show that in all of these cases the learning algorithm
67 estimates the principal eigenfunctions of an operator that depends on
68 the unknown data density and on a kernel that is not necessarily positive
69 semi-definite. This helps to generalize some of these algorithms so as to predict
70 an embedding for out-of-sample examples without having to retrain the
71 model. It also makes it more transparent what these algorithm are minimizing
72 on the empirical data and gives a corresponding notion of generalization
73 error.},
74 cat={B},topics={HighDimensional,Kernel,Unsupervised},
75 }
76
77 @INCOLLECTION{Bengio+al-ssl-2006,
78 author = {Bengio, Yoshua and Delalleau, Olivier and Le Roux, Nicolas},
79 editor = {Chapelle, Olivier and {Sch{\"{o}}lkopf}, Bernhard and Zien, Alexander},
80 title = {Label Propagation and Quadratic Criterion},
81 booktitle = {Semi-Supervised Learning},
82 year = {2006},
83 pages = {193--216},
84 publisher = {{MIT} Press},
85 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio_ssl.pdf},
86 abstract = {Various graph-based algorithms for semi-supervised learning have been proposed in
87 the recent literature. They rely on the idea of building a graph whose nodes are
88 data points (labeled and unlabeled) and edges represent similarities between points.
89 Known labels are used to propagate information through the graph in order to label
90 all nodes. In this chapter, we show how these different algorithms can be cast into
91 a common framework where one minimizes a quadratic cost criterion whose closed-form solution is found by solving a linear system of size n (total number of data
92 points). The cost criterion naturally leads to an extension of such algorithms to
93 the inductive setting, where one obtains test samples one at a time: the derived
94 induction formula can be evaluated in O(n) time, which is much more efficient
95 than solving again exactly the linear system (which in general costs O(kn2) time
96 for a sparse graph where each data point has k neighbors). We also use this inductive
97 formula to show that when the similarity between points satisfies a locality property,
98 then the algorithms are plagued by the curse of dimensionality, with respect to the
99 dimensionality of an underlying manifold.},
100 cat={B},topics={Unsupervised},
101 }
102
103 @TECHREPORT{Bengio+al-treecurse-2007,
104 author = {Bengio, Yoshua and Delalleau, Olivier and Simard, Clarence},
105 title = {Decision Trees do not Generalize to New Variations},
106 number = {1304},
107 year = {2007},
108 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
109 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio+al-tr1304.pdf}
110 }
111
112 @INPROCEEDINGS{Bengio+Bengio96,
113 author = {Bengio, Samy and Bengio, Yoshua},
114 editor = {Xu, L.},
115 title = {An {EM} Algorithm for Asynchronous Input/Output Hidden {M}arkov Models},
116 booktitle = {International Conference On Neural Information Processing},
117 year = {1996},
118 pages = {328--334},
119 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/iconip96.pdf},
120 abstract = {In learning tasks in which input sequences are mapped to output sequences, it is often the case that the input and output sequences are not synchronous. For example, in speech recognition, acoustic sequences are longer than phoneme sequences. Input/Output Hidden {Markov} Models have already been proposed to represent the distribution of an output sequence given an input sequence of the same length. We extend here this model to the case of asynchronous sequences, and show an Expectation-Maximization algorithm for training such models.},
121 topics={Markov},cat={C},
122 }
123
124 @INCOLLECTION{Bengio+chapter2007,
125 author = {Bengio, Yoshua and {LeCun}, Yann},
126 editor = {Bottou, {L{\'{e}}on} and Chapelle, Olivier and DeCoste, D. and Weston, J.},
127 title = {Scaling Learning Algorithms towards {AI}},
128 booktitle = {Large Scale Kernel Machines},
129 year = {2007},
130 publisher = {MIT Press},
131 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio+lecun_chapter2007.pdf},
132 abstract = {One long-term goal of machine learning research is to produce methods that
133 are applicable to highly complex tasks, such as perception (vision, audition), reasoning,
134 intelligent control, and other artificially intelligent behaviors. We argue
135 that in order to progress toward this goal, the Machine Learning community must
136 endeavor to discover algorithms that can learn highly complex functions, with minimal
137 need for prior knowledge, and with minimal human intervention. We present
138 mathematical and empirical evidence suggesting that many popular approaches
139 to non-parametric learning, particularly kernel methods, are fundamentally limited
140 in their ability to learn complex high-dimensional functions. Our analysis
141 focuses on two problems. First, kernel machines are shallow architectures, in
142 which one large layer of simple template matchers is followed by a single layer
143 of trainable coefficients. We argue that shallow architectures can be very inefficient
144 in terms of required number of computational elements and examples. Second,
145 we analyze a limitation of kernel machines with a local kernel, linked to the
146 curse of dimensionality, that applies to supervised, unsupervised (manifold learning)
147 and semi-supervised kernel machines. Using empirical results on invariant
148 image recognition tasks, kernel methods are compared with deep architectures, in
149 which lower-level features or concepts are progressively combined into more abstract
150 and higher-level representations. We argue that deep architectures have the
151 potential to generalize in non-local ways, i.e., beyond immediate neighbors, and
152 that this is crucial in order to make progress on the kind of complex tasks required
153 for artificial intelligence.},
154 cat={B},topics={HighDimensional},
155 }
156
157 @ARTICLE{Bengio+Delalleau-2009,
158 author = {Bengio, Yoshua and Delalleau, Olivier},
159 title = {Justifying and Generalizing Contrastive Divergence},
160 journal = {Neural Computation},
161 volume = {21},
162 number = {6},
163 year = {2009},
164 pages = {1601--1621},
165 abstract = {We study an expansion of the log-likelihood in undirected graphical models such as the Restricted {Boltzmann} Machine (RBM), where each term in the expansion is associated with a sample in a Gibbs chain alternating between two random variables (the visible vector and the hidden vector, in RBMs). We are particularly interested in estimators of the gradient of the log-likelihood obtained through this expansion. We show that its residual term converges to zero, justifying the use of a truncation, i.e. running only a short Gibbs chain, which is the main idea behind the Contrastive Divergence (CD) estimator of the log-likelihood gradient. By truncating even more, we obtain a stochastic reconstruction error, related through a mean-field approximation to the reconstruction error often used to train autoassociators and stacked auto-associators. The derivation is not specific to the particular parametric forms used in RBMs, and only requires convergence of the Gibbs chain. We present theoretical and empirical evidence linking the number of Gibbs steps $k$ and the magnitude of the RBM parameters to the bias in the CD estimator. These experiments also suggest that the sign of the CD estimator is correct most of the time, even when the bias is large, so that CD-$k$ is a good descent direction even for small $k$.}
166 }
167
168 @TECHREPORT{Bengio+Delalleau-TR2007,
169 author = {Bengio, Yoshua and Delalleau, Olivier},
170 keywords = {Contrastive Divergence, Restricted {Boltzmann} Machine},
171 title = {Justifying and Generalizing Contrastive Divergence},
172 number = {1311},
173 year = {2007},
174 institution = {D{\'{e}}partement d'Informatique et Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
175 abstract = {We study an expansion of the log-likelihood in undirected graphical models such as the Restricted {Boltzmann} Machine (RBM), where each term in the expansion is associated with a sample in a Gibbs chain alternating between two random variables (the visible vector and the hidden vector, in RBMs). We are particularly interested in estimators of the gradient of the log-likelihood obtained through this expansion. We show that its terms converge to zero, justifying the use of a truncation, i.e. running only a short Gibbs chain, which is the main idea behind the Contrastive Divergence approximation of the log-likelihood gradient. By truncating even more, we obtain a stochastic reconstruction error, related through a mean-field approximation to the reconstruction error often used to train autoassociators and stacked auto-associators. The derivation is not specific to the particular parametric forms used in RBMs, and only requires convergence of the Gibbs chain.}
176 }
177
178 @INPROCEEDINGS{Bengio+DeMori88,
179 author = {Bengio, Yoshua and De Mori, Renato},
180 title = {Use of neural networks for the recognition of place of articulation},
181 booktitle = {International Conference on Acoustics, Speech and Signal Processing},
182 year = {1988},
183 pages = {103--106},
184 topics={Speech},cat={C},
185 }
186
187 @INPROCEEDINGS{Bengio+DeMori89,
188 author = {Bengio, Yoshua and Cardin, Regis and Cosi, Piero and De Mori, Renato},
189 title = {Speech coding with multi-layer networks},
190 booktitle = {International Conference on Acoustics, Speech and Signal Processing},
191 year = {1989},
192 pages = {164--167},
193 topics={Speech},cat={C},
194 }
195
196 @INCOLLECTION{Bengio+DeMori90a,
197 author = {Bengio, Yoshua and De Mori, Renato},
198 editor = {Sethi, I. K. and Jain, A. K.},
199 title = {Connectionist models and their application to automatic speech recognition},
200 booktitle = {Artificial Neural Networks and Statistical Pattern Recognition: Old and New Connections},
201 year = {1990},
202 pages = {175--192},
203 publisher = {Elsevier, Machine Intelligence and Pattern Recognition Series},
204 topics={Speech},cat={B},
205 }
206
207 @ARTICLE{Bengio+Frasconi-jair95,
208 author = {Bengio, Yoshua and Frasconi, Paolo},
209 title = {Diffusion of Context and Credit Information in {M}arkovian Models},
210 journal = {Journal of Artificial Intelligence Research},
211 volume = {3},
212 year = {1995},
213 pages = {249--270},
214 abstract = {This paper studies the problem of ergodicity of transition probability matrices in {Markovian} models, such as hidden {Markov} models ({HMM}s), and how it makes very difficult the task of learning to represent long-term context for sequential data. This phenomenon hurts the forward propagation of long-term context information, as well as learning a hidden state representation to represent long-term context, which depends on propagating credit information backwards in time. Using results from {Markov} chain theory, we show that this problem of diffusion of context and credit is reduced when the transition probabilities approach 0 or 1, i.e., the transition probability matrices are sparse and the model essentially deterministic. The results found in this paper apply to learning approaches based on continuous optimization, such as gradient descent and the Baum-Welch algorithm.},
215 topics={Markov,LongTerm},cat={J},
216 }
217
218 @INPROCEEDINGS{Bengio+Frasconi-nips7-diffuse,
219 author = {Bengio, Yoshua and Frasconi, Paolo},
220 title = {Diffusion of Credit in {M}arkovian Models},
221 year = {1995},
222 pages = {553--560},
223 crossref = {NIPS7-shorter},
224 abstract = {This paper studies the problem of diffusion in {Markovian} models, such as hidden {Markov} models ({HMM}s) and how it makes very difficult the task of learning of long-term dependencies in sequences. Using results from {Markov} chain theory, we show that the problem of diffusion is reduced if the transition probabilities approach 0 or 1. Under this condition, standard {HMM}s have very limited modeling capabilities, but input/output {HMM}s can still perform interesting computations.},
225 topics={Markov},cat={C},
226 }
227
228 @INPROCEEDINGS{Bengio+Frasconi-nips7-iohmms,
229 author = {Bengio, Yoshua and Frasconi, Paolo},
230 title = {An Input/Output {HMM} Architecture},
231 year = {1995},
232 pages = {427--434},
233 crossref = {NIPS7-shorter},
234 abstract = {We introduce a recurrent architecture having a modular structure and we formulate a training procedure based on the {EM} algorithm. The resulting model has similarities to hidden {Markov} models, but supports recurrent networks processing style and allows to exploit the supervised learning paradigm while using maximum likelihood estimation.},
235 topics={Markov},cat={C},
236 }
237
238 @INPROCEEDINGS{Bengio+Frasconi-nips94,
239 author = {Bengio, Yoshua and Frasconi, Paolo},
240 title = {Credit Assignment through Time: Alternatives to Backpropagation},
241 year = {1994},
242 pages = {75--82},
243 crossref = {NIPS6-shorter},
244 abstract = {Learning to recognize or predict sequences using long-term context has many applications. However, practical and theoretical problems are found in training recurrent neural networks to perform tasks in which input/output dependencies span long intervals. Starting from a mathematical analysis of the problem, we consider and compare alternative algorithms and architectures on tasks for which the span of the input/output dependencies can be controlled. Results on the new algorithms show performance qualitatively superior to that obtained with backpropagation.},
245 topics={LongTerm},cat={C},
246 }
247
248 @ARTICLE{Bengio+Pouliot90,
249 author = {Bengio, Yoshua and Pouliot, Yannick},
250 title = {Efficient recognition of immunoglobulin domains from amino-acid sequences using a neural network},
251 journal = {Computer Applications in the Biosciences},
252 volume = {6},
253 number = {2},
254 year = {1990},
255 pages = {319--324},
256 topics={Bioinformatic,PriorKnowledge},cat={J},
257 }
258
259 @INPROCEEDINGS{Bengio+Senecal-2003,
260 author = {Bengio, Yoshua and S{\'{e}}n{\'{e}}cal, Jean-S{\'{e}}bastien},
261 title = {Quick Training of Probabilistic Neural Nets by Importance Sampling},
262 booktitle = {Proceedings of the conference on Artificial Intelligence and Statistics (AISTATS)},
263 year = {2003},
264 abstract = {Our previous work on statistical language modeling introduced the use of probabilistic feedforward neural networks to help dealing with the curse of dimensionality. Training this model by maximum likelihood however requires for each example to perform as many network passes as there are words in the vocabulary. Inspired by the contrastive divergence model, we propose and evaluate sampling-based methods which require network passes only for the observed "positive example'' and a few sampled negative example words. A very significant speed-up is obtained with an adaptive importance sampling.}
265 }
266
267 @ARTICLE{Bengio+Senecal-2008,
268 author = {Bengio, Yoshua and S{\'{e}}n{\'{e}}cal, Jean-S{\'{e}}bastien},
269 keywords = {Energy-based models, fast training, importance sampling, language modeling, Monte Carlo methods, probabilistic neural networks},
270 title = {Adaptive Importance Sampling to Accelerate Training of a Neural Probabilistic Language Model},
271 journal = {IEEE Transactions on Neural Networks},
272 volume = {19},
273 number = {4},
274 year = {2008},
275 pages = {713--722},
276 abstract = {Previous work on statistical language modeling has shown that it is possible to train a feedforward neural network to approximate probabilities over sequences of words, resulting in significant error reduction when compared to standard baseline models based on n-grams. However, training the neural network model with the maximum-likelihood criterion requires computations proportional to the number of words in the vocabulary. In this paper, we introduce adaptive importance sampling as a way to accelerate training of the model. The idea is to use an adaptive n-gram model to track the conditional distributions produced by the neural network. We show that a very significant speedup can be obtained on standard problems.}
277 }
278
279 @INCOLLECTION{Bengio-2007,
280 author = {Bengio, Yoshua},
281 editor = {Cisek, Paul and Kalaska, John and Drew, Trevor},
282 title = {On the Challenge of Learning Complex Functions},
283 booktitle = {Computational Neuroscience: Theoretical Insights into Brain Function},
284 series = {Progress in Brain Research},
285 year = {2007},
286 publisher = {Elsevier},
287 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/PBR_chapter.pdf},
288 abstract = {A common goal of computational neuroscience and of artificial intelligence
289 research based on statistical learning algorithms is the discovery and
290 understanding of computational principles that could explain what we
291 consider adaptive intelligence, in animals as well as in machines. This
292 chapter focuses on what is required for the learning of complex behaviors. We
293 believe it involves the learning of highly varying functions, in a
294 mathematical sense. We bring forward two types of arguments which convey
295 the message that many currently popular machine learning approaches to
296 learning flexible functions have fundamental limitations that render them
297 inappropriate for learning highly varying functions. The first issue
298 concerns the representation of such functions with what we call shallow model
299 architectures. We discuss limitations of shallow architectures, such as
300 so-called kernel machines, boosting algorithms, and one-hidden-layer artificial neural
301 networks. The second issue is more focused and concerns kernel machines
302 with a local kernel (the type used most often in practice),
303 that act like a collection of template matching units. We present
304 mathematical results on such computational architectures showing that they
305 have a limitation similar to those already proved for older non-parametric
306 methods, and connected to the so-called curse of dimensionality. Though it has long
307 been believed that efficient learning in deep architectures is difficult,
308 recently proposed computational principles for learning in deep architectures
309 may offer a breakthrough.}
310 }
311
312 @ARTICLE{Bengio-2009,
313 author = {Bengio, Yoshua},
314 title = {Learning deep architectures for {AI}},
315 journal = {Foundations and Trends in Machine Learning},
316 volume = {2},
317 number = {1},
318 year = {2009},
319 pages = {1--127},
320 note = {Also published as a book. Now Publishers, 2009.},
321 abstract = {Theoretical results suggest that in order to learn the kind of
322 complicated functions that can represent high-level abstractions (e.g. in
323 vision, language, and other AI-level tasks), one may need {\em deep
324 architectures}. Deep architectures are composed of multiple levels of non-linear
325 operations, such as in neural nets with many hidden layers or in complicated
326 propositional formulae re-using many sub-formulae. Searching the
327 parameter space of deep architectures is a difficult task, but
328 learning algorithms such as those for Deep Belief Networks have recently been proposed
329 to tackle this problem with notable success, beating the state-of-the-art
330 in certain areas. This paper discusses the motivations and principles regarding
331 learning algorithms for deep architectures, in particular those exploiting as
332 building blocks unsupervised learning of single-layer models such as Restricted {Boltzmann} Machines,
333 used to construct deeper models such as Deep Belief Networks.}
334 }
335
336 @TECHREPORT{Bengio-96-TR,
337 author = {Bengio, Yoshua},
338 title = {Using a Financial Training Criterion Rather than a Prediction Criterion},
339 number = {1019},
340 year = {1996},
341 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
342 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengioy_TR1019.pdf},
343 abstract = {The application of this work is to decision taking with financial time-series, using learning algorithms. The traditional approach is to train a model using a prediction criterion, such as minimizing the squared error between predictions and actual values of a dependent variable, or maximizing the likelihood of a conditional model of the dependent variable. We find here with noisy time-series that better results can be obtained when the model is directly trained in order to optimize the financial criterion of interest. Experiments were performed on portfolio selection with 35 Canadian stocks.},
344 topics={Finance,Discriminant},cat={T},
345 }
346
347 @BOOK{bengio-book96,
348 author = {Bengio, Yoshua},
349 title = {Neural Networks for Speech and Sequence Recognition},
350 year = {1996},
351 publisher = {International Thompson Computer Press},
352 topics={Speech},cat={B},
353 }
354
355 @TECHREPORT{Bengio-convex-05,
356 author = {Bengio, Yoshua and Le Roux, Nicolas and Vincent, Pascal and Delalleau, Olivier and Marcotte, Patrice},
357 title = {Convex neural networks},
358 number = {1263},
359 year = {2005},
360 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
361 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TR1263.pdf},
362 abstract = {Convexity has recently received a lot of attention in the machine learning community, and the lack of convexity has been seen as a major disadvantage of many learning algorithms, such as multi-layer artificial neural networks. We show that training multi-layer neural networks in which the number of hidden units is learned can be viewed as a convex optimization problem. This problem involves an infinite number of variables, but can be solved by incrementally inserting a hidden unit at a time, each time finding a linear classifier that minimizes a weighted sum of errors.},
363 topics={Boosting},cat={T},
364 }
365
366 @ARTICLE{Bengio-decision-trees10,
367 author = {Bengio, Yoshua and Delalleau, Olivier and Simard, Clarence},
368 title = {Decision Trees do not Generalize to New Variations},
369 journal = {Computational Intelligence},
370 year = {2010},
371 note = {To appear}
372 }
373
374 @ARTICLE{bengio-demori89,
375 author = {Bengio, Yoshua and De Mori, Renato},
376 title = {Use of multilayer networks for the recognition of phonetic features and phonemes},
377 journal = {Computational Intelligence},
378 volume = {5},
379 year = {1989},
380 pages = {134--141},
381 topics={Speech},cat={J},
382 }
383
384 @ARTICLE{Bengio-eigen-NC2004,
385 author = {Bengio, Yoshua and Delalleau, Olivier and Le Roux, Nicolas and Paiement, Jean-Fran{\c c}ois and Vincent, Pascal and Ouimet, Marie},
386 title = {Learning eigenfunctions links spectral embedding and kernel {PCA}},
387 journal = {Neural Computation},
388 volume = {16},
389 number = {10},
390 year = {2004},
391 pages = {2197--2219},
392 abstract = {In this paper, we show a direct relation between spectral embedding methods and kernel {PCA}, and how both are special cases of a more general learning problem, that of learning the principal eigenfunctions of an operator defined from a kernel and the unknown data generating density. Whereas spectral embedding methods only provided coordinates for the training points, the analysis justifies a simple extension to out-of-sample examples (the Nystr{\"{o}}m formula) for Multi-Dimensional Scaling, spectral clustering, Laplacian eigenmaps, Locally Linear Embedding ({LLE}) and Isomap. The analysis provides, for all such spectral embedding methods, the definition of a loss function, whose empirical average is minimized by the traditional algorithms. The asymptotic expected value of that loss defines a generalization performance and clarifies what these algorithms are trying to learn. Experiments with {LLE}, Isomap, spectral clustering and {MDS} show that this out-of-sample embedding formula generalizes well, with a level of error comparable to the effect of small perturbations of the training set on the embedding.},
393 topics={HighDimensional,Kernel,Unsupervised},cat={J},
394 }
395
396 @INPROCEEDINGS{Bengio-Gingras-nips8,
397 author = {Bengio, Yoshua and Gingras, Fran{\c c}ois},
398 title = {Recurrent Neural Networks for Missing or Asynchronous Data},
399 year = {1996},
400 pages = {395--401},
401 crossref = {NIPS8-shorter},
402 abstract = {In this paper we propose recurrent neural networks with feedback into the input units for handling two types of data analysis problems. On the one hand, this scheme can be used for static data when some of the input variables are missing. On the other hand, it can also be used for sequential data, when some of the input variables are missing or are available at different frequencies. Unlike in the case of probabilistic models (e.g. Gaussian) of the missing variables, the network does not attempt to model the distribution of the missing variables given the observed variables. Instead it is a more discriminant approach that fills in the missing variables for the sole purpose of minimizing a learning criterion (e.g., to minimize an output error).},
403 topics={Finance,Missing},cat={C},
404 }
405
406 @ARTICLE{Bengio-Grandvalet-JMLR-04,
407 author = {Bengio, Yoshua and Grandvalet, Yves},
408 title = {No Unbiased Estimator of the Variance of K-Fold Cross-Validation},
409 volume = {5},
410 year = {2004},
411 pages = {1089--1105},
412 crossref = {JMLR-shorter},
413 abstract = {Most machine learning researchers perform quantitative experiments to estimate generalization error and compare the performance of different algorithms (in particular, their proposed algorithm). In order to be able to draw statistically convincing conclusions, it is important to estimate the uncertainty of such estimates. This paper studies the very commonly used K-fold cross-validation estimator of generalization performance. The main theorem shows that there exists no universal (valid under all distributions) unbiased estimator of the variance of K-fold cross-validation. The analysis that accompanies this result is based on the eigen-decomposition of the covariance matrix of errors, which has only three different eigenvalues corresponding to three degrees of freedom of the matrix and three components of the total variance. This analysis helps to better understand the nature of the problem and how it can make naive estimators (that don't take into account the error correlations due to the overlap between training and test sets) grossly underestimate variance. This is confirmed by numerical experiments in which the three components of the variance are compared when the difficulty of the learning problem and the number of folds are varied.},
414 topics={Comparative},cat={J},
415 }
416
% NOTE(review): fixed PDF-extraction damage in the abstract -- "aects" (lost
% "ff" ligature) -> "affects", Unicode curly double quotes -> LaTeX ``...''
% quotes (classic BibTeX is 8-bit only), and added the missing final period.
@TECHREPORT{bengio-hyper-TR99,
author = {Bengio, Yoshua},
title = {Continuous Optimization of Hyper-Parameters},
number = {1144},
year = {1999},
institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
url = {http://www.iro.umontreal.ca/~lisa/pointeurs/hyperTR.pdf},
abstract = {Many machine learning algorithms can be formulated as the minimization of a training criterion which involves (1) ``training errors'' on each training example and (2) some hyper-parameters, which are kept fixed during this minimization. When there is only a single hyper-parameter one can easily explore how its value affects a model selection criterion (that is not the same as the training criterion, and is used to select hyper-parameters). In this paper we present a methodology to select many hyper-parameters that is based on the computation of the gradient of a model selection criterion with respect to the hyper-parameters. We first consider the case of a training criterion that is quadratic in the parameters. In that case, the gradient of the selection criterion with respect to the hyper-parameters is efficiently computed by back-propagating through a Cholesky decomposition. In the more general case, we show that the implicit function theorem can be used to derive a formula for the hyper-parameter gradient, but this formula requires the computation of second derivatives of the training criterion.},
topics={ModelSelection},cat={T},
}
427
% Invited conference paper (IEEE ICNN 1993) on long-term dependencies in RNNs.
@INPROCEEDINGS{Bengio-icnn93,
  author    = {Bengio, Yoshua and Frasconi, Paolo and Simard, Patrice},
  title     = {The problem of learning long-term dependencies in recurrent networks},
  booktitle = {IEEE International Conference on Neural Networks},
  publisher = {IEEE Press},
  year      = {1993},
  pages     = {1183--1195},
  note      = {(invited paper)},
  topics={LongTerm},
  cat={C},
}
438
% Journal article (IJPRAI 7(4), 1993): connectionist approach to speech recognition.
@ARTICLE{Bengio-ijprai93,
  author  = {Bengio, Yoshua},
  title   = {A Connectionist Approach to Speech Recognition},
  journal = {International Journal on Pattern Recognition and Artificial Intelligence},
  volume  = {7},
  number  = {4},
  pages   = {647--668},
  year    = {1993},
  abstract = {The task discussed in this paper is that of learning to map input sequences to output sequences. In particular, problems of phoneme recognition in continuous speech are considered, but most of the discussed techniques could be applied to other tasks, such as the recognition of sequences of handwritten characters. The systems considered in this paper are based on connectionist models, or artificial neural networks, sometimes combined with statistical techniques for recognition of sequences of patterns, stressing the integration of prior knowledge and learning. Different architectures for sequence and speech recognition are reviewed, including recurrent networks as well as hybrid systems involving hidden {Markov} models.},
  topics={PriorKnowledge,Speech},
  cat={J},
}
450
% NOTE(review): fixed the abstract typo "conditionned" -> "conditioned"
% (the journal version of this entry, bengio-lauzon-ducharme:2000, spells it
% correctly).
@TECHREPORT{Bengio-iohmms-TR99,
author = {Bengio, Yoshua and Lauzon, Vincent-Philippe and Ducharme, R{\'{e}}jean},
title = {Experiments on the Application of {IOHMM}s to Model Financial Returns Series},
number = {1146},
year = {1999},
institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
url = {http://www.iro.umontreal.ca/~lisa/pointeurs/iohmms-returnsTR.pdf},
abstract = {Input/Output Hidden {Markov} Models ({IOHMM}s) are conditional hidden {Markov} models in which the emission (and possibly the transition) probabilities can be conditioned on an input sequence. For example, these conditional distributions can be linear, logistic, or non-linear (using for example multi-layer neural networks). We compare the generalization performance of several models which are special cases of Input/Output Hidden {Markov} Models on financial time-series prediction tasks: an unconditional Gaussian, a conditional linear Gaussian, a mixture of Gaussians, a mixture of conditional linear Gaussians, a hidden {Markov} model, and various {IOHMM}s. The experiments are performed on modeling the returns of market and sector indices. Note that the unconditional Gaussian estimates the first moment with the historical average. The results show that, although for the first moment the historical average gives the best results, for the higher moments, the {IOHMM}s yielded significantly better performance, as measured by the out-of-sample likelihood.},
topics={Markov},cat={T},
}
461
% NOTE(review): fixed journal name "IEEE Transaction" -> "IEEE Transactions"
% (consistent with the Bengio-trnn94 and Bengio96 entries) and repaired
% extraction damage in the abstract: "that_ although" -> "that, although",
% and restored "with" and "yielded", matching the wording of the TR version
% of this paper (Bengio-iohmms-TR99).
@ARTICLE{bengio-lauzon-ducharme:2000,
author = {Bengio, Yoshua and Lauzon, Vincent-Philippe and Ducharme, R{\'{e}}jean},
title = {Experiments on the Application of {IOHMM}s to Model Financial Returns Series},
journal = {IEEE Transactions on Neural Networks},
volume = {12},
number = {1},
year = {2001},
pages = {113--123},
abstract = {Input/Output Hidden {Markov} Models ({IOHMM}s) are conditional hidden {Markov} models in which the emission (and possibly the transition) probabilities can be conditioned on an input sequence. For example, these conditional distributions can be logistic, or non-linear (using for example multi-layer neural networks). We compare generalization performance of several models which are special cases of Input/Output Hidden {Markov} Models on financial time-series prediction tasks: an unconditional Gaussian, a conditional linear Gaussian, a mixture of Gaussians, a mixture of conditional linear Gaussians, a hidden {Markov} model, and various {IOHMM}s. The experiments compare these models on predicting the conditional density of returns of market sector indices. Note that the unconditional Gaussian estimates the first moment with the historical average. The results show that, although for the first moment the historical average gives the best results, for the higher moments, the {IOHMM}s yielded significantly better performance, as estimated by the out-of-sample likelihood.},
topics={Markov,Finance},cat={J},
}
473
% NOTE(review): fixed abstract garbling "a set a priors" -> "a set of priors".
% The topics tag "Speech" looks copy-pasted for this handwriting-recognition
% paper -- confirm; left unchanged here.
@INPROCEEDINGS{bengio-lecun-94,
author = {Bengio, Yoshua and {LeCun}, Yann},
title = {Word normalization for on-line handwritten word recognition},
booktitle = {Proc. of the International Conference on Pattern Recognition},
volume = {II},
year = {1994},
pages = {409--413},
publisher = {IEEE},
url = {http://www.iro.umontreal.ca/~lisa/pointeurs/icpr-norm.ps},
abstract = {We introduce a new approach to normalizing words written with an electronic stylus that applies to all styles of handwriting (upper case, lower case, printed, cursive, or mixed). A geometrical model of the word spatial structure is fitted to the pen trajectory using the {EM} algorithm. The fitting process maximizes the likelihood of the trajectory given the model and a set of priors on its parameters. The method was evaluated and integrated to a recognition system that combines neural networks and hidden {Markov} models.},
topics={PriorKnowledge,Speech},cat={C},
}
486
% Technical report (UdeM DIRO #1258, 2005) on the curse of dimensionality for
% local kernel machines. Abstract rejoined onto one line (BibTeX collapses
% internal whitespace, so the field value is unchanged).
@TECHREPORT{Bengio-localfailure-TR-2005,
  author      = {Bengio, Yoshua and Delalleau, Olivier and Le Roux, Nicolas},
  title       = {The Curse of Dimensionality for Local Kernel Machines},
  number      = {1258},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
  year        = {2005},
  url         = {http://www.iro.umontreal.ca/~lisa/pointeurs/tr1258.pdf},
  abstract    = {We present a series of theoretical arguments supporting the claim that a large class of modern learning algorithms based on local kernels are sensitive to the curse of dimensionality. These include local manifold learning algorithms such as Isomap and {LLE}, support vector classifiers with Gaussian or other local kernels, and graph-based semisupervised learning algorithms using a local similarity function. These algorithms are shown to be local in the sense that crucial properties of the learned function at x depend mostly on the neighbors of x in the training set. This makes them sensitive to the curse of dimensionality, well studied for classical non-parametric statistical learning. There is a large class of data distributions for which non-local solutions could be expressed compactly and potentially be learned with few examples, but which will require a large number of local bases and therefore a large number of training examples when using a local learning algorithm.},
  topics={HighDimensional,Kernel,Unsupervised},
  cat={T},
}
498
% NIPS 19 (published 2007) paper on greedy layer-wise training of deep networks.
@INPROCEEDINGS{Bengio-nips-2006,
  author   = {Bengio, Yoshua and Lamblin, Pascal and Popovici, Dan and Larochelle, Hugo},
  title    = {Greedy Layer-Wise Training of Deep Networks},
  year     = {2007},
  pages    = {153--160},
  crossref = {NIPS19-shorter},
  abstract = {Complexity theory of circuits strongly suggests that deep architectures can be
much more efficient (sometimes exponentially) than shallow architectures,
in terms of computational elements required to represent some functions.
Deep multi-layer neural networks have many levels of non-linearities
allowing them to compactly represent highly non-linear and
highly-varying functions. However, until recently it was not clear how
to train such deep networks, since gradient-based
optimization starting from random initialization appears to often get stuck
in poor solutions. Hinton et al. recently introduced
a greedy layer-wise unsupervised learning algorithm for Deep Belief
Networks (DBN), a generative model with many layers of hidden causal
variables. In the context of the above optimization problem,
we study this algorithm empirically and explore variants to
better understand its success and extend it to cases where the inputs are
continuous or where the structure of the input distribution is not
revealing enough about the variable to be predicted in a supervised task.
Our experiments also confirm the hypothesis that the greedy
layer-wise unsupervised training strategy mostly helps the
optimization, by initializing weights in a region near a
good local minimum, giving rise to internal distributed representations
that are high-level abstractions of the input, bringing better generalization.}
}
527
% NIPS 10 (1998) paper on shared-context probabilistic transducers.
@INPROCEEDINGS{Bengio-nips10,
  author   = {Bengio, Yoshua and Bengio, Samy and Isabelle, Jean-Fran{\c c}ois and Singer, Yoram},
  title    = {Shared Context Probabilistic Transducers},
  year     = {1998},
  crossref = {NIPS10-shorter},
  abstract = {Recently, a model for supervised learning of probabilistic transducers represented by suffix trees was introduced. However, this algorithm tends to build very large trees, requiring very large amounts of computer memory. In this paper, we propose a new, more compact, transducer model in which one shares the parameters of distributions associated to contexts yielding similar conditional output distributions. We illustrate the advantages of the proposed algorithm with comparative experiments on inducing a noun phrase recognizer.},
  topics={HighDimensional},
  cat={C},
}
536
% Technical report (UdeM DIRO #1264, 2005) on Non-Local Manifold Parzen Windows.
@TECHREPORT{Bengio-NLMP-TR-2005,
  author      = {Bengio, Yoshua and Larochelle, Hugo},
  title       = {Non-Local Manifold Parzen Windows},
  number      = {1264},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
  year        = {2005},
  url         = {http://www.iro.umontreal.ca/~lisa/pointeurs/NLMP-techreport.pdf},
  abstract    = {In order to escape from the curse of dimensionality, we claim that one can learn non-local functions, in the sense that the value and shape of the learned function at x must be inferred using examples that may be far from x. With this objective, we present a non-local non-parametric density estimator. It builds upon previously proposed Gaussian mixture models with regularized covariance matrices to take into account the local shape of the manifold. It also builds upon recent work on non-local estimators of the tangent plane of a manifold, which are able to generalize in places with little training data, unlike traditional, local, non-parametric models.},
  topics={HighDimensional,Kernel,Unsupervised},
  cat={T},
}
547
% NOTE(review): normalized editor initials -- "A.S." -> "A. S." (space between
% initials) and the stray space in "A. -P. N." -> "A.-P. N." (Refenes).
@INPROCEEDINGS{Bengio-nncm96,
author = {Bengio, Yoshua},
editor = {Weigend, A. S. and Abu-Mostafa, Y. S. and Refenes, A.-P. N.},
title = {Training A Neural Network with a Financial Criterion Rather than a Prediction Criterion},
booktitle = {Proceedings of the Fourth International Conference on Neural Networks in the Capital Markets ({NNCM}-96)},
year = {1997},
pages = {433--443},
publisher = {World Scientific},
url = {http://www.iro.umontreal.ca/~lisa/pointeurs/nncm.pdf},
abstract = {A common approach to quantitative decision taking with financial time-series is to train a model using a prediction criterion (e.g., squared error). We find on a portfolio selection problem that better results can be obtained when the model is directly trained in order to optimize the financial criterion of interest, with a differentiable decision module.},
topics={Finance,PriorKnowledge,Discriminant},cat={C},
}
560
% NOTE(review): fixed the abstract's shifted-space garbling "thev olatility"
% -> "the volatility".
@TECHREPORT{Bengio-NonStat-Hyper-TR,
author = {Bengio, Yoshua and Dugas, Charles},
title = {Learning Simple Non-Stationarities with Hyper-Parameters},
number = {1145},
year = {1999},
institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
url = {http://www.iro.umontreal.ca/~lisa/pointeurs/nonstatTR.pdf},
abstract = {We consider sequential data that is sampled from an unknown process, so that the data are not necessarily i.i.d.. Most approaches to machine learning assume that data points are i.i.d.. Instead we consider a measure of generalization that does not make this assumption, and we consider in this context a recently proposed approach to optimizing hyper-parameters, based on the computation of the gradient of a model selection criterion with respect to hyper-parameters. Here we use hyper-parameters that control a function that gives different weights to different time steps in the historical data sequence. The approach is successfully applied to modeling the volatility of stock returns one month ahead. Comparative experiments with more traditional methods are presented.},
topics={ModelSelection,Finance},cat={T},
}
571
% NOTE(review): abstract fixes -- removed the space before the comma in
% "Neural Networks ,"; wrapped 10^{50} in math mode (a bare ^ in text mode is
% a LaTeX error); repaired the mismatched quotes around "animate"/"invisible"
% using LaTeX ``...'' quoting.
@ARTICLE{Bengio-scholarpedia-2007,
author = {Bengio, Yoshua},
title = {Neural net language models},
journal = {Scholarpedia},
volume = {3},
number = {1},
year = {2008},
pages = {3881},
abstract = {A language model is a function, or an algorithm for learning such a function, that captures the salient statistical characteristics of the distribution of sequences of words in a natural language, typically allowing one to make probabilistic predictions of the next word given preceding ones.

A neural network language model is a language model based on Neural Networks, exploiting their ability to learn distributed representations to reduce the impact of the curse of dimensionality.

In the context of learning algorithms, the curse of dimensionality refers to the need for huge numbers of training examples when learning highly complex functions. When the number of input variables increases, the number of required examples can grow exponentially. The curse of dimensionality arises when a huge number of different combinations of values of the input variables must be discriminated from each other, and the learning algorithm needs at least one example per relevant combination of values. In the context of language models, the problem comes from the huge number of possible sequences of words, e.g., with a sequence of 10 words taken from a vocabulary of 100,000 there are $10^{50}$ possible sequences...

A distributed representation of a symbol is a tuple (or vector) of features which characterize the meaning of the symbol, and are not mutually exclusive. If a human were to choose the features of a word, he might pick grammatical features like gender or plurality, as well as semantic features like ``animate'' or ``invisible''. With a neural network language model, one relies on the learning algorithm to discover these features, and the features are continuous-valued (making the optimization problem involved in learning much simpler).

The basic idea is to learn to associate each word in the dictionary with a continuous-valued vector representation. Each word corresponds to a point in a feature space. One can imagine that each dimension of that space corresponds to a semantic or grammatical characteristic of words. The hope is that functionally similar words get to be closer to each other in that space, at least along some directions. A sequence of words can thus be transformed into a sequence of these learned feature vectors. The neural network learns to map that sequence of feature vectors to a prediction of interest, such as the probability distribution over the next word in the sequence. What pushes the learned word features to correspond to a form of semantic and grammatical similarity is that when two words are functionally similar, they can be replaced by one another in the same context, helping the neural network to compactly represent a function that makes good predictions on the training set, the set of word sequences used to train the model.

The advantage of this distributed representation approach is that it allows the model to generalize well to sequences that are not in the set of training word sequences, but that are similar in terms of their features, i.e., their distributed representation. Because neural networks tend to map nearby inputs to nearby outputs, the predictions corresponding to word sequences with similar features are mapped to similar predictions. Because many different combinations of feature values are possible, a very large set of possible meanings can be represented compactly, allowing a model with a comparatively small number of parameters to fit a large training set.}
}
592
% NOTE(review): added the accents to the institution name so it matches the
% file's other UdeM entries (Universit{\'{e}} de Montr{\'{e}}al); the bare
% ASCII form was inconsistent within this file.
@TECHREPORT{Bengio-TR1312,
author = {Bengio, Yoshua},
title = {Learning deep architectures for AI},
number = {1312},
year = {2007},
institution = {Dept. IRO, Universit{\'{e}} de Montr{\'{e}}al},
note = {Preliminary version of journal article with the same title appearing in Foundations and Trends in Machine Learning (2009)},
url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TR1312.pdf},
abstract = {Theoretical results strongly suggest that in order to learn the kind of
complicated functions that can represent high-level abstractions (e.g. in
vision, language, and other AI-level tasks), one may need deep
architectures. Deep architectures are composed of multiple levels of non-linear
operations, such as in neural nets with many hidden layers. Searching the
parameter space of deep architectures is a difficult optimization task, but
learning algorithms such as those for Deep Belief Networks have recently been proposed
to tackle this problem with notable success, beating the state-of-the-art
in certain areas. This paper discusses the motivations and principles regarding
learning algorithms for deep architectures and in particular for those based
on unsupervised learning such as Deep Belief Networks, using as building
blocks single-layer models such as Restricted {Boltzmann} Machines.}
}
614
% NOTE(review): fixed the abstract typo "dependencies to be captures" ->
% "dependencies to be captured".
@ARTICLE{Bengio-trnn94,
author = {Bengio, Yoshua and Simard, Patrice and Frasconi, Paolo},
title = {Learning Long-Term Dependencies with Gradient Descent is Difficult},
journal = {IEEE Transactions on Neural Networks},
volume = {5},
number = {2},
year = {1994},
pages = {157--166},
abstract = {Recurrent neural networks can be used to map input sequences to output sequences, such as for recognition, production or prediction problems. However, practical difficulties have been reported in training recurrent neural networks to perform tasks in which the temporal contingencies present in the input/output sequences span long intervals. We show why gradient based learning algorithms face an increasingly difficult problem as the duration of the dependencies to be captured increases. These results expose a trade-off between efficient learning by gradient descent and latching on information for long periods. Based on an understanding of this problem, alternatives to standard gradient descent are considered.},
optnote={(Special Issue on Recurrent Neural Networks)},topics={LongTerm},cat={J},
}
626
% NOTE(review): pages 1183--1195 are identical to the Bengio-icnn93 entry
% above; for a 6th-workshop proceedings this looks like a copy-paste error --
% confirm against the WIRN-93 proceedings before relying on it. Left unchanged.
@INPROCEEDINGS{Bengio-wirn93,
author = {Bengio, Yoshua and Frasconi, Paolo and Gori, Marco and Soda, G.},
editor = {Caianello, E.},
title = {Recurrent Neural Networks for Adaptive Temporal Processing},
booktitle = {Proc. of the 6th Italian Workshop on Neural Networks, WIRN-93},
year = {1993},
pages = {1183--1195},
publisher = {World Scientific Publ.},
url = {http://www.iro.umontreal.ca/~lisa/pointeurs/rnn_review93.ps},
topics={LongTerm},cat={C},
}
638
% Journal article (Neural Computation 12(8), 2000) on gradient-based
% hyper-parameter optimization.
@ARTICLE{Bengio2000c,
  author  = {Bengio, Yoshua},
  title   = {Gradient-Based Optimization of Hyperparameters},
  journal = {Neural Computation},
  volume  = {12},
  number  = {8},
  pages   = {1889--1900},
  year    = {2000},
  abstract = {Many machine learning algorithms can be formulated as the minimization of a training criterion which involves a hyper-parameter. This hyper-parameter is usually chosen by trial and error with a model selection criterion. In this paper we present a methodology to optimize several hyper-parameters, based on the computation of the gradient of a model selection criterion with respect to the hyper-parameters. In the case of a quadratic training criterion, the gradient of the selection criterion with respect to the hyper-parameters is efficiently computed by back-propagating through a Cholesky decomposition. In the more general case, we show that the implicit function theorem can be used to derive a formula for the hyper-parameter gradient involving second derivatives of the training criterion.},
  topics={ModelSelection},
  cat={J},
}
650
% Journal article (CACM 32(2), 1989) on programmable execution of multi-layered
% networks for speech recognition.
@ARTICLE{Bengio89a,
  author  = {Bengio, Yoshua and Cardin, Regis and De Mori, Renato and Merlo, Ettore},
  title   = {Programmable execution of multi-layered networks for automatic speech recognition},
  journal = {Communications of the Association for Computing Machinery},
  volume  = {32},
  number  = {2},
  pages   = {195--199},
  year    = {1989},
  topics={Speech},
  cat={J},
}
661
% NOTE(review): fixed the abstract typo "speaker-independant" ->
% "speaker-independent".
@INPROCEEDINGS{Bengio89c,
author = {Bengio, Yoshua and Cosi, Piero and Cardin, Regis and De Mori, Renato},
title = {Use of multi-layered networks for coding speech with phonetic features},
year = {1989},
pages = {224--231},
crossref = {NIPS1-shorter},
abstract = {Preliminary results on speaker-independent speech recognition are reported. A method that combines expertise on neural networks with expertise on speech recognition is used to build the recognition systems. For transient sounds, event-driven property extractors with variable resolution in the time and frequency domains are used. For sonorant speech, a model of the human auditory system is preferred to FFT as a front-end module.},
topics={Speech},cat={C},
}
671
% IJCAI 1989 paper on generalization of multilayered networks for speech
% property extraction.
@INPROCEEDINGS{Bengio89d,
  author    = {De Mori, Renato and Bengio, Yoshua and Cosi, Piero},
  title     = {On the generalization capability of multilayered networks in the extraction of speech properties},
  booktitle = {Proceedings of the International Joint Conference on Artificial Intelligence},
  publisher = {IEEE},
  year      = {1989},
  pages     = {1531--1536},
  topics={Speech},
  cat={C},
}
681
% NIPS 2 (1990) paper combining neural networks with speech knowledge.
@INPROCEEDINGS{Bengio90,
  author   = {Bengio, Yoshua and Cardin, Regis and De Mori, Renato},
  title    = {Speaker Independent Speech Recognition with Neural Networks and Speech Knowledge},
  year     = {1990},
  pages    = {218--225},
  crossref = {NIPS2-shorter},
  abstract = {We attempt to combine neural networks with knowledge from speech science to build a speaker independent speech recognition system. This knowledge is utilized in designing the preprocessing, input coding, output coding, output supervision and architectural constraints. To handle the temporal aspect of speech we combine delays, copies of activations of hidden and output units at the input level, and Back-Propagation for Sequences (BPS), a learning algorithm for networks with local self-loops. This strategy is demonstrated in several experiments, in particular a nasal discrimination task for which the application of a speech theory hypothesis dramatically improved generalization.},
  topics={PriorKnowledge,Speech},
  cat={C},
}
691
% Book chapter (NATO ASI Series F, 1990) on radial basis functions for speech.
@INCOLLECTION{Bengio90b,
  author    = {Bengio, Yoshua},
  title     = {Radial Basis Functions for speech recognition},
  booktitle = {Speech Recognition and Understanding: Recent Advances, Trends and Applications},
  publisher = {NATO Advanced Study Institute Series F: Computer and Systems Sciences},
  year      = {1990},
  pages     = {293--298},
  topics={Kernel,Speech},
  cat={B},
}
701
% Book chapter (NATO ASI Series F, 1990) on speech coding with multilayer
% networks.
@INCOLLECTION{Bengio90c,
  author    = {Bengio, Yoshua and De Mori, Renato},
  editor    = {{Fogelman Soulie}, F. and Herault, J.},
  title     = {Speech coding with multilayer networks},
  booktitle = {Neurocomputing: Algorithms, Architectures and Applications},
  publisher = {NATO Advanced Study Institute Series F: Computer and Systems Sciences},
  year      = {1990},
  pages     = {207--216},
  topics={Speech},
  cat={B},
}
712
% NIPS 2 (1990) paper on detecting protein homologies with a neural network.
@INPROCEEDINGS{Bengio90e,
  author   = {Bengio, Yoshua and Pouliot, Yannick and Bengio, Samy and Agin, Patrick},
  title    = {A neural network to detect homologies in proteins},
  year     = {1990},
  pages    = {423--430},
  crossref = {NIPS2-shorter},
  abstract = {In order to detect the presence and location of immunoglobulin (Ig) domains from amino acid sequences we built a system based on a neural network with one hidden layer trained with back propagation. The program was designed to efficiently identify proteins exhibiting such domains, characterized by a few localized conserved regions and a low overall homology. When the National Biomedical Research Foundation (NBRF) NEW protein sequence database was scanned to evaluate the program's performance, we obtained very low rates of false negatives coupled with a moderate rate of false positives.},
  topics={Bioinformatic,PriorKnowledge},
  cat={C},
}
722
% Workshop paper (1990) on automatic speech recognition experiments using BPS.
@INPROCEEDINGS{Bengio90z,
  author    = {Bengio, Yoshua and De Mori, Renato and Gori, Marco},
  editor    = {Caianello, E.},
  title     = {Experiments on automatic speech recognition using BPS},
  booktitle = {Parallel Architectures and Neural Networks},
  publisher = {World Scientific Publ.},
  year      = {1990},
  pages     = {223--232},
  topics={Speech},
  cat={C},
}
733
% EuroSpeech 1991 paper comparing hybrid acoustic-phonetic decoders.
@INPROCEEDINGS{Bengio91a,
  author    = {Bengio, Yoshua and De Mori, Renato and Flammia, Giovanni and Kompe, Ralf},
  title     = {A comparative study of hybrid acoustic phonetic decoders based on artificial neural networks},
  booktitle = {Proceedings of EuroSpeech'91},
  year      = {1991},
  topics={PriorKnowledge,Speech},
  cat={C},
}
741
% NOTE(review): brace-protected the whole word {Markov} in the title instead
% of the single letter {M} -- single-letter braces can break kerning and are
% inconsistent with the rest of this file.
@INPROCEEDINGS{Bengio91b,
author = {Bengio, Yoshua and De Mori, Renato and Flammia, Giovanni and Kompe, Ralf},
title = {Global Optimization of a Neural Network - Hidden {Markov} Model Hybrid},
booktitle = {Proceedings of EuroSpeech'91},
year = {1991},
topics={Markov},cat={C},
}
749
% EuroSpeech 1991 paper; journal version appears as Bengio92b below.
@INPROCEEDINGS{Bengio91z,
  author    = {Bengio, Yoshua and De Mori, Renato and Flammia, Giovanni and Kompe, Ralf},
  title     = {Phonetically motivated acoustic parameters for continuous speech recognition using artificial neural networks},
  booktitle = {Proceedings of EuroSpeech'91},
  year      = {1991},
  cat={C},
}
757
% Journal article (Speech Communication 11(2-3), 1992); journal version of the
% Bengio91z EuroSpeech paper.
@ARTICLE{Bengio92b,
  author  = {Bengio, Yoshua and De Mori, Renato and Flammia, Giovanni and Kompe, Ralf},
  title   = {Phonetically motivated acoustic parameters for continuous speech recognition using artificial neural networks},
  journal = {Speech Communication},
  volume  = {11},
  number  = {2--3},
  pages   = {261--271},
  year    = {1992},
  note    = {Special issue on neurospeech},
  topics={PriorKnowledge,Speech},
  cat={J},
}
769
% NOTE(review): repaired OCR/extraction damage in the abstract -- "hlidden" ->
% "hidden", "all the parameters or the the system" -> "all the parameters of
% the system", "A rew experiments" -> "A few experiments"; also braced {HMM}
% consistently with the rest of the entry.
@INPROCEEDINGS{Bengio92c,
author = {Bengio, Yoshua and De Mori, Renato and Flammia, Giovanni and Kompe, Ralf},
title = {Neural Network - Gaussian Mixture Hybrid for Speech Recognition or Density Estimation},
year = {1992},
pages = {175--182},
crossref = {NIPS4-shorter},
abstract = {The subject of this paper is the integration of multi-layered Artificial Neural Networks ({ANN}) with probability density functions such as Gaussian mixtures found in continuous density hidden {Markov} Models ({HMM}). In the first part of this paper we present an {ANN}/{HMM} hybrid in which all the parameters of the system are simultaneously optimized with respect to a single criterion. In the second part of this paper, we study the relationship between the density of the inputs of the network and the density of the outputs of the networks. A few experiments are presented to explore how to perform density estimation with {ANN}s.},
topics={Speech},cat={C},
}
779
% ICPR 1994 paper on EM-based grammatical inference with Input/Output HMMs.
@INPROCEEDINGS{Bengio94d,
  author    = {Frasconi, Paolo and Bengio, Yoshua},
  title     = {An {EM} Approach to Grammatical Inference: Input/Output {HMMs}},
  booktitle = {International Conference on Pattern Recognition (ICPR'94)},
  year      = {1994},
  pages     = {289--294},
  topics={Markov,LongTerm},
  cat={C},
}
788
% NOTE(review): brace-protected the whole word {Output} in the title instead
% of the single letter {O} -- single-letter braces can break kerning; the
% protection itself is still needed so sentence-casing styles keep the capital
% after the slash.
@ARTICLE{Bengio96,
author = {Bengio, Yoshua and Frasconi, Paolo},
title = {Input/{Output} {HMM}s for Sequence Processing},
journal = {IEEE Transactions on Neural Networks},
volume = {7},
number = {5},
year = {1996},
pages = {1231--1249},
abstract = {We consider problems of sequence processing and propose a solution based on a discrete state model in order to represent past context. We introduce a recurrent connectionist architecture having a modular structure that associates a subnetwork to each state. The model has a statistical interpretation we call Input/Output Hidden {Markov} Model ({IOHMM}). It can be trained by the {EM} or {GEM} algorithms, considering state trajectories as missing data, which decouples temporal credit assignment and actual parameter estimation.
The model presents similarities to hidden {Markov} models ({HMM}s), but allows us to map input sequences to output sequences, using the same processing style as recurrent neural networks. {IOHMM}s are trained using a more discriminant learning paradigm than {HMM}s, while potentially taking advantage of the {EM} algorithm.
We demonstrate that {IOHMM}s are well suited for solving grammatical inference problems on a benchmark problem. Experimental results are presented for the seven Tomita grammars, showing that these adaptive models can attain excellent generalization.},
topics={Markov},cat={J},
}
802
% Technical report (UdeM DIRO #1049, 1996); journal version appears as the
% Bengio99a Neural Computing Surveys entry.
@TECHREPORT{Bengio96-hmmsTR,
  author      = {Bengio, Yoshua},
  title       = {Markovian Models for Sequential Data},
  number      = {1049},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
  year        = {1996},
  url         = {http://www.iro.umontreal.ca/~lisa/pointeurs/hmmsTR.pdf},
  abstract    = {Hidden {Markov} Models ({HMM}s) are statistical models of sequential data that have been used successfully in many applications, especially for speech recognition. We first summarize the basics of {HMM}s, and then review several recent related learning algorithms and extensions of {HMM}s, including hybrids of {HMM}s with artificial neural networks, Input-Output {HMM}s, weighted transducers, variable-length {Markov} models and {Markov} switching state-space models. Finally, we discuss some of the challenges of future research in this area.},
  topics={Markov},
  cat={T},
}
813
814 @ARTICLE{Bengio97,
815 author = {Bengio, Yoshua},
816 title = {Using a Financial Training Criterion Rather than a Prediction Criterion},
817 journal = {International Journal of Neural Systems},
818 volume = {8},
819 number = {4},
820 year = {1997},
821 pages = {433--443},
822 note = {Special issue on noisy time-series},
823 abstract = {The application of this work is to decision taking with financial time-series, using learning algorithms. The traditional approach is to train a model using a prediction criterion, such as minimizing the squared error between predictions and actual values of a dependent variable, or maximizing the likelihood of a conditional model of the dependent variable. We find here with noisy time-series that better results can be obtained when the model is directly trained in order to maximize the financial criterion of interest, here gains and losses (including those due to transactions) incurred during trading. Experiments were performed on portfolio selection with 35 Canadian stocks.},
824 topics={Finance,PriorKnowledge,Discriminant},cat={J},
825 }
826
827 @ARTICLE{Bengio99a,
828 author = {Bengio, Yoshua},
829 title = {Markovian Models for Sequential Data},
830 journal = {Neural Computing Surveys},
831 volume = {2},
832 year = {1999},
833 pages = {129--162},
834 abstract = {Hidden {Markov} Models ({HMM}s) are statistical models of sequential data that have been used successfully in many machine learning applications, especially for speech recognition. Furthermore, in the last few years, many new and promising probabilistic models related to {HMM}s have been proposed. We first summarize the basics of {HMM}s, and then review several recent related learning algorithms and extensions of {HMM}s, including in particular hybrids of {HMM}s with artificial neural networks, Input-Output {HMM}s (which are conditional {HMM}s using neural networks to compute probabilities), weighted transducers, variable-length {Markov} models and {Markov} switching state-space models. Finally, we discuss some of the challenges of future research in this very active area.},
835 topics={Markov},cat={J},
836 }
837
838 @ARTICLE{Bengio99b,
839 author = {Bengio, Samy and Bengio, Yoshua and Robert, Jacques and B{\'{e}}langer, Gilles},
840 title = {Stochastic Learning of Strategic Equilibria for Auctions},
841 journal = {Neural Computation},
842 volume = {11},
843 number = {5},
844 year = {1999},
845 pages = {1199--1209},
846 abstract = {This paper presents a new application of stochastic adaptive learning algorithms to the computation of strategic equilibria in auctions. The proposed approach addresses the problems of tracking a moving target and balancing exploration (of action space) versus exploitation (of better modeled regions of action space). Neural networks are used to represent a stochastic decision model for each bidder. Experiments confirm the correctness and usefulness of the approach.},
847 topics={Auction},cat={J},
848 }
849
850 @TECHREPORT{bengio:1990,
851 author = {Bengio, Yoshua},
852 title = {Learning a Synaptic Learning Rule},
853 number = {751},
854 year = {1990},
855 institution = {D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
856 topics={BioRules},cat={T},
857 }
858
859 @INPROCEEDINGS{bengio:1990:snowbird,
860 author = {Bengio, Yoshua and De Mori, Renato},
861 title = {Recurrent networks with Radial Basis Functions for speech recognition},
862 booktitle = {1990 Neural Networks for Computing Conference},
863 year = {1990},
864 topics={Speech},cat={C},
865 }
866
867 @INPROCEEDINGS{bengio:1991:ijcnn,
868 author = {Bengio, Yoshua and Bengio, Samy and Cloutier, Jocelyn},
869 title = {Learning a Synaptic Learning Rule},
870 booktitle = {Proceedings of the International Joint Conference on Neural Networks},
871 year = {1991},
872 pages = {II--A969},
873 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio_1991_ijcnn.ps},
874 abstract = {This paper presents an original approach to neural modeling based on the idea of searching, with learning methods, for a synaptic learning rule which is biologically plausible, and yields networks that are able to learn to perform difficult tasks. The proposed method of automatically finding the learning rule relies on the idea of considering the synaptic modification rule as a parametric function. This function has local inputs and is the same in many neurons. The parameters that define this function can be estimated with known learning methods. For this optimization, we give particular attention to gradient descent and genetic algorithms. In both cases, estimation of this function consists of a joint global optimization of (a) the synaptic modification function, and (b) the networks that are learning to perform some tasks. The proposed methodology can be used as a tool to explore the missing pieces of the puzzle of neural networks learning. Both network architecture, and the learning function can be designed within constraints derived from biological knowledge.},
875 addressfr={Seattle, USA},topics={BioRules},cat={C},
876 }
877
878 @INPROCEEDINGS{bengio:1991:nnc,
879 author = {Bengio, Yoshua and Bengio, Samy and Cloutier, Jocelyn},
880 title = {Learning Synaptic Learning Rules},
881 booktitle = {Neural Networks for Computing},
882 year = {1991},
883 addressfr={Snowbird, Utah, USA},topics={BioRules},cat={C},
884 }
885
886 @INPROCEEDINGS{bengio:1991:snowbird,
887 author = {Bengio, Yoshua and Bengio, Samy and Cloutier, Jocelyn},
888 title = {Learning a Synaptic Learning Rule},
889 booktitle = {1991 Neural Networks for Computing Conference},
890 year = {1991},
891 topics={BioRules},cat={C},
892 }
893
894 @INPROCEEDINGS{bengio:1992:nn,
895 author = {Bengio, Samy and Bengio, Yoshua and Cloutier, Jocelyn and Gecsei, Jan},
896 title = {Aspects th{\'{e}}oriques de l'optimisation d'une r{\`{e}}gle d'apprentissage},
897 booktitle = {Actes de la conf{\'{e}}rence Neuro-N{\^{\i}}mes 1992},
898 year = {1992},
899 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio_1992_nn.ps},
900 abstract = {Ayant expos{\'{e}} dans de pr{\'{e}}c{\'{e}}dentes publications (voir [Beng90, Beng92] notamment) l’id{\'{e}}e que l’on pouvait optimiser des r{\`{e}}gles d’apprentissage param{\'{e}}triques pour r{\'{e}}seaux de neurones, nous montrons dans cet article comment d{\'{e}}velopper, par la m{\'{e}}thode du Lagrangien, le gradient n{\'{e}}cessaire {\`{a}} l’optimisation d’une r{\`{e}}gle d’apprentissage par descente du gradient. Nous pr{\'{e}}sentons aussi les bases th{\'{e}}oriques qui permettent d’{\'{e}}tudier la g{\'{e}}n{\'{e}}ralisation {\`{a}} de nouvelles t{\^{a}}ches d’une r{\`{e}}gle d’apprentissage dont les param{\`{e}}tres ont {\'{e}}t{\'{e}} estim{\'{e}}s {\`{a}} partir d’un certain ensemble de t{\^{a}}ches. Enfin, nous exposons bri{\`{e}}vement les r{\'{e}}sultats d’une exp{\'{e}}rience consistant {\`{a}} trouver, par descente du gradient, une r{\`{e}}gle d’apprentissage pouvant r{\'{e}}soudre plusieurs t{\^{a}}ches bool{\'{e}}ennes lin{\'{e}}airement et non lin{\'{e}}airement s{\'{e}}parables.},
901 addressfr={N{\^{\i}}mes, France},topics={BioRules},cat={C},
902 }
903
904 @INPROCEEDINGS{bengio:1992:oban,
905 author = {Bengio, Samy and Bengio, Yoshua and Cloutier, Jocelyn and Gecsei, Jan},
906 title = {On the Optimization of a Synaptic Learning rule},
907 booktitle = {Conference on Optimality in Biological and Artificial Networks},
908 year = {1992},
909 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio_1992_oban.ps},
910 abstract = {This paper presents a new approach to neural modeling based on the idea of using an automated method to optimize the parameters of a synaptic learning rule. The synaptic modification rule is considered as a parametric function. This function has local inputs and is the same in many neurons. We can use standard optimization methods to select appropriate parameters for a given type of task. We also present a theoretical analysis permitting to study the generalization property of such parametric learning rules. By generalization, we mean the possibility for the learning rule to learn to solve new tasks. Experiments were performed on three types of problems: a biologically inspired circuit (for conditioning in Aplysia). Boolean functions (linearly separable as well as non linearly separable) and classification tasks. The neural network architecture as well as the form and initial parameter values of the synaptic learning function can be designed using a priori knowledge.},
911 addressfr={Dallas, USA},topics={BioRules},cat={C},
912 }
913
914 @INPROCEEDINGS{bengio:1992:snowbird,
915 author = {Bengio, Yoshua},
916 title = {Representations Based on Articulatory Dynamics for Speech Recognition},
917 booktitle = {1992 Neural Networks for Computing Conference},
918 year = {1992},
919 topics={PriorKnowledge,Speech},cat={C},
920 }
921
922 @INPROCEEDINGS{bengio:1993:icann,
923 author = {Bengio, Samy and Bengio, Yoshua and Cloutier, Jocelyn and Gecsei, Jan},
924 editor = {Gielen, S. and Kappen, B.},
925 title = {Generalization of a Parametric Learning Rule},
926 booktitle = {{ICANN} '93: Proceedings of the International Conference on Artificial Neural Networks},
927 year = {1993},
928 pages = {502},
929 publisher = {Springer-Verlag},
930 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio_1993_icann.ps},
931 abstract = {In previous work ([4,2,1]) we discussed the subject of parametric learning rules for neural networks. In this article, we present a theoretical basis permitting to study the generalization property of a learning rule whose parameters are estimated from a set of learning tasks. By generalization, we mean the possibility of using the learning rule to learn to solve new tasks. Finally, we describe simple experiments on two-dimensional categorization tasks and show how they corroborate the theoretical results.},
932 addressfr={Amsterdam, Pays-Bas},topics={BioRules},cat={C},
933 }
934
935 @INPROCEEDINGS{bengio:1993:snowbird,
936 author = {Bengio, Yoshua and Simard, Patrice and Frasconi, Paolo},
937 title = {The Problem of Learning Long-Term Dependencies in Recurrent Networks},
938 booktitle = {1993 Neural Networks for Computing Conference},
939 year = {1993},
940 topics={LongTerm},cat={C},
941 }
942
943 @TECHREPORT{bengio:1994,
944 author = {Bengio, Yoshua and Frasconi, Paolo},
945 title = {An {EM} Approach to Learning Sequential Behavior},
946 number = {DSI 11-94},
947 year = {1994},
948 institution = {Universita di Firenze, Dipartimento di Sistemi e Informatica},
949 topics={LongTerm},cat={T},
950 }
951
952 @INPROCEEDINGS{bengio:1994:acfas,
953 author = {Bengio, Samy and Bengio, Yoshua and Cloutier, Jocelyn and Gecsei, Jan},
954 title = {Optimisation d'une r{\`{e}}gle d'apprentissage pour r{\'{e}}seaux de neurones artificiels},
955 booktitle = {Actes du soixante-deuxi{\`{e}}me congr{\`{e}}s de l'Association Canadienne Fran{\c c}aise pour l'Avancement des Sciences, colloque sur l'apprentissage et les r{\'{e}}seaux de neurones artificiels},
956 year = {1994},
957 topics={BioRules},cat={C},
958 }
959
960 @INPROCEEDINGS{bengio:1994:snowbird,
961 author = {Bengio, Yoshua and Frasconi, Paolo},
962 title = {An {EM} Algorithm for Target Propagation},
963 booktitle = {1994 Neural Networks for Computing Conference},
964 year = {1994},
965 topics={LongTerm},cat={C},
966 }
967
968 @INPROCEEDINGS{bengio:1994:wcci,
969 author = {Bengio, Samy and Bengio, Yoshua and Cloutier, Jocelyn},
970 title = {Use of Genetic Programming for the Search of a New Learning Rule for Neural Networks},
971 booktitle = {Proceedings of the First Conference on Evolutionary Computation, {IEEE} World Congress on Computational Intelligence},
972 year = {1994},
973 pages = {324--327},
974 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio_1994_wcci.ps},
975 abstract = {In previous work ([1,2,3]), we explained how to use standard optimization methods such as simulated annealing, gradient descent and genetic algorithms to optimize a parametric function which could be used as a learning rule for neural networks. To use these methods, we had to choose a fixed number of parameters and a rigid form for the learning rule. In this article, we propose to use genetic programming to find not only the values of rule parameters but also the optimal number of parameters and the form of the rule. Experiments on classification tasks suggest genetic programming finds better learning rules than other optimization methods. Furthermore, the best rule found with genetic programming outperformed the well-known backpropagation algorithm for a given set of tasks.},
976 topics={BioRules},cat={C},
977 }
978
979 @INPROCEEDINGS{bengio:1994b:acfas,
980 author = {Bengio, Yoshua and Frasconi, Paolo},
981 title = {R{\'{e}}seaux de neurones {M}arkoviens pour l'inf{\'{e}}rence grammaticale},
982 booktitle = {Actes du soixante-deuxi{\`{e}}me congr{\`{e}}s de l'Association Canadienne Fran{\c c}aise pour l'Avancement des Sciences, colloque sur l'apprentissage et les r{\'{e}}seaux de neurones artificiels},
983 year = {1994},
984 topics={Markov,Language},cat={C},
985 }
986
987 @ARTICLE{bengio:1995:npl,
988 author = {Bengio, Samy and Bengio, Yoshua and Cloutier, Jocelyn},
989 title = {On the Search for New Learning Rules for {ANN}s},
990 journal = {Neural Processing Letters},
991 volume = {2},
992 number = {4},
993 year = {1995},
994 pages = {26--30},
995 abstract = {In this paper, we present a framework where a learning rule can be optimized within a parametric learning rule space. We define what we call parametric learning rules and present a theoretical study of their generalization properties when estimated from a set of learning tasks and tested over another set of tasks. We corroborate the results of this study with practical experiments.},
996 topics={BioRules},cat={J},
997 }
998
999 @INCOLLECTION{bengio:1995:oban,
1000 author = {Bengio, Samy and Bengio, Yoshua and Cloutier, Jocelyn and Gecsei, Jan},
1001 editor = {Levine, D. S. and Elsberry, W. R.},
1002 title = {{O}n the Optimization of a Synaptic Learning Rule},
1003 booktitle = {Optimality in Biological and Artificial Networks},
1004 year = {1995},
1005 publisher = {Lawrence Erlbaum},
1006 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio_1995_oban.pdf},
1007 abstract = {This paper presents a new approach to neural modeling based on the idea of using an automated method to optimize the parameters of a synaptic learning rule. The synaptic modification rule is considered as a parametric function. This function has local inputs and is the same in many neurons. We can use standard optimization methods to select appropriate parameters for a given type of task. We also present a theoretical analysis permitting to study the generalization property of such parametric learning rules. By generalization, we mean the possibility for the learning rule to learn to solve new tasks. Experiments were performed on three types of problems: a biologically inspired circuit (for conditioning in Aplysia), Boolean functions (linearly separable as well as non linearly separable) and classification tasks. The neural network architecture as well as the form and initial parameter values of the synaptic learning function can be designed using a priori knowledge.},
1008 topics={BioRules},cat={B},
1009 }
1010
1011 @TECHREPORT{bengio:1996:udem,
1012 author = {Bengio, Yoshua and Bengio, Samy},
1013 title = {Training Asynchronous Input/Output Hidden {M}arkov Models},
1014 number = {1013},
1015 year = {1996},
1016 institution = {D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
1017 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio_1996_udem.ps},
1018 topics={Markov},cat={T},
1019 }
1020
1021 @INPROCEEDINGS{bengio:1997:snowbird,
1022 author = {Bengio, Yoshua and Bengio, Samy and Singer, Yoram and Isabelle, Jean-Fran{\c c}ois},
1023 title = {On the Clusterization of Probabilistic Transducers},
1024 booktitle = {1997 Neural Networks for Computing Conference},
1025 year = {1997},
1026 topics={HighDimensional},cat={C},
1027 }
1028
1029 @INPROCEEDINGS{bengio:1998:snowbird,
1030 author = {Bengio, Samy and Bengio, Yoshua and Robert, Jacques and B{\'{e}}langer, Gilles},
1031 title = {Stochastic Learning of Strategic Equilibria for Auctions},
1032 booktitle = {Learning Conference},
1033 year = {1998},
1034 topics={Auction},cat={C},
1035 }
1036
1037 @TECHREPORT{bengio:1998:udem,
1038 author = {Bengio, Samy and Bengio, Yoshua and Robert, Jacques and B{\'{e}}langer, Gilles},
1039 title = {Stochastic Learning of Strategic Equilibria for Auctions},
1040 number = {1119},
1041 year = {1998},
1042 institution = {D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
1043 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio_1998_udem.pdf},
1044 abstract = {This paper presents a new application of stochastic adaptive learning algorithms to the computation of strategic equilibria in auctions. The proposed approach addresses the problems of tracking a moving target and balancing exploration (of action space) versus exploitation (of better modeled regions of action space). Neural networks are used to represent a stochastic decision model for each bidder. Experiments confirm the correctness and usefulness of the approach.},
1045 topics={Auction},cat={T},
1046 }
1047
1048 @INPROCEEDINGS{bengio:1999:snowbird,
1049 author = {Bengio, Yoshua and Latendresse, Simon and Dugas, Charles},
1050 title = {Gradient-Based Learning of Hyper-Parameters},
1051 booktitle = {Learning Conference},
1052 year = {1999},
1053 topics={ModelSelection},cat={C},
1054 }
1055
1056 @INPROCEEDINGS{bengio:1999:titration,
1057 author = {Bengio, Yoshua and Brault, J-J. and Major, Fran{\c c}ois and Neal, R. and Pigeon, Steven},
1058 title = {Learning Algorithms for Sorting Compounds from Titration Curves},
1059 booktitle = {Symposium on New Perspectives for Computer-Aided Drug Design},
1060 year = {1999},
1061 topics={Speech},cat={C},
1062 }
1063
1064 @ARTICLE{bengio:2000:ieeetrnn,
1065 author = {Bengio, Samy and Bengio, Yoshua},
1066 title = {Taking on the Curse of Dimensionality in Joint Distributions Using Neural Networks},
1067 journal = {IEEE Transactions on Neural Networks special issue on data mining and knowledge discovery},
1068 volume = {11},
1069 number = {3},
1070 year = {2000},
1071 pages = {550--557},
1072 abstract = {The curse of dimensionality is severe when modeling high-dimensional discrete data: the number of possible combinations of the variables explodes exponentially. In this paper we propose a new architecture for modeling high-dimensional data that requires resources (parameters and computations) that grow at most as the square of the number of variables, using a multi-layer neural network to represent the joint distribution of the variables as the product of conditional distributions. The neural network can be interpreted as a graphical model without hidden random variables, but in which the conditional distributions are tied through the hidden units. The connectivity of the neural network can be pruned by using dependency tests between the variables (thus reducing significantly the number of parameters). Experiments on modeling the distribution of several discrete data sets show statistically significant improvements over other methods such as naive Bayes and comparable Bayesian networks, and show that significant improvements can be obtained by pruning the network.},
1073 topics={HighDimensional,Unsupervised,Mining},cat={J},
1074 }
1075
1076 @INPROCEEDINGS{bengio:2000:nips,
1077 author = {Bengio, Yoshua and Bengio, Samy},
1078 title = {Modeling High-Dimensional Discrete Data with Multi-Layer Neural Networks},
1079 year = {2000},
1080 pages = {400--406},
1081 crossref = {NIPS12-shorter},
1082 abstract = {The curse of dimensionality is severe when modeling high-dimensional discrete data: the number of possible combinations of the variables explodes exponentially. In this paper we propose a new architecture for modeling high-dimensional data that requires resources (parameters and computations) that grow only at most as the square of the number of variables, using a multi-layer neural network to represent the joint distribution of the variables as the product of conditional distributions. The neural network can be interpreted as a graphical model without hidden random variables, but in which the conditional distributions are tied through the hidden units. The connectivity of the neural network can be pruned by using dependency tests between the variables. Experiments on modeling the distribution of several discrete data sets show statistically significant improvements over other methods such as naive Bayes and comparable Bayesian networks, and show that significant improvements can be obtained by pruning the network.},
1083 topics={HighDimensional,Unsupervised},cat={C},
1084 }
1085
1086 @ARTICLE{bengio:2003,
1087 author = {Bengio, Yoshua and Ducharme, R{\'{e}}jean and Vincent, Pascal and Jauvin, Christian},
1088 title = {A Neural Probabilistic Language Model},
1089 volume = {3},
1090 year = {2003},
1091 pages = {1137--1155},
1092 crossref = {JMLR-shorter},
1093 abstract = {A goal of statistical language modeling is to learn the joint probability function of sequences of words in a language. This is intrinsically difficult because of the curse of dimensionality: a word sequence on which the model will be tested is likely to be different from all the word sequences seen during training. Traditional but very successful approaches based on n-grams obtain generalization by concatenating very short overlapping sequences seen in the training set. We propose to fight the curse of dimensionality by learning a distributed representation for words which allows each training sentence to inform the model about an exponential number of semantically neighboring sentences. The model learns simultaneously (1) a distributed representation for each word along with (2) the probability function for word sequences, expressed in terms of these representations. Generalization is obtained because a sequence of words that has never been seen before gets high probability if it is made of words that are similar (in the sense of having a nearby representation) to words forming an already seen sentence. Training such large models (with millions of parameters) within a reasonable time is itself a significant challenge. We report on experiments using neural networks for the probability function, showing on two text corpora that the proposed approach significantly improves on state-of-the-art n-gram models, and that the proposed approach allows to take advantage of longer contexts.},
1094 topics={Markov,Unsupervised,Language},cat={J},
1095 }
1096
1097 @TECHREPORT{bengio:socs-1990,
1098 author = {Bengio, Yoshua and De Mori, Renato and Flammia, Giovanni and Kompe, Ralf},
1099 title = {Global Optimization of a Neural Network - Hidden {M}arkov Model Hybrid},
1100 number = {TR-SOCS-90.22},
1101 year = {1990},
1102 institution = {School of Computer Science, McGill University},
1103 topics={Markov},cat={T},
1104 }
1105
1106 @INPROCEEDINGS{bengioc:1994:acfas,
1107 author = {Bengio, Yoshua and {LeCun}, Yann},
1108 title = {Reconnaissance de mots manuscrits avec r{\'{e}}seaux de neurones et mod{\`{e}}les de {M}arkov},
1109 booktitle = {Actes du soixante-deuxi{\`{e}}me congr{\`{e}}s de l'Association Canadienne Fran{\c c}aise pour l'Avancement des Sciences, colloque sur l'apprentissage et les r{\'{e}}seaux de neurones artificiels},
1110 year = {1994},
1111 topics={Markov,Speech},cat={C},
1112 }
1113
1114 @TECHREPORT{Bengio_Bottou92,
1115 author = {Bengio, Yoshua and Bottou, {L{\'{e}}on}},
1116 title = {A New Approach to Estimating Probability Density Functions with Artificial Neural Networks},
1117 number = {TR-92.02},
1118 year = {1992},
1119 institution = {Massachusetts Institute of Technology, Dept. Brain and Cognitive Sciences},
1120 topics={HighDimensional},cat={T},
1121 }
1122
1123 @INCOLLECTION{bengio_extension_nips_2003,
1124 author = {Bengio, Yoshua and Paiement, Jean-Fran{\c c}ois and Vincent, Pascal and Delalleau, Olivier and Le Roux, Nicolas and Ouimet, Marie},
1125 keywords = {dimensionality reduction, eigenfunctions learning, Isomap, kernel {PCA}, locally linear embedding, Nystrom formula, spectral methods},
1126 title = {Out-of-Sample Extensions for {LLE}, Isomap, {MDS}, Eigenmaps, and Spectral Clustering},
1127 year = {2004},
1128 crossref = {NIPS16-shorter},
1129 abstract = {Several unsupervised learning algorithms based on an eigendecomposition provide either an embedding or a clustering only for given training points, with no straightforward extension for out-of-sample examples short of recomputing eigenvectors. This paper provides a unified framework for extending Local Linear Embedding ({LLE}), Isomap, Laplacian Eigenmaps, Multi-Dimensional Scaling (for dimensionality reduction) as well as for Spectral Clustering. This framework is based on seeing these algorithms as learning eigenfunctions of a data-dependent kernel. Numerical experiments show that the generalizations performed have a level of error comparable to the variability of the embedding algorithms due to the choice of training data.},
1130 topics={HighDimensional,Kernel,Unsupervised},cat={C},
1131 }
1132
1133 @ARTICLE{Bengio_Gingras98a,
1134 author = {Bengio, Yoshua and Gingras, Fran{\c c}ois and Goulard, Bernard and Lina, Jean-Marc},
1135 title = {Gaussian Mixture Densities for Classification of Nuclear Power Plant Data},
1136 journal = {Computers and Artificial Intelligence},
1137 volume = {17},
1138 number = {2-3},
1139 year = {1998},
1140 pages = {189--209},
1141 abstract = {In this paper we are concerned with the application of learning algorithms to the classification of reactor states in nuclear plants. Two aspects must be considered, (1) some types of events (e.g., abnormal or rare) will not appear in the data set, but the system should be able to detect them, (2) not only classification of signals but also their interpretation are important for nuclear plant monitoring. We address both issues with a mixture of mixtures of Gaussians in which some parameters are shared to reflect the similar signals observed in different states of the reactor. An {EM} algorithm for these shared Gaussian mixtures is presented. Experimental results on nuclear plant data demonstrate the advantages of the proposed approach with respect to the above two points.},
1142 topics={Mining},cat={J},
1143 }
1144
1145 @ARTICLE{Bengio_Gingras98b,
1146 author = {Gingras, Fran{\c c}ois and Bengio, Yoshua},
1147 title = {Handling Asynchronous or Missing Financial Data with Recurrent Networks},
1148 journal = {International Journal of Computational Intelligence and Organizations},
1149 volume = {1},
1150 number = {3},
1151 year = {1998},
1152 pages = {154--163},
1153 abstract = {An important issue with many sequential data analysis problems, such as those encountered in financial data sets, is that different variables are known at different frequencies, at different times (asynchronicity), or are sometimes missing. To address this issue we propose to use recurrent networks with feedback into the input units, based on two fundamental ideas. The first motivation is that the “filled-in” value of the missing variable may not only depend in complicated ways on the value of this variable in the past of the sequence but also on the current and past values of other variables. The second motivation is that, for the purpose of making predictions or taking decisions, it is not always necessary to fill in the best possible value of the missing variables. In fact, it is sufficient to fill in a value which helps the system make better predictions or decisions. The advantages of this approach are demonstrated through experiments on several tasks.},
1154 topics={Finance,Missing},cat={J},
1155 }
1156
1157 @INPROCEEDINGS{Bengio_icassp90,
1158 author = {Bengio, Yoshua and Cardin, Regis and De Mori, Renato and Normandin, Yves},
1159 title = {A Hybrid Coder for Hidden {M}arkov Models Using a Recurrent Neural Network},
1160 booktitle = {International Conference on Acoustics, Speech and Signal Processing},
1161 year = {1990},
1162 pages = {537--540},
1163 topics={Markov,Speech},cat={C},
1164 }
1165
1166 @INPROCEEDINGS{Bengio_LeCun94,
1167 author = {Bengio, Yoshua and {LeCun}, Yann and Henderson, Donnie},
1168 title = {Globally Trained Handwritten Word Recognizer using Spatial Representation, Space Displacement Neural Networks and Hidden {M}arkov Models},
1169 year = {1994},
1170 pages = {937--944},
1171 crossref = {NIPS6-shorter},
1172 abstract = {We introduce a new approach for on-line recognition of handwritten words written in unconstrained mixed style. The preprocessor performs a word-level normalization by fitting a model of the word structure using the {EM} algorithm. Words are then coded into low resolution “annotated images” where each pixel contains information about trajectory direction and curvature. The recognizer is a convolution network which can be spatially replicated. From the network output, a hidden {Markov} model produces word scores. The entire system is globally trained to minimize word-level errors.},
1173 topics={Speech},cat={C},
1174 }
1175
1176 @ARTICLE{Bengio_LeCun95,
1177 author = {Bengio, Yoshua and {LeCun}, Yann and Nohl, Craig and Burges, Chris},
1178 title = {LeRec: A {NN}/{HMM} Hybrid for On-Line Handwriting Recognition},
1179 journal = {Neural Computation},
1180 volume = {7},
1181 number = {6},
1182 year = {1995},
1183 pages = {1289--1303},
1184 abstract = {We introduce a new approach for on-line recognition of handwritten words written in unconstrained mixed style. The preprocessor performs a word-level normalization by fitting a model of the word structure using the {EM} algorithm. Words are then coded into low resolution “annotated images” where each pixel contains information about trajectory direction and curvature. The recognizer is a convolution network which can be spatially replicated. From the network output, a hidden {Markov} model produces word scores. The entire system is globally trained to minimize word-level errors.},
1185 topics={PriorKnowledge,Speech},cat={J},
1186 }
1187
1188 @ARTICLE{Bengio_prel92,
1189 author = {Bengio, Yoshua and Gori, Marco and De Mori, Renato},
1190 title = {Learning the Dynamic Nature of Speech with Back-propagation for Sequences},
1191 journal = {Pattern Recognition Letters},
1192 volume = {13},
1193 number = {5},
1194 year = {1992},
1195 pages = {375--385},
1196 note = {(Special issue on Artificial Neural Networks)},
1197 topics={Speech},cat={J},
1198 }
1199
1200 @ARTICLE{Bengio_trnn92,
1201 author = {Bengio, Yoshua and De Mori, Renato and Flammia, Giovanni and Kompe, Ralf},
1202 title = {Global Optimization of a Neural Network-Hidden {M}arkov Model Hybrid},
1203 journal = {IEEE Transactions on Neural Networks},
1204 volume = {3},
1205 number = {2},
1206 year = {1992},
1207 pages = {252--259},
1208 topics={Markov},cat={J},
1209 }
1210
1211 @TECHREPORT{Bergstra+2009,
1212 author = {Bergstra, James and Desjardins, Guillaume and Lamblin, Pascal and Bengio, Yoshua},
1213 title = {Quadratic Polynomials Learn Better Image Features},
1214 number = {1337},
1215 year = {2009},
1216 institution = {D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
1217 abstract = {The affine-sigmoidal hidden unit (of the form $\sigma(ax+b)$)
1218 is a crude predictor of neuron response in visual area V1.
1219 More descriptive models of V1 have been advanced that are no more computationally expensive,
1220 yet artificial neural network research continues to focus on networks of affine-sigmoidal models.
1221 This paper identifies two qualitative differences between the affine-sigmoidal hidden unit
1222 and a particular recent model of V1 response:
1223 a) the presence of a low-rank quadratic term in the argument to $\sigma$,
1224 and b) the use of a gentler non-linearity than the $\tanh$ or logistic sigmoid.
1225 We evaluate these model ingredients by training single-layer
1226 neural networks to solve three image classification tasks.
1227 We experimented with fully-connected hidden units,
1228 as well as locally-connected units and convolutional units
1229 that more closely mimic the function and connectivity of the visual system.
1230 On all three tasks, both the quadratic interactions and the gentler non-linearity
1231 lead to significantly better generalization.
1232 The advantage of quadratic units was strongest in conjunction with sparse and convolutional hidden units.}
1233 }
1234
1235 @MISC{bergstra+al:2010-scipy,
1236 author = {Bergstra, James},
1237 title = {Optimized Symbolic Expressions and {GPU} Metaprogramming with Theano},
1238 year = {2010},
1239 howpublished = {{SciPy}},
1240 note = {Oral}
1241 }
1242
1243 @MISC{bergstra+al:2010-sharcnet,
1244 author = {Bergstra, James and Bengio, Yoshua},
1245 title = {{GPU} Programming with Theano},
1246 year = {2010},
1247 howpublished = {{SHARCNET} Research Day},
1248 note = {Oral}
1249 }
1250
1251 @MISC{bergstra+al:2010snowbird,
1252 author = {Bergstra, James and Breuleux, Olivier and Bastien, Fr{\'{e}}d{\'{e}}ric and Lamblin, Pascal and Turian, Joseph and Desjardins, Guillaume and Pascanu, Razvan and Erhan, Dumitru and Delalleau, Olivier and Bengio, Yoshua},
1253 title = {Deep Learning on {GPU}s with Theano},
1254 howpublished = {The Learning Workshop},
1255 year = {2010},
1256 note = {Oral}
1257 }
1258
1259 @INPROCEEDINGS{Bergstra+Bengio-2009,
1260 author = {Bergstra, James and Bengio, Yoshua},
1261 title = {Slow, Decorrelated Features for Pretraining Complex Cell-like Networks},
1262 year = {2009},
1263 crossref = {NIPS22}
1264 }
1265
1266 @ARTICLE{bergstra+casagrande+erhan+eck+kegl:2006,
1267 author = {Bergstra, James and Casagrande, Norman and Erhan, Dumitru and Eck, Douglas and K{\'{e}}gl, Bal{\'{a}}zs},
1268 title = {Aggregate Features and AdaBoost for Music Classification},
1269 journal = {Machine Learning},
1270 volume = {65},
1271 year = {2006},
1272 pages = {473--484},
1273 issn = {0885-6125},
1274 abstract = {We present an algorithm that predicts musical genre and artist from an audio waveform. Our method uses the ensemble learner ADABOOST to select from a set of audio features that have been extracted from segmented audio and then aggregated. Our classifier proved to be the most effective method for genre classification at the recent MIREX 2005 international contests in music information extraction, and the second-best method for recognizing artists. This paper describes our method in detail, from feature extraction to song classification, and presents an evaluation of our method on three genre databases and two artist-recognition databases. Furthermore, we present evidence collected from a variety of popular features and classifiers that the technique of classifying features aggregated over segments of audio is better than classifying either entire songs or individual short-timescale features.},
1275 PDF = {papers/2006_ml_draft.pdf},
1276 SOURCE = {OwnPublication},
1277 }
1278
1279 @INPROCEEDINGS{bergstra+lacoste+eck:2006,
1280 author = {Bergstra, James and Lacoste, Alexandre and Eck, Douglas},
1281 title = {Predicting Genre Labels for Artists using FreeDB},
1282 booktitle = {Proc. 7th International Conference on Music Information Retrieval (ISMIR)},
1283 year = {2006},
1284 SOURCE = {OwnPublication},
1285 PDF = {papers/2006_ismir_freedb.pdf},
1286 }
1287
1288 @INPROCEEDINGS{bergstra+mandel+eck:2010,
1289 author = {Bergstra, James and Mandel, Michael and Eck, Douglas},
1290 title = {Scalable Genre and Tag Prediction with Spectral Covariance},
1291 booktitle = {{ISMIR}},
1292 year = {2010},
1293 note = {accepted}
1294 }
1295
1296 @MASTERSTHESIS{Bergstra-Msc-2006,
1297 author = {Bergstra, James},
1298 keywords = {apprentissage statistique, classification de musique par genre, extraction de caract{\'{e}}ristiques sonores, recherche d'information musicale},
1299 title = {Algorithms for Classifying Recorded Music by Genre},
1300 year = {2006},
1301 school = {Universit{\'{e}} de Montr{\'{e}}al},
1302 abstract = {Ce m{\'{e}}moire traite le probl{\`{e}}me de la classification automatique de signaux musicaux par genre. Dans un premier temps, je pr{\'{e}}sente une technique utilisant l'apprentissage machine pour classifier des statistiques extraites sur des segments du signal sonore. Malgr{\'{e}} le fait que cette technique a d{\'{e}}j{\`{a}} {\'{e}}t{\'{e}} explor{\'{e}}e, mon m{\'{e}}moire est le premier {\`{a}} investiguer l'influence de la longueur et de la quantit{\'{e}} de ces segments sur le taux de classification. J'explore {\'{e}}galement l'importance d'avoir des segments contigus dans le temps. Les segments d'une {\`{a}} trois secondes apportent une meilleure performance, mais pour ce faire, ils doivent {\^{e}}tre suffisamment nombreux. Il peut m{\^{e}}me {\^{e}}tre utile d'augmenter la quantit{\'{e}} de segments jusqu'{\`{a}} ce qu'ils se chevauchent. Dans les m{\^{e}}mes exp{\'{e}}riences, je pr{\'{e}}sente une formulation alternative des descripteurs d'audio nomm{\'{e}}e Melfrequency Cepstral Coefficient (MFCC) qui am{\`{e}}ne un taux de classification de 81 \% sur un jeux de donn{\'{e}}es pour lequel la meilleure performance publi{\'{e}}e est de 71 \%. Cette m{\'{e}}thode de segmentation des chansons, ainsi que cette formulation alternative, ont pour but d'am{\'{e}}liorer l'algorithme gagnant du concours de classification de genre de MIREX 2005, d{\'{e}}velopp{\'{e}} par Norman Casagrande et moi. Ces exp{\'{e}}riences sont un approfondissement du travail entam{\'{e}} par Bergstra et al. [2006a], qui d{\'{e}}crit l'algorithme gagnant de ce concours.
1303 Dans un deuxi{\`{e}}me temps, je pr{\'{e}}sent une m{\'{e}}thode qui utilise FreeDB, une base de donn{\'{e}}es d'information sur les albums, pour attribuer {\`{a}} un artiste une distribution de probabilit{\'{e}} sur son genre. Avec une petite base de donn{\'{e}}es, faite {\`{a}} la main, je montre qu'il y a une haute corr{\'{e}}lation entre cette distribution et l'{\'{e}}tiquette de genre traditionnel. Bien qu'il reste {\`{a}} d{\'{e}}montrer que cette m{\'{e}}thode est utile pour organiser une collection de musique, ce r{\'{e}}sultat sugg{\`{e}}re qu'on peut maintenant {\'{e}}tiqueter de grandes bases de musique automatiquement {\`{a}} un faible co{\^{u}}t, et par cons{\'{e}}quent de poursuivre plus facilement la recherche en classification {\`{a}} grande {\'{e}}chelle. Ce travail sera publi{\'{e}} comme Bergstra et al. [2006b] {\`{a}} ISMIR 2006.}
1304 }
1305
1306 @INPROCEEDINGS{bergstra:2010cosyne,
1307 author = {Bergstra, James and Bengio, Yoshua and Lamblin, Pascal and Desjardins, Guillaume and Louradour, Jerome},
1308 title = {Image classification with complex cell neural networks},
1309 booktitle = {Computational and systems neuroscience (COSYNE)},
1310 year = {2010},
1311 note = {Poster},
1312 url = {http://www.frontiersin.org/conferences/individual_abstract_listing.php?conferid=770&pap=3626&ind_abs=1&pg=335},
1313 doi = {10.3389/conf.fnins.2010.03.00334}
1314 }
1315
1316 @INPROCEEDINGS{biaslearn:2000:ijcnn,
1317 author = {Ghosn, Joumana and Bengio, Yoshua},
1318 title = {Bias Learning, Knowledge Sharing},
1319 booktitle = {International Joint Conference on Neural Networks 2000},
1320 volume = {I},
1321 year = {2000},
1322 pages = {9--14},
1323 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/ijcnn_manifold.pdf},
1324 abstract = {Biasing the hypothesis space of a learner has been shown to improve generalisation performances. Methods for achieving this goal have been proposed, that range from deriving and introducing a bias into a learner to automatically learning the bias. In the latter case, most methods learn the bias by simultaneously training several related tasks derived from the same domain and imposing constraints on their parameters. We extend some of the ideas presented in this field and describe a new model that parameterizes the parameters of each task as a function of an affine manifold defined in parameter space and a point lying on the manifold. An analysis of variance on a class of learning tasks is performed that shows some significantly improved performances when using the model.},
1325 topics={MultiTask},cat={C},
1326 }
1327
1328 @ARTICLE{biaslearn:2003:tnn,
1329 author = {Ghosn, Joumana and Bengio, Yoshua},
1330 title = {Bias Learning, Knowledge Sharing},
1331 journal = {IEEE Transaction on Neural Networks},
1332 volume = {14},
1333 number = {4},
1334 year = {2003},
1335 pages = {748--765},
1336 abstract = {Biasing properly the hypothesis space of a learner has been shown to improve generalization performance. Methods for achieving this goal have been proposed, that range from designing and introducing a bias into a learner to automatically learning the bias. Multitask learning methods fall into the latter category. When several related tasks derived from the same domain are available, these methods use the domain-related knowledge coded in the training examples of all the tasks as a source of bias. We extend some of the ideas presented in this field and describe a new approach that identifies a family of hypotheses, represented by a manifold in hypothesis space, that embodies domain-related knowledge. This family is learned using training examples sampled from a group of related tasks. Learning models trained on these tasks are only allowed to select hypotheses that belong to the family. We show that the new approach encompasses a large variety of families which can be learned. A statistical analysis on a class of related tasks is performed that shows significantly improved performances when using this approach.},
1337 topics={MultiTask},cat={J},
1338 }
1339
1340 @MASTERSTHESIS{Boisvert-Mcs-2005,
1341 author = {Boisvert, Maryse},
1342 keywords = {Algorithme {EM} , D{\'{e}}composition en valeurs singuli{\`{e}}res , D{\'{e}}sambigu{\"{\i}}sation s{\'{e}}mantique , Mod{\`{e}}les graphiques, WordNet },
1343 title = {R{\'{e}}duction de dimension pour mod{\`{e}}les graphiques probabilistes appliqu{\'{e}}s {\`{a}} la d{\'{e}}sambiguisation s{\'{e}}mantique},
1344 year = {2005},
1345 school = {Universit{\'{e}} de Montr{\'{e}}al}
1346 }
1347
1348 @INPROCEEDINGS{bonneville98,
1349 author = {Bonneville, Martin and Meunier, Jean and Bengio, Yoshua and Soucy, Jean-Paul},
1350 title = {Support Vector Machines for Improving the classification of Brain Pet Images},
1351 booktitle = {SPIE Medical Imaging},
1352 year = {1998},
1353 topics={Kernel},cat={C},
1354 }
1355
1356 @INPROCEEDINGS{Bottou+Bengio95,
1357 author = {Bottou, {L{\'{e}}on} and Bengio, Yoshua},
1358 title = {Convergence Properties of the {K}-Means Algorithm},
1359 year = {1995},
1360 pages = {585--592},
1361 crossref = {NIPS7-shorter},
1362 abstract = {This paper studies the convergence properties of the well known K-Means clustering algorithm. The K-Means algorithm can be described either as a gradient descent algorithm or by slightly extending the mathematics of the {EM} algorithm to this hard threshold case. We show that the K-Means algorithm actually minimizes the quantization error using the very fast Newton algorithm.},
1363 topics={Unsupervised},cat={C},
1364 }
1365
1366 @ARTICLE{bottou-98,
1367 author = {Bottou, {L{\'{e}}on} and Haffner, Patrick and G. Howard, Paul and Simard, Patrice and Bengio, Yoshua and {LeCun}, Yann},
1368 title = {High Quality Document Image Compression with {DjVu}},
1369 journal = {Journal of Electronic Imaging},
1370 volume = {7},
1371 number = {3},
1372 year = {1998},
1373 pages = {410--425},
1374 topics={Compression},cat={J},
1375 }
1376
1377 @INPROCEEDINGS{Bottou-dcc98,
1378 author = {Bottou, {L{\'{e}}on} and G. Howard, Paul and Bengio, Yoshua},
1379 editor = {{IEEE Computer Society}},
1380 title = {The Z-Coder Adaptive Binary Coder},
1381 booktitle = {Data Compression Conference},
1382 year = {1998},
1383 url = {http://leon.bottou.org/papers/bottou-howard-bengio-98},
1384 topics={Compression},cat={C},
1385 }
1386
1387 @INPROCEEDINGS{bottou-lecun-bengio-97,
1388 author = {Bottou, {L{\'{e}}on} and {LeCun}, Yann and Bengio, Yoshua},
1389 title = {Global Training of Document Processing Systems using Graph Transformer Networks},
1390 booktitle = {Proc. of Computer Vision and Pattern Recognition},
1391 year = {1997},
1392 pages = {490--494},
1393 publisher = {IEEE},
1394 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bottou-lecun-bengio-97.ps.gz},
1395 topics={PriorKnowledge,Speech},cat={C},
1396 }
1397
1398 @TECHREPORT{bottou96TR,
1399 author = {Bottou, {L{\'{e}}on} and Bengio, Yoshua and {LeCun}, Yann},
1400 title = {Document analysis with transducers},
1401 number = {Technical Memorandum HA615600-960701-01TM},
1402 year = {1996},
1403 institution = {AT\&T Labs},
1404 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/transducer-tm.ps.gz},
1405 topics={HighDimensional},cat={T},
1406 }
1407
1408 @TECHREPORT{bottou97TR,
1409 author = {Bottou, {L{\'{e}}on} and Bengio, Yoshua and G. Howard, Paul},
1410 title = {Z-Coder: A Fast Adaptive Binary Arithmetic Coder},
1411 number = {Technical Memorandum HA615600-970721-02TM},
1412 year = {1997},
1413 institution = {AT\&T Labs},
1414 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/zcoder-tm.ps.gz},
1415 topics={Compression},cat={T},
1416 }
1417
1418 @MASTERSTHESIS{Bouchard-Msc-2007,
1419 author = {Bouchard, Lysiane},
1420 keywords = {auditory cortex, fMRI, linear classifier, logistic regression, na{\"{\i}}ve bayesian gaussian model, neuroimaging, spectro-temporal modulation, support vectors machine},
1421 title = {Analyse par apprentissage automatique des r{\'{e}}ponses fMRI du cortex auditif {\`{a}} des modulations spectro-temporelles.},
1422 year = {2009},
1423 school = {Universit{\'{e}} de Montr{\'{e}}al},
1424 abstract = {The application of linear machine learning classifiers to the analysis of brain imaging data (fMRI) has led to several interesting breakthroughs in recent years. These classifiers combine the responses of the voxels to detect and categorize different brain states. They allow a more agnostic analysis than conventional fMRI analysis that systematically treats weak and distributed patterns as unwanted noise. In this project, we use such classifiers to validate an hypothesis concerning the encoding of sounds in the human brain. More precisely, we attempt to locate neurons tuned to spectral and temporal modulations in sound. We use fMRI recordings of brain responses of subjects listening to 49 different spectro-temporal modulations. The analysis of fMRI data through linear classifiers is not yet a standard procedure in this field. Thus, an important objective of this project, in the long term, is the development of new machine learning algorithms specialized for neuroimaging data. For these reasons, an important part of the experiments is dedicated to studying the behaviour of the classifiers. We are mainly interested in 3 standard linear classifiers, namely the support vectors machine algorithm (linear), the logistic regression algorithm (regularized) and the na{\"{\i}}ve bayesian gaussian model (shared variances).}
1425 }
1426
1427 @PHDTHESIS{Boufaden-Phd-2005,
1428 author = {Boufaden, Narj{\`{e}}s},
1429 title = {Extraction d'information {\`{a}} partir de transcriptions de conversations t{\'{e}}l{\'{e}}phoniques sp{\'{e}}cialis{\'{e}}es},
1430 year = {2005},
1431 school = {Universit{\'{e}} de Montr{\'{e}}al, D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle}
1432 }
1433
1434 @INPROCEEDINGS{Carreau+Bengio-2007,
1435 author = {Carreau, Julie and Bengio, Yoshua},
1436 title = {A Hybrid {Pareto} Model for Conditional Density Estimation of Asymmetric Fat-Tail Data},
1437 booktitle = {Proceedings of the Eleventh International Conference on Artificial Intelligence and Statistics (AISTATS'07)},
1438 year = {2007},
1439 publisher = {Omnipress},
1440 abstract = {We propose an estimator for the conditional density p(Y|X) that can adapt for asymmetric heavy tails which might depend on X. Such estimators have important applications in finance and insurance. We draw from Extreme Value Theory the tools to build a hybrid unimodal density having a parameter controlling the heaviness of the upper tail. This hybrid is a Gaussian whose upper tail has been replaced by a generalized {Pareto} tail. We use this hybrid in a multi-modal mixture in order to obtain a nonparametric density estimator that can easily adapt for heavy tailed data. To obtain a conditional density estimator, the parameters of the mixture estimator can be seen as
1441 functions of X and these functions learned. We show experimentally that this approach better models the conditional density in terms of likelihood than compared competing algorithms: conditional mixture models with other types of components and multivariate nonparametric models.},
1442 date={21-24}
1443 }
1444
1445 @ARTICLE{Carreau+Bengio-2009,
1446 author = {Carreau, Julie and Bengio, Yoshua},
1447 title = {A Hybrid {Pareto} Mixture for Conditional Asymmetric Fat-Tailed Distributions},
1448 journal = {IEEE Transactions on Neural Networks},
1449 volume = {20},
1450 number = {7},
1451 year = {2009},
1452 pages = {1087--1101},
1453 issn = {1045-9227},
1454 abstract = {In many cases, we observe some variables X that contain predictive information over a scalar variable of interest Y, with (X,Y) pairs observed in a training set. We can take advantage of this information to estimate the conditional density P(Y|X = x). In this paper, we propose a conditional mixture model with hybrid {Pareto} components to estimate P(Y|X = x). The hybrid {Pareto} is a Gaussian whose upper tail has been replaced by a generalized {Pareto} tail. A third parameter, in addition to the location and spread parameters of the Gaussian, controls the heaviness of the upper tail. Using the hybrid {Pareto} in a mixture model results in a nonparametric estimator that can adapt to multimodality, asymmetry, and heavy tails. A conditional density estimator is built by modeling the parameters of the mixture estimator as functions of X. We use a neural network to implement these functions. Such conditional density estimators have important applications in many domains such as finance and insurance. We show experimentally that this novel approach better models the conditional density in terms of likelihood, compared to competing algorithms: conditional mixture models with other types of components and a classical kernel-based nonparametric model.}
1455 }
1456
1457 @ARTICLE{Carreau+Bengio-extreme-2009,
1458 author = {Carreau, Julie and Bengio, Yoshua},
1459 title = {A Hybrid {Pareto} Model for Asymmetric Fat-Tailed Data: the univariate case},
1460 journal = {Extremes},
1461 volume = {12},
1462 number = {1},
1463 year = {2009},
1464 pages = {53--76},
1465 abstract = {Density estimators that can adapt to asymmetric heavy tails are required in many applications such as finance and insurance. Extreme Value Theory (EVT) has developped principled methods based on asymptotic results to estimate the tails of most distributions. However, the finite sample approximation might introduce a severe bias in many cases. Moreover, the full range of the distribution is often needed, not only the tail area. On the other hand, non-parametric methods, while being powerful where data are abundant, fail to extrapolate properly in the tail area. We put forward a non-parametric density estimator that brings together the strengths of non-parametric density estimation and of EVT. A hybrid {Pareto} distribution that can be used in a mixture model is proposed to extend the generalized {Pareto} (GP) to the whole real axis. Experiments on simulated data show the following. On one hand, the mixture of hybrid {Pareto}s converges faster in terms of log-likelihood and provides good estimates of the tail of the distributions when compared with other density estimators including the GP distribution. On the other hand, the mixture of hybrid {Pareto}s offers an alternate way to estimate the tail index which is comparable to the one estimated with the standard GP methodology. The mixture of hybrids is also evaluated on the Danish fire insurance data set.}
1466 }
1467
1468 @PHDTHESIS{Carreau-PhD-2007,
1469 author = {Carreau, Julie},
1470 keywords = {density estimation, extreme values, generalized {Pareto} distribution, heavy-tailed distribution, mixture of distributions, neural networks},
1471 title = {Mod{\`{e}}les {Pareto} hybrides pour distributions asym{\'{e}}triques et {\`{a}} queues lourdes},
1472 year = {2007},
1473 school = {Universit{\'{e}} de Montr{\'{e}}al},
1474 abstract = {We put forward a class of density estimators that can adapt to asymmetric, multi-modal and heavy-tailed distributions. Such distributions occur in many application domains such as finance and insurance. Mixture of gaussians are flexible non-parametric density estimators that have good approximation properties when the number of components is well chosen with respect to the training set size. However, those models are performing poorly on heavy-tailed data because few observations occur in the tail area. To solve this problem, we resort to extreme value theory where methods based on sound parametric assumptions have been developped to enable extrapolation beyond the range of the observations. More precisely, we build on the PoT method that was developped in hydrology where PoT stands for "Peaks-over-Threshold". The observations exceeding a given threshold are modeled by the generalized {Pareto} distribution. This distribution can approximate arbitrarily well the tail of most distributions. We build a new distribution, the hybrid {Pareto}, by stitching together a truncated Normal and a generalized {Pareto} distribution. We impose continuity constraints at the junction point. The hybrid {Pareto} is thus a smooth distribution that can be used in a mixture model. The behavior of the upper tail of the hybrid is similar to the behavior of the generalized {Pareto} tail. Moreover, the threshold inherent in the the PoT methodology can now be defined implicitly as the junction point of the component with the heaviest tail. This component also determines the tail index of the mixture. Hence, the hybrid {Pareto} mixture offers an alternate way to estimate the tail index associated with heavy-tailed data. In several applications, information that has predictive power on the variable of interest is available. In that case, we want to model the conditional density of Y given X, the vector containing predictive information. 
When the distribution of Y given X is asymmetric, multi-modal and heavy-tailed, we propose to use a mixture of hybrid {Pareto}s whose parameters are functions of X. Those functions are implemented by means of a neural network with one hidden layer. Neural networks are non-parametric models that can, in principle, approximate any continuous function. Experiments on artificial and real data sets show that the hybrid {Pareto} mixture, unconditional and conditional, outperforms other density estimators in terms of log-likelihood.}
1475 }
1476
1477 @INPROCEEDINGS{casagrande+eck+kegl:icmc2005,
1478 author = {Casagrande, Norman and Eck, Douglas and K{\'{e}}gl, Bal{\'{a}}zs},
1479 title = {Geometry in Sound: A Speech/Music Audio Classifier Inspired by an Image Classifier},
1480 booktitle = {{Proceedings of the International Computer Music Conference (ICMC)}},
1481 year = {2005},
1482 pages = {207--210},
1483 url = {http://www.iro.umontreal.ca/~eckdoug/papers/2005_icmc_casagrande.pdf},
1484 source={OwnPublication},
1485 sourcetype={Conference},
1486 }
1487
1488 @INPROCEEDINGS{casagrande+eck+kegl:ismir2005,
1489 author = {Casagrande, Norman and Eck, Douglas and K{\'{e}}gl, Bal{\'{a}}zs},
1490 title = {Frame-Level Audio Feature Extraction using {A}da{B}oost},
1491 booktitle = {{Proceedings of the 6th International Conference on Music Information Retrieval ({ISMIR} 2005)}},
1492 year = {2005},
1493 pages = {345--350},
1494 url = {http://www.iro.umontreal.ca/~eckdoug/papers/2005_ismir_casagrande.pdf},
1495 source={OwnPublication},
1496 sourcetype={Conference},
1497 }
1498
1499 @PROCEEDINGS{ccai2006,
1500 editor = {Lamontagne, Luc and Marchand, Mario},
1501 title = {Advances in Artificial Intelligence, 19th Conference of the Canadian Society for Computational Studies of Intelligence, Canadian AI 2006, Qu{\'{e}}bec City, Qu{\'{e}}bec, Canada, June 7-9, 2006, Proceedings},
1502 booktitle = {Canadian Conference on AI},
1503 series = {Lecture Notes in Computer Science},
1504 volume = {4013},
1505 year = {2006},
1506 publisher = {Springer}
1507 }
1508
1509 @INPROCEEDINGS{Chapados+Bengio-2006,
1510 author = {Chapados, Nicolas and Bengio, Yoshua},
1511 title = {The K Best-Paths Approach to Approximate Dynamic Programming with Application to Portfolio Optimization},
1512 booktitle = {AI06},
1513 year = {2006},
1514 pages = {491--502}
1515 }
1516
1517 @INPROCEEDINGS{Chapados+Bengio-2007,
1518 author = {Chapados, Nicolas and Bengio, Yoshua},
1519 title = {Forecasting Commodity Contract Spreads with Gaussian Process},
1520 booktitle = {13th International Conference on Computing in Economics and Finance},
1521 year = {2007},
1522 abstract = {We introduce a functional representation of time series which allows forecasts to be performed over an unspecified horizon with progressively-revealed information sets. By virtue of using Gaussian processes, a complete covariance matrix between forecasts at several time-steps is available. This information is put to use in an application to actively trade price spreads between commodity futures contracts. The approach delivers impressive out-of-sample risk-adjusted returns after transaction costs on a portfolio of 30 spreads.}
1523 }
1524
1525 @ARTICLE{Chapados+Bengio-2008-JOC,
1526 author = {Chapados, Nicolas and Bengio, Yoshua},
1527 title = {Noisy K Best-Paths for Approximate Dynamic Programming with Application to Portfolio Optimization},
1528 journal = {Journal of Computers},
1529 volume = {2},
1530 number = {1},
1531 year = {2007},
1532 pages = {12--19},
1533 abstract = {We describe a general method to transform a non-Markovian sequential decision problem into a supervised learning problem using a K-bestpaths algorithm. We consider an application in financial portfolio management where we can train a controller to directly optimize a Sharpe Ratio (or other risk-averse non-additive) utility function. We illustrate the approach by demonstrating experimental results using a kernel-based controller architecture that would not normally be considered in traditional
1534 reinforcement learning or approximate dynamic programming.We further show that using a non-additive criterion (incremental Sharpe Ratio) yields a noisy K-best-paths extraction problem, that can give substantially improved performance.}
1535 }
1536
1537 @MASTERSTHESIS{Chapados-Msc-2000,
1538 author = {Chapados, Nicolas},
1539 title = {Crit{\`{e}}res d'optimisation d'algorithmes d'apprentissage en gestion de portefeuille},
1540 year = {2000},
1541 school = {Universit{\'{e}} de Montr{\'{e}}al}
1542 }
1543
1544 @INPROCEEDINGS{chapados2000,
1545 author = {Chapados, Nicolas and Bengio, Yoshua},
1546 title = {Cost Functions and Model Combination for {VaR}-Based Asset Allocation Using Neural Networks},
1547 booktitle = {Computational Finance 2000},
1548 year = {2000},
1549 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/compfin2000_final.pdf},
1550 abstract = {We introduce an asset-allocation framework based on the active control of the value-at-risk of the portfolio. Within this framework, we compare two paradigms for making the allocation using neural networks. The first one uses the network to make a forecast of asset behavior, in conjunction with a traditional mean-variance allocator for constructing the portfolio. The second paradigm uses the network to directly make the portfolio allocation decisions. We consider a method for performing soft input variable selection, and show its considerable utility. We use model combination (committee) methods to systematize the choice of hyperparemeters during training. We show that committees using both paradigms are significantly outperforming the benchmark market performance.},
1551 topics={Finance},cat={C},
1552 }
1553
1554 @ARTICLE{chapados:2001,
1555 author = {Chapados, Nicolas and Bengio, Yoshua},
1556 title = {Cost Functions and Model Combination for VaR--based Asset Allocation using Neural Networks},
1557 journal = {IEEE Transactions on Neural Networks},
1558 volume = {12},
1559 number = {4},
1560 year = {2001},
1561 pages = {890--906},
1562 abstract = {We introduce an asset-allocation framework based on the active control of the value-at-risk of the portfolio. Within this framework, we
1563 compare two paradigms for making the allocation using neural networks. The first one uses the network to make a forecast of asset behavior, in conjunction with a traditional mean-variance allocator for constructing the portfolio. The second paradigm uses the network to directly make the portfolio allocation decisions. We consider a method for performing soft input variable selection, and show its considerable utility. We use model combination (committee) methods to systematize the choice of hyperparemeters during training. We show that committees
1564 using both paradigms are significantly outperforming the benchmark market performance.},
1565 topics={Finance},cat={J},
1566 }
1567
1568 @ARTICLE{chapados:2003,
1569 author = {Bengio, Yoshua and Chapados, Nicolas},
1570 title = {Extensions to Metric-Based Model Selection},
1571 year = {2003},
1572 crossref = {JMLR-shorter},
1573 abstract = {Metric-based methods have recently been introduced for model selection and regularization, often yielding very significant improvements over the alternatives tried (including cross-validation). All these methods require unlabeled data over which to compare functions and detect gross differences in behavior away from the training points. We introduce three new extensions of the metric model selection methods and apply them to feature selection. The first extension takes advantage of the particular case of time-series data in which the task involves prediction with a horizon h. The idea is to use at t the h unlabeled examples that precede t for model selection. The second extension takes advantage of the different error distributions of cross-validation and the metric methods: cross-validation tends to have a larger variance and is unbiased. A hybrid combining the two model selection methods is rarely beaten by any of the two methods. The third extension deals with the case when unlabeled data is not available at all, using an estimated input density. Experiments are described to study these extensions in the context of capacity control and feature subset selection.},
1574 topics={ModelSelection,Finance},cat={J},
1575 }
1576
% Journal article in Machine Learning. NOTE(review): fixed abstract typo
% ("size in small" -> "size is small"); volume/number/pages of the journal
% version are missing -- verify against Machine Learning and add them.
@ARTICLE{chapelle:2001,
author = {Chapelle, Olivier and Vapnik, Vladimir and Bengio, Yoshua},
title = {Model Selection for Small Sample Regression},
journal = {Machine Learning},
year = {2001},
abstract = {Model selection is an important ingredient of many machine learning algorithms, in particular when the sample size is small, in order to strike the right trade-off between overfitting and underfitting. Previous classical results for linear regression are based on an asymptotic analysis. We present a new penalization method for performing model selection for regression that is appropriate even for small samples. Our penalization is based on an accurate estimator of the ratio of the expected training error and the expected generalization error, in terms of the expected eigenvalues of the input covariance matrix.},
topics={ModelSelection},cat={J},
}
1585
% Chapter in Kolen & Kremer (eds.), "Field Guide to Dynamical Recurrent
% Networks", IEEE Press, 2001. The trailing topics/cat fields are Aigaion
% export metadata, ignored by standard BibTeX styles.
@INCOLLECTION{chapter-eval-longterm-2001,
author = {Schmidhuber, Juergen and Hochreiter, Sepp and Bengio, Yoshua},
editor = {Kolen, J. and Kremer, S.},
title = {Evaluating Benchmark Problems by Random Guessing},
booktitle = {Field Guide to Dynamical Recurrent Networks},
year = {2001},
publisher = {IEEE Press},
topics={LongTerm},cat={B},
}
1595
% Chapter in Haykin & Kosko (eds.), "Intelligent Signal Processing", IEEE
% Press, 2001. NOTE(review): fixed abstract typos only (backpropagation,
% minimize, performance, bank checks, provide); no field data changed.
@INCOLLECTION{chapter-gradient-document-2001,
author = {{LeCun}, Yann and Bottou, {L{\'{e}}on} and Bengio, Yoshua and Haffner, Patrick},
editor = {Haykin, S. and Kosko, B.},
title = {Gradient-Based Learning Applied to Document Recognition},
booktitle = {Intelligent Signal Processing},
year = {2001},
pages = {306--351},
publisher = {IEEE Press},
url = {http://www.iro.umontreal.ca/~lisa/pointeurs/lecun-01a.pdf},
abstract = {Multilayer Neural Networks trained with a backpropagation algorithm constitute the best example of a successful Gradient-Based Learning technique. Given an appropriate network architecture, Gradient-Based Learning algorithms can be used to synthesize a complex decision surface that can classify high-dimensional patterns such as handwritten characters, with minimal preprocessing. This paper reviews various methods applied to handwritten character recognition and compares them on a standard handwritten digit recognition task. Convolutional Neural Networks, that are specifically designed to deal with the variability of 2D shapes, are shown to outperform all other techniques.
Real-life document recognition systems are composed of multiple modules including field extraction, segmentation, recognition, and language modeling. A new learning paradigm, called Graph Transformer Networks (GTN), allows such multi-module systems to be trained globally using Gradient-Based methods so as to minimize an overall performance measure.
Two systems for on-line handwriting recognition are described. Experiments demonstrate the advantage of global training, and the flexibility of Graph Transformer Networks.
A Graph Transformer Network for reading bank checks is also described. It uses Convolutional Neural Network character recognizers combined with a global training technique to provide record accuracy on business and personal checks. It is deployed commercially and reads several million checks per day.},
topics={PriorKnowledge,Speech},cat={B},
}
1611
% Chapter in the same Kolen & Kremer volume as chapter-eval-longterm-2001
% (Field Guide to Dynamical Recurrent Networks, IEEE Press, 2001).
@INCOLLECTION{chapter-gradient-flow-2001,
author = {Hochreiter, Sepp and Bengio, Yoshua and Frasconi, Paolo},
editor = {Kolen, J. and Kremer, S.},
title = {Gradient Flow in Recurrent Nets: the Difficulty of Learning Long-Term Dependencies},
booktitle = {Field Guide to Dynamical Recurrent Networks},
year = {2001},
publisher = {IEEE Press},
topics={LongTerm},cat={B},
}
1621
% Conference paper, Midwest AI and Cognitive Science Society 1999.
% source/sourcetype are Aigaion export metadata ignored by BibTeX styles.
@INPROCEEDINGS{chemero+eck:1999,
author = {Chemero, T. and Eck, Douglas},
title = {An Exploration of Representational Complexity via Coupled Oscillators},
booktitle = {{Proceedings of the Tenth Midwest Artificial Intelligence and Cognitive Science Society}},
year = {1999},
publisher = {MIT Press},
url = {http://www.iro.umontreal.ca/~eckdoug/papers/1999_chemero.pdf},
abstract = {We note some inconsistencies in a view of representation which takes {\it decoupling} to be of key importance. We explore these inconsistencies using examples of representational vehicles taken from coupled oscillator theory and suggest a new way to reconcile {\it coupling} with {\it absence}. Finally, we tie these views to a teleological definition of representation.},
source={OwnPublication},
sourcetype={Conference},
}
1633
% NOTE(review): journal name expanded from the abbreviation
% "J. Chem. Inf. Model." -- store the full journal title and let the
% bibliography style abbreviate if it wants to.
@ARTICLE{ChemInfModel2006,
author = {Erhan, Dumitru and {L'Heureux}, Pierre-Jean and Yue, Shi Yi and Bengio, Yoshua},
title = {Collaborative Filtering on a Family of Biological Targets},
journal = {Journal of Chemical Information and Modeling},
volume = {46},
number = {2},
year = {2006},
pages = {626--635},
abstract = {Building a QSAR model of a new biological target for which few screening data are available is a statistical
challenge. However, the new target may be part of a bigger family, for which we have more screening data.
Collaborative filtering or, more generally, multi-task learning, is a machine learning approach that improves
the generalization performance of an algorithm by using information from related tasks as an inductive
bias. We use collaborative filtering techniques for building predictive models that link multiple targets to
multiple examples. The more commonalities between the targets, the better the multi-target model that can
be built. We show an example of a multi-target neural network that can use family information to produce
a predictive model of an undersampled target. We evaluate JRank, a kernel-based method designed for
collaborative filtering. We show their performance on compound prioritization for an HTS campaign and
the underlying shared representation between targets. JRank outperformed the neural network both in the
single- and multi-target models.},
topics={Bioinformatic,MultiTask},cat={J},
}
1655
% IDIAP technical report RR 01-12 (2001); the journal version of this work
% is the collobert:2002 entry below (same abstract).
@TECHREPORT{collobert:2001:rr01-12,
author = {Collobert, Ronan and Bengio, Samy and Bengio, Yoshua},
title = {A Parallel Mixture of {SVM}s for Very Large Scale Problems},
number = {12},
year = {2001},
institution = {IDIAP},
url = {http://www.iro.umontreal.ca/~lisa/pointeurs/IDIAP-RR-01-12.ps},
abstract = {Support Vector Machines ({SVM}s) are currently the state-of-the-art models for many classification problems but they suffer from the complexity of their training algorithm which is at least quadratic with respect to the number of examples. Hence, it is hopeless to try to solve real-life problems having more than a few hundreds of thousands examples with {SVM}s. The present paper proposes a new mixture of {SVM}s that can be easily implemented in parallel and where each {SVM} is trained on a small subset of the whole dataset. Experiments on a large benchmark dataset (Forest) yielded significant time improvement (time complexity appears empirically to locally grow linearly with the number of examples). In addition, and that is a surprise, a significant improvement in generalization was observed.},
topics={Kernel},cat={T},
}
1666
% NOTE(review): title aligned with the technical-report entry for the same
% work (collobert:2001:rr01-12, identical abstract): "A Parallel Mixture of
% SVMs for Very Large Scale Problems". Volume/number/pages of the Neural
% Computation version are missing -- verify against the journal and add.
@ARTICLE{collobert:2002,
author = {Collobert, Ronan and Bengio, Samy and Bengio, Yoshua},
title = {A Parallel Mixture of {SVM}s for Very Large Scale Problems},
journal = {Neural Computation},
year = {2002},
abstract = {Support Vector Machines ({SVM}s) are currently the state-of-the-art models for many classification problems but they suffer from the complexity of their training algorithm which is at least quadratic with respect to the number of examples. Hence, it is hopeless to try to solve real-life problems having more than a few hundreds of thousands examples with {SVM}s. The present paper proposes a new mixture of {SVM}s that can be easily implemented in parallel and where each {SVM} is trained on a small subset of the whole dataset. Experiments on a large benchmark dataset (Forest) yielded significant time improvement (time complexity appears empirically to locally grow linearly with the number of examples). In addition, and that is a surprise, a significant improvement in generalization was observed.},
topics={HighDimensional,Kernel},cat={J},
}
1675
% NOTE(review): entry type changed from @BOOK to @INCOLLECTION -- the fields
% (editor, booktitle, series, volume) describe a chapter in the LNCS 2388
% volume "Pattern Recognition with Support Vector Machines", not a whole
% book. Also fixed abstract typos (problems, becomes, partition).
@INCOLLECTION{collobert:2002:book,
author = {Collobert, Ronan and Bengio, Yoshua and Bengio, Samy},
editor = {Lee, S. W. and Verri, A.},
title = {Scaling Large Learning Problems with Hard Parallel Mixtures},
booktitle = {Pattern Recognition with Support Vector Machines},
series = {Lecture Notes in Computer Science},
volume = {2388},
year = {2002},
publisher = {Springer-Verlag},
url = {http://www.iro.umontreal.ca/~lisa/pointeurs/2002_mixtures_svm.pdf},
abstract = {A challenge for statistical learning is to deal with large data sets, e.g. in data mining. Popular learning algorithms such as Support Vector Machines have training time at least quadratic in the number of examples: they are hopeless to solve problems with a million examples. We propose a "hard parallelizable mixture" methodology which yields significantly reduced training time through modularization and parallelization: the training data is iteratively partitioned by a "gater" model in such a way that it becomes easy to learn an "expert" model separately in each region of the partition. A probabilistic extension and the use of a set of generative models allows representing a gater so that all pieces of the model are locally trained. For {SVM}s, time complexity appears empirically to locally grow linearly with the number of examples, while generalization performance can be enhanced. For the probabilistic version of the algorithm, the iterative algorithm provably goes down in a cost function that is an upper bound on the negative log-likelihood.},
topics={Kernel},cat={B},
}
1689
% Software/IP record (Aigaion export). NOTE(review): year = {2004-2009} is a
% range, which standard BibTeX styles do not parse as a year -- confirm the
% intended rendering before changing it.
@MISC{copyright-CTAI,
author = {Bengio, Yoshua and Ducharme, R{\'{e}}jean and Dorion, Christian},
title = {Commodity Trading Advisor Index},
year = {2004-2009},
howpublished = {copyright, and commercialized software license.}
}
1696
% Software/IP record for the PLearn library. NOTE(review): added the http://
% scheme to the bare hostname so url-aware styles produce a working link.
% year = {1998-2009} is a range -- confirm intended rendering (see
% copyright-CTAI, which uses the same convention).
@MISC{copyright-PLearn,
author = {Vincent, Pascal and Bengio, Yoshua},
title = {{PLearn}, a {C++} Machine Learning Library},
year = {1998-2009},
howpublished = {copyright, public domain license.},
url = {http://www.plearn.org}
}
1704
% Journal article, Speech Communication 9(1), 1990.
@ARTICLE{Cosi90,
author = {Cosi, Piero and Bengio, Yoshua and De Mori, Renato},
title = {Phonetically-based multi-layered networks for acoustic property extraction and automatic speech recognition},
journal = {Speech Communication},
volume = {9},
number = {1},
year = {1990},
pages = {15--30},
topics={PriorKnowledge,Speech},cat={J},
}
1715
% NIPS 22 (2009) paper. NOTE(review): removed the empty editor = {} and
% publisher = {} fields and the junk pdf={""} field left by the export;
% empty fields trigger BibTeX warnings and render as blanks.
@INCOLLECTION{courville+eck+bengio:nips2009,
author = {Courville, Aaron and Eck, Douglas and Bengio, Yoshua},
title = {An Infinite Factor Model Hierarchy Via a Noisy-Or Mechanism},
booktitle = {Neural Information Processing Systems Conference (NIPS) 22},
year = {2009},
pages = {405--413},
url = {http://books.nips.cc/papers/files/nips22/NIPS2009_1100.pdf},
source={OwnPublication},
sourcetype={Conference},
}
1729
% IEEE WASPAA 2009 workshop paper; no page numbers recorded in the export.
@INPROCEEDINGS{davies+plumbley+eck:waspaa2009,
author = {Davies, M. and Plumbley, M. and Eck, Douglas},
title = {Towards a musical beat emphasis function},
booktitle = {Proceedings of IEEE WASPAA},
year = {2009},
organization = {IEEE Workshop on Applications of Signal Processing to Audio and Acoustics},
source={OwnPublication},
sourcetype={Conference},
}
1739
% AISTATS 2005 paper. The abstract uses plain-text caret notation for
% complexities (O(n^3), O(n^2/m^2)) -- the convention used throughout
% this file's abstracts.
@INPROCEEDINGS{Delalleau+al-2005,
author = {Delalleau, Olivier and Bengio, Yoshua and Le Roux, Nicolas},
editor = {Cowell, Robert G. and Ghahramani, Zoubin},
title = {Efficient Non-Parametric Function Induction in Semi-Supervised Learning},
booktitle = {Proceedings of the Tenth International Workshop on Artificial Intelligence and Statistics (AISTATS'05)},
year = {2005},
pages = {96--103},
publisher = {Society for Artificial Intelligence and Statistics},
url = {http://www.iro.umontreal.ca/~lisa/pointeurs/semisup_aistats2005.pdf},
abstract = {There has been an increase of interest for semi-supervised learning recently, because of the many datasets with large amounts of unlabeled examples and only a few labeled ones. This paper follows up on proposed nonparametric algorithms which provide an estimated continuous label for the given unlabeled examples. First, it extends them to function induction algorithms that minimize a regularization criterion applied to an out-of-sample example, and happen to have the form of Parzen windows regressors. This allows to predict test labels without solving again a linear system of dimension n (the number of unlabeled and labeled training examples), which can cost O(n^3). Second, this function induction procedure gives rise to an efficient approximation of the training process, reducing the linear system to be solved to m << n unknowns, using only a subset of m examples. An improvement of O(n^2/m^2) in time can thus be obtained. Comparative experiments are presented, showing the good performance of the induction formula and approximation algorithm.},
topics={Unsupervised},cat={C},
}
1752
% Chapter in Chapelle, Schoelkopf & Zien (eds.), "Semi-Supervised Learning",
% MIT Press, 2006. NOTE(review): restored the superscripts garbled by the
% export (O(kn2) -> O(kn^2), O(m2n) -> O(m^2 n), O(m2) -> O(m^2)), using the
% same plain-text caret notation as the Delalleau+al-2005 abstract.
@INCOLLECTION{Delalleau+al-ssl-2006,
author = {Delalleau, Olivier and Bengio, Yoshua and Le Roux, Nicolas},
editor = {Chapelle, Olivier and {Sch{\"{o}}lkopf}, Bernhard and Zien, Alexander},
title = {Large-Scale Algorithms},
booktitle = {Semi-Supervised Learning},
year = {2006},
pages = {333--341},
publisher = {{MIT} Press},
url = {http://www.iro.umontreal.ca/~lisa/pointeurs/delalleau_ssl.pdf},
abstract = {In Chapter 11, it is shown how a number of graph-based semi-supervised learning
algorithms can be seen as the minimization of a specific cost function, leading to a
linear system with n equations and unknowns (with n the total number of labeled
and unlabeled examples). Solving such a linear system will in general require on the
order of O(kn^2) time and O(kn) memory (for a sparse graph where each data point
has k neighbors), which can be prohibitive on large datasets (especially if k = n,
i.e. the graph is dense). We present in this chapter a subset selection method that
can be used to reduce the original system to one of size m << n. The idea is to solve
for the labels of a subset S of X of only m points, while still retaining information
from the rest of the data by approximating their label with a linear combination of
the labels in S (using the induction formula presented in Chapter 11). This leads
to an algorithm whose computational requirements scale as O(m^2 n) and memory
requirements as O(m^2), thus allowing one to take advantage of significantly bigger
unlabeled datasets than with the original algorithms.},
cat={B},topics={Unsupervised},
}
1778
% NOTE(review): editor "Sanfelin" corrected to "Sanfeliu" (A. Sanfeliu,
% co-editor of "Structural Pattern Analysis", World Scientific 1990) --
% verify against the book's front matter.
@INCOLLECTION{DeMori90a,
author = {De Mori, Renato and Bengio, Yoshua and Cosi, Piero},
editor = {Mohr, R. and Pavlidis, T. and Sanfeliu, A.},
title = {On the use of an ear model and multi-layer networks for automatic speech recognition},
booktitle = {Structural Pattern Analysis},
year = {1990},
publisher = {World Scientific},
topics={PriorKnowledge,Speech},cat={B},
}
1788
% AISTATS 2010 paper. NOTE(review): page range hyphen fixed to the BibTeX
% en-dash form (145--152); a single hyphen is a typographical error.
@INPROCEEDINGS{Desjardins+al-2010,
author = {Desjardins, Guillaume and Courville, Aaron and Bengio, Yoshua},
title = {Tempered {Markov} Chain Monte Carlo for training of Restricted {Boltzmann} Machine},
booktitle = {Proceedings of AISTATS 2010},
volume = {9},
year = {2010},
pages = {145--152},
abstract = {Alternating Gibbs sampling is the most common scheme used for sampling from Restricted {Boltzmann} Machines (RBM), a crucial component in deep architectures such as Deep Belief Networks. However, we find that it often does a very poor job of rendering the diversity of modes captured by the trained model. We suspect that this hinders the advantage that could in principle be brought by training algorithms relying on Gibbs sampling for uncovering spurious modes, such as the Persistent Contrastive Divergence algorithm. To alleviate this problem, we explore the use of tempered {Markov} Chain Monte-Carlo for sampling in RBMs. We find both through visualization of samples and measures of likelihood on a toy dataset that it helps both sampling and learning.}
}
1798
% DIRO (Universite de Montreal) technical report 1327, 2008.
@TECHREPORT{Desjardins-2008,
author = {Desjardins, Guillaume and Bengio, Yoshua},
keywords = {Convolutional Architectures, Deep Networks, RBM, Vision},
title = {Empirical Evaluation of Convolutional RBMs for Vision},
number = {1327},
year = {2008},
institution = {D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
abstract = {Convolutional Neural Networks ({CNN}) have had great success in machine learning tasks involving vision and represent one of the early successes of deep networks. Local receptive fields and weight
sharing make their architecture ideally suited for vision tasks by helping to enforce a prior based on our knowledge of natural images. This same prior could also be applied to recent developments in the field of deep networks, in order to tailor these new architectures for artificial vision. In this context, we show how the Restricted {Boltzmann} Machine (RBM), the building block of Deep Belief Networks (DBN), can be adapted to operate in a convolutional manner. We compare their performance to standard fully-connected RBMs on a simple visual learning task and show that the convolutional RBMs (CRBMs) converge to smaller values of the negative likelihood function. Our experiments also indicate that CRBMs are more efficient than standard RBMs trained on small image patches, with the CRBMs having faster convergence.}
}
1809
% DIRO technical report 1345, 2009; technical-report version of the
% Desjardins+al-2010 AISTATS paper (near-identical abstract).
@TECHREPORT{Desjardins-tech-2009,
author = {Desjardins, Guillaume and Courville, Aaron and Bengio, Yoshua and Vincent, Pascal and Delalleau, Olivier},
keywords = {CD, PCD, RBM, simulated tempering, tempered MCMC, unsupervised learning},
title = {Tempered {Markov} Chain Monte Carlo for training of Restricted {Boltzmann} Machines},
number = {1345},
year = {2009},
institution = {D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
abstract = {Alternating Gibbs sampling is the most common scheme used for sampling from Restricted {Boltzmann} Machines (RBM), a crucial component in deep architectures such as Deep Belief Networks. However, we find that it often does a very poor job of rendering the diversity of modes captured by the trained model. We suspect that this hinders the advantage that could in principle be brought by training algorithms relying on Gibbs sampling for uncovering spurious modes, such as the Persistent Contrastive Divergence algorithm. To alleviate this problem, we
explore the use of tempered {Markov} Chain Monte-Carlo for sampling in RBMs. We find both through visualization of samples and measures of likelihood that it helps both sampling and learning.}
}
1820
% JMLR 10 (2009). NOTE(review): restored the French accents dropped by the
% export in the author names (B{\'e}lisle, Fran{\c c}ois; Ren{\'e} Garcia),
% using classic-BibTeX special-character escapes -- verify against the paper.
@ARTICLE{Dugas+Bengio-2009,
author = {Dugas, Charles and Bengio, Yoshua and B{\'{e}}lisle, Fran{\c{c}}ois and Nadeau, Claude and Garcia, Ren{\'{e}}},
title = {Incorporating Functional Knowledge in Neural Networks},
journal = {The Journal of Machine Learning Research},
volume = {10},
year = {2009},
pages = {1239--1262},
abstract = {Incorporating prior knowledge of a particular task into the architecture of a learning algorithm can greatly improve generalization performance. We study here a case where we know that the function to be learned is non-decreasing in its two arguments and convex in one of them. For this purpose we propose a class of functions similar to multi-layer neural networks but (1) that has those properties, (2) is a universal approximator of Lipschitz functions with these and other properties. We apply this new class of functions to the task of modelling the price of call options. Experiments show improvements on regressing the price of call options using the new types of function classes that incorporate the a priori constraints.}
}
1830
% PhD thesis (title in French), Universite de Montreal, 2003.
@PHDTHESIS{Dugas-Phd-2003,
author = {Dugas, Charles},
title = {Les algorithmes d'apprentissage appliqu{\'{e}}s aux risques financiers},
year = {2003},
school = {Universit{\'{e}} de Montr{\'{e}}al}
}
1837
% Journal article, CAS Forum 1(1), 2003.
@ARTICLE{dugas:2003,
author = {Dugas, Charles and Bengio, Yoshua and Chapados, Nicolas and Vincent, Pascal and Denoncourt, Germain and Fournier, Christian},
title = {Statistical Learning Algorithms Applied to Automobile Insurance Ratemaking},
journal = {CAS Forum},
volume = {1},
number = {1},
year = {2003},
pages = {179--214},
abstract = {We recently conducted a research project for a large North American automobile insurer. This study was the most exhaustive ever undertaken by this particular insurer and lasted over an entire year. We analyzed the discriminating power of each variable used for ratemaking. We analyzed the performance of several models within five broad categories: linear regressions, generalized linear models, decision trees, neural networks and support vector machines. In this paper, we present the main results of this study. We qualitatively compare models and show how neural networks can represent high-order nonlinear dependencies with a small number of parameters, each of which is estimated on a large proportion of the data, thus yielding low variance. We thoroughly explain the purpose of the nonlinear sigmoidal transforms which are at the very heart of neural networks' performances. The main numerical result is a statistically significant reduction in the out-of-sample mean-squared error using the neural network model and our ability to substantially reduce the median premium by charging more to the highest risks. This in turn can translate into substantial savings and financial benefits for an insurer. We hope this paper goes a long way towards convincing actuaries to include neural networks within their set of modeling tools for ratemaking.},
topics={Finance,Mining},cat={J},
}
1849
% NIPS 20 paper (via crossref NIPS20-shorter). NOTE(review): editor
% "Kolen, J." corrected to NIPS 20 co-editor "Koller, D." (apparent
% mis-import; verify against the NIPS 20 front matter), and the quoted
% source = "..." rewritten in the braced source={...} form used by every
% other entry in this file.
@INPROCEEDINGS{eck+bertinmahieux+lamere+green:nips2007,
author = {Eck, Douglas and Lamere, Paul and Bertin-Mahieux, Thierry and Green, Stephen},
editor = {Platt, John and Koller, D. and Singer, Yoram and Roweis, S.},
title = {Automatic Generation of Social Tags for Music Recommendation},
year = {2008},
crossref = {NIPS20-shorter},
source={OwnPublication},
}
1858
% ISMIR 2007 conference paper; no page numbers recorded in the export.
@INPROCEEDINGS{eck+bertinmahieux+lamere:ismir2007,
author = {Eck, Douglas and Bertin-Mahieux, Thierry and Lamere, Paul},
title = {Autotagging music using supervised machine learning},
booktitle = {{Proceedings of the 8th International Conference on Music Information Retrieval ({ISMIR} 2007)}},
year = {2007},
source={OwnPublication},
}
1866
% ISMIR 2005 conference paper.
@INPROCEEDINGS{eck+casagrande:ismir2005,
author = {Eck, Douglas and Casagrande, Norman},
title = {Finding Meter in Music Using an Autocorrelation Phase Matrix and Shannon Entropy},
booktitle = {{Proceedings of the 6th International Conference on Music Information Retrieval ({ISMIR} 2005)}},
year = {2005},
pages = {504--509},
url = {http://www.iro.umontreal.ca/~eckdoug/papers/2005_ismir.pdf},
source={OwnPublication},
sourcetype={Conference},
}
1877
% Chapter in Desain & Windsor (eds.), "Rhythm Perception and Production",
% Swets and Zeitlinger, 2000.
@INCOLLECTION{eck+gasser+port:2000,
author = {Eck, Douglas and Gasser, M. and Port, Robert},
editor = {Desain, P. and Windsor, L.},
title = {Dynamics and Embodiment in Beat Induction},
booktitle = {{Rhythm Perception and Production}},
year = {2000},
pages = {157--170},
publisher = {Swets and Zeitlinger},
url = {http://www.iro.umontreal.ca/~eckdoug/papers/2000_rppw.pdf},
abstract = {We provide an argument for using dynamical systems theory in the domain of beat induction. We motivate the study of beat induction and to relate beat induction to the more general study of human rhythm cognition. In doing so we compare a dynamical, embodied approach to a symbolic (traditional AI) one, paying particular attention to how the modeling approach brings with it tacit assumptions about what is being modeled. Please note that this is a philosophy paper about research that was, at the time of writing, very much in progress.},
source={OwnPublication},
sourcetype={Chapter},
}
1891
% Cognitive Science Society 1996 conference paper. NOTE(review): removed
% the empty editor = {} field left by the export (triggers BibTeX warnings).
@INPROCEEDINGS{eck+gasser:1996,
author = {Eck, Douglas and Gasser, M.},
title = {Perception of Simple Rhythmic Patterns in a Network of Oscillators},
booktitle = {{The Proceedings of the Eighteenth Annual Conference of the Cognitive Science Society}},
year = {1996},
publisher = {Lawrence Erlbaum Associates},
abstract = {This paper is concerned with the complex capacity to recognize and reproduce rhythmic patterns. While this capacity has not been well investigated, in broad qualitative terms it is clear that people can learn to identify and produce recurring patterns defined in terms of sequences of beats of varying intensity and rests: the rhythms behind waltzes, reels, sambas, etc. Our short term goal is a model which is "hard-wired" with knowledge of a set of such patterns. Presented with a portion of one of the patterns or a label for a pattern, the model should reproduce the pattern and continue to do so when the input is turned off. Our long-term goal is a model which can learn to adjust the connection strengths which implement particular patterns as it is exposed to input patterns.},
source={OwnPublication},
sourcetype={Conference},
}
1903
% IDSIA technical report IDSIA-14-03, 2003.
@TECHREPORT{eck+graves+schmidhuber:tr-speech2003,
author = {Eck, Douglas and Graves, A. and Schmidhuber, Juergen},
title = {A New Approach to Continuous Speech Recognition Using {LSTM} Recurrent Neural Networks},
number = {IDSIA-14-03},
year = {2003},
institution = {IDSIA},
abstract = {This paper presents an algorithm for continuous speech recognition built from two Long Short-Term Memory ({LSTM}) recurrent neural networks. A first {LSTM} network performs frame-level phone probability estimation. A second network maps these phone predictions onto words. In contrast to {HMM}s, this allows greater exploitation of long-timescale correlations. Simulation results are presented for a hand-segmented subset of the "Numbers-95" database. These results include isolated phone prediction, continuous frame-level phone prediction and continuous word prediction. We conclude that despite its early stage of development, our new model is already competitive with existing approaches on certain aspects of speech recognition and promising on others, warranting further research.},
source={OwnPublication},
sourcetype={TechReport},
}
1914
% Universite de Montreal DIRO technical report 1300, 2008.
@TECHREPORT{eck+lapalme:2008,
author = {Eck, Douglas and Lapalme, J.},
title = {Learning Musical Structure Directly from Sequences of Music},
number = {1300},
year = {2008},
institution = {Universit{\'{e}} de Montr{\'{e}}al DIRO},
url = {http://www.iro.umontreal.ca/~eckdoug/papers/tr1300.pdf},
source={OwnPublication},
sourcetype={TechReport},
}
1925
% ICANN 2002 conference paper. NOTE(review): removed the empty volume = {}
% field left by the export (triggers BibTeX warnings).
@INPROCEEDINGS{eck+schmidhuber:icann2002,
author = {Eck, Douglas and Schmidhuber, Juergen},
editor = {Dorronsoro, J.},
title = {Learning The Long-Term Structure of the Blues},
booktitle = {{Artificial Neural Networks -- ICANN 2002 (Proceedings)}},
year = {2002},
pages = {284--289},
publisher = {Springer},
url = {http://www.iro.umontreal.ca/~eckdoug/papers/2002_icannMusic.pdf},
abstract = {In general music composed by recurrent neural networks ({RNN}s) suffers from a lack of global structure. Though networks can learn note-by-note transition probabilities and even reproduce phrases, they have been unable to learn an entire musical form and use that knowledge to guide composition. In this study, we describe model details and present experimental results showing that {LSTM} successfully learns a form of blues music and is able to compose novel (and some listeners believe pleasing) melodies in that style. Remarkably, once the network has found the relevant structure it does not drift from it: {LSTM} is able to play the blues with good timing and proper structure as long as one is willing to listen.},
source={OwnPublication},
sourcetype={Conference},
}
1940
% IEEE NNSP 2002 workshop paper; companion to the ICANN 2002 entry above.
@INPROCEEDINGS{eck+schmidhuber:ieee2002,
author = {Eck, Douglas and Schmidhuber, Juergen},
editor = {Bourlard, H.},
title = {Finding Temporal Structure in Music: Blues Improvisation with {LSTM} Recurrent Networks},
booktitle = {Neural Networks for Signal Processing XII, Proceedings of the 2002 IEEE Workshop},
year = {2002},
pages = {747--756},
publisher = {IEEE},
url = {http://www.iro.umontreal.ca/~eckdoug/papers/2002_ieee.pdf},
abstract = {Few types of signal streams are as ubiquitous as music. Here we consider the problem of extracting essential ingredients of music signals, such as well-defined global temporal structure in the form of nested periodicities (or {\em meter}). Can we construct an adaptive signal processing device that learns by example how to generate new instances of a given musical style? Because recurrent neural networks can in principle learn the temporal structure of a signal, they are good candidates for such a task. Unfortunately, music composed by standard recurrent neural networks ({RNN}s) often lacks global coherence. The reason for this failure seems to be that {RNN}s cannot keep track of temporally distant events that indicate global music structure. Long Short-Term Memory ({LSTM}) has succeeded in similar domains where other {RNN}s have failed, such as timing \& counting and learning of context sensitive languages. In the current study we show that {LSTM} is also a good mechanism for learning to compose music. We present experimental results showing that {LSTM} successfully learns a form of blues music and is able to compose novel (and we believe pleasing) melodies in that style. Remarkably, once the network has found the relevant structure it does not drift from it: {LSTM} is able to play the blues with good timing and proper structure as long as one is willing to listen.},
source={OwnPublication},
sourcetype={Conference},
}
1954
% Editorial, Music Perception 22(3), 2005. NOTE(review): page range hyphen
% fixed to the BibTeX en-dash form (371--388).
@ARTICLE{eck+scott:2005,
author = {Eck, Douglas and Scott, S. K.},
title = {Editorial: New Research in Rhythm Perception and Production},
journal = {Music Perception},
volume = {22},
number = {3},
year = {2005},
pages = {371--388},
source={OwnPublication},
sourcetype={Other},
}
1966
% Guest-editorship record for the Music Perception 22(3) special issue
% whose editorial is the eck+scott:2005 entry above.
@MISC{eck+scott:editor2005,
author = {Eck, Douglas and Scott, S. K.},
title = {Music Perception},
year = {2005},
note = {Guest Editor, Special Issue on Rhythm Perception and Production, 22(3)},
source={OwnPublication},
sourcetype={Other},
}
1975
% Cognitive Science Society 1999 conference paper. NOTE(review): removed
% the empty editor = {} field left by the export (triggers BibTeX warnings).
@INPROCEEDINGS{eck:1999,
author = {Eck, Douglas},
title = {Learning Simple Metrical Preferences in a Network of {F}itzhugh-{N}agumo Oscillators},
booktitle = {{The Proceedings of the Twenty-First Annual Conference of the Cognitive Science Society}},
year = {1999},
publisher = {Lawrence Erlbaum Associates},
abstract = {Hebbian learning is used to train a network of oscillators to prefer periodic signals of pulses over aperiodic signals. Target signals consisted of metronome-like voltage pulses with varying amounts of inter-onset noise injected. (with 0\% noise yielding a periodic signal and more noise yielding more and more aperiodic signals.) The oscillators---piecewise-linear approximations (Abbott, 1990) to Fitzhugh-Nagumo oscillators---are trained using mean phase coherence as an objective function. Before training a network is shown to readily synchronize with signals having wide range of noise. After training on a series of noise-free signals, a network is shown to only synchronize with signals having little or no noise. This represents a bias towards periodicity and is explained by strong positive coupling connections between oscillators having harmonically-related periods.},
source={OwnPublication},
sourcetype={Conference},
}
1987
@UNPUBLISHED{eck:bramsworkshop2004,
author = {Eck, Douglas},
title = {Challenges for Machine Learning in the Domain of Music},
year = {2004},
note = {BRAMS Workshop on Brain and Music, Montreal Neurological Institute},
abstract = {Slides and musical examples available on request.},
source={OwnPublication},
sourcetype={Workshop},
}
2000
@PHDTHESIS{eck:diss,
author = {Eck, Douglas},
title = {Meter Through Synchrony: Processing Rhythmical Patterns with Relaxation Oscillators},
year = {2000},
school = {Indiana University, Bloomington, IN},
url = {http://www.idsia.ch/~doug/publications.html},
abstract = {This dissertation uses a network of relaxation oscillators to beat along with temporal signals. Relaxation oscillators exhibit interspersed slow-fast movement and model a wide array of biological oscillations. The model is built up gradually: first a single relaxation oscillator is exposed to rhythms and shown to be good at finding downbeats in them. Then large networks of oscillators are mutually coupled in an exploration of their internal synchronization behavior. It is demonstrated that appropriate weights on coupling connections cause a network to form multiple pools of oscillators having stable phase relationships. This is a promising first step towards networks that can recreate a rhythmical pattern from memory. In the full model, a coupled network of relaxation oscillators is exposed to rhythmical patterns. It is shown that the network finds downbeats in patterns while continuing to exhibit good internal stability. A novel non-dynamical model of downbeat induction called the Normalized Positive (NP) clock model is proposed, analyzed, and used to generate comparison predictions for the oscillator model. The oscillator model compares favorably to other dynamical approaches to beat induction such as adaptive oscillators. However, the relaxation oscillator model takes advantage of intrinsic synchronization stability to allow the creation of large coupled networks. This research lays the groundwork for a long-term research goal, a robotic arm that responds to rhythmical signals by tapping along. It also opens the door to future work in connectionist learning of long rhythmical patterns.},
source={OwnPublication},
sourcetype={Thesis},
}
2010
@INPROCEEDINGS{eck:icann2001,
author = {Eck, Douglas},
editor = {Dorffner, Georg},
title = {A Network of Relaxation Oscillators that Finds Downbeats in Rhythms},
booktitle = {{Artificial Neural Networks -- ICANN 2001 (Proceedings)}},
year = {2001},
pages = {1239--1247},
publisher = {Springer},
url = {http://www.iro.umontreal.ca/~eckdoug/papers/2001_icann.pdf},
abstract = {A network of relaxation oscillators is used to find downbeats in rhythmical patterns. In this study, a novel model is described in detail. Its behavior is tested by exposing it to patterns having various levels of rhythmic complexity. We analyze the performance of the model and relate its success to previous work dealing with fast synchrony in coupled oscillators.},
source={OwnPublication},
sourcetype={Conference},
}
2025
@INPROCEEDINGS{eck:icassp2007,
author = {Eck, Douglas},
title = {Beat Tracking Using an Autocorrelation Phase Matrix},
booktitle = {{Proceedings of the 2007 International Conference on Acoustics, Speech and Signal Processing (ICASSP)}},
year = {2007},
pages = {1313--1316},
publisher = {IEEE Signal Processing Society},
url = {http://www.iro.umontreal.ca/~eckdoug/papers/2007_icassp.pdf},
source={OwnPublication},
sourcetype={Conference},
}
2038
@INPROCEEDINGS{eck:icmpc2004,
author = {Eck, Douglas},
editor = {Lipscomb, S. D. and Ashley, R. and Gjerdingen, R. O. and Webster, P.},
title = {A Machine-Learning Approach to Musical Sequence Induction That Uses Autocorrelation to Bridge Long Timelags},
booktitle = {{The Proceedings of the Eighth International Conference on Music Perception and Cognition ({ICMPC}8)}},
year = {2004},
pages = {542--543},
publisher = {Causal Productions},
abstract = {One major challenge in using statistical sequence learning methods in the domain of music lies in bridging the long timelags that separate important musical events. Consider, for example, the chord changes that convey the basic structure of a pop song. A sequence learner that cannot predict chord changes will almost certainly not be able to generate new examples in a musical style or to categorize songs by style. Yet, it is surprisingly difficult for a sequence learner to bridge the long timelags necessary to identify when a chord change will occur and what its new value will be. This is the case because chord changes can be separated by dozens or hundreds of intervening notes. One could solve this problem by treating chords as being special (as did Mozer, NIPS 1991). But this is impractical---it requires chords to be labeled specially in the dataset, limiting the applicability of the model to non-labeled examples---and furthermore does not address the general issue of nested temporal structure in music. I will briefly describe this temporal structure (known commonly as ``meter'') and present a model that uses to its advantage an assumption that sequences are metrical. The model consists of an autocorrelation-based filtration that estimates online the most likely metrical tree (i.e. the frequency and phase of beat, measure, phrase \&etc.) and uses that to generate a series of sequences varying at different rates. These sequences correspond to each level in the hierarchy. Multiple learners can be used to treat each series separately and their predictions can be combined to perform composition and categorization. I will present preliminary results that demonstrate the usefulness of this approach. Time permitting I will also compare the model to alternate approaches.},
source={OwnPublication},
sourcetype={Conference},
}
2051
@INPROCEEDINGS{eck:icmpc2006,
author = {Eck, Douglas},
editor = {Baroni, M. and Addessi, A. R. and Caterina, R. and Costa, M.},
title = {Beat Induction Using an Autocorrelation Phase Matrix},
booktitle = {The Proceedings of the 9th International Conference on Music Perception and Cognition ({ICMPC9})},
year = {2006},
pages = {931--932},
publisher = {Causal Productions},
source={OwnPublication},
sourcetype={Conference},
}
2063
@UNPUBLISHED{eck:irisworkshop2004,
author = {Eck, Douglas},
title = {Using Autocorrelation to Bridge Long Timelags when Learning Sequences of Music},
year = {2004},
note = {IRIS 2004 Machine Learning Workshop, Ottawa, Canada},
abstract = {Slides and musical examples available on request.},
source={OwnPublication},
sourcetype={Workshop},
}
2076
@ARTICLE{eck:jnmr2001,
  author     = {Eck, Douglas},
  title      = {A Positive-Evidence Model for Rhythmical Beat Induction},
  journal    = {Journal of New Music Research},
  volume     = {30},
  number     = {2},
  year       = {2001},
  pages      = {187--200},
  abstract   = {The Normalized Positive (NPOS) model is a rule-based model that predicts downbeat location and pattern complexity in rhythmical patterns. Though derived from several existing models, the NPOS model is particularly effective at making correct predictions while at the same time having low complexity. In this paper, the details of the model are explored and a comparison is made to existing models. Several datasets are used to examine the complexity predictions of the model. Special attention is paid to the model's ability to account for the effects of musical experience on beat induction.},
  source     = {OwnPublication},
  sourcetype = {Journal},
}
2089
@UNPUBLISHED{eck:mipsworkshop2004,
author = {Eck, Douglas},
title = {Bridging Long Timelags in Music},
year = {2004},
note = {NIPS 2004 Workshop on Music and Machine Learning (MIPS), Whistler, British Columbia},
abstract = {Slides and musical examples available on request.},
source={OwnPublication},
sourcetype={Workshop},
}
2102
@ARTICLE{eck:mp2006,
  author     = {Eck, Douglas},
  title      = {Finding Long-Timescale Musical Structure with an Autocorrelation Phase Matrix},
  journal    = {Music Perception},
  volume     = {24},
  number     = {2},
  year       = {2006},
  pages      = {167--176},
  source     = {OwnPublication},
  sourcetype = {Journal},
}
2114
@UNPUBLISHED{eck:nipsworkshop2003,
author = {Eck, Douglas},
title = {Time-warped hierarchical structure in music and speech: A sequence prediction challenge},
year = {2003},
note = {NIPS 2003 Workshop on Recurrent Neural Networks, Whistler, British Columbia},
abstract = {Slides and musical examples available on request.},
source={OwnPublication},
sourcetype={Workshop},
}
2127
@UNPUBLISHED{eck:nipsworkshop2006,
author = {Eck, Douglas},
title = {Generating music sequences with an echo state network},
year = {2006},
note = {NIPS 2006 Workshop on Echo State Networks and Liquid State Machines},
abstract = {Slides and musical examples available on request.},
source={OwnPublication},
sourcetype={Workshop},
}
2140
@UNPUBLISHED{eck:nipsworkshop2007,
author = {Eck, Douglas},
title = {Measuring and modeling musical expression},
year = {2007},
note = {NIPS 2007 Workshop on Music, Brain and Cognition},
source={OwnPublication},
sourcetype={Workshop},
}
2152
@ARTICLE{eck:psyres2002,
author = {Eck, Douglas},
title = {Finding Downbeats with a Relaxation Oscillator},
journal = {Psychological Research},
volume = {66},
number = {1},
year = {2002},
pages = {18--25},
abstract = {A relaxation oscillator model of neural spiking dynamics is applied to the task of finding downbeats in rhythmical patterns. The importance of downbeat discovery or {\em beat induction} is discussed, and the relaxation oscillator model is compared to other oscillator models. In a set of computer simulations the model is tested on 35 rhythmical patterns from Povel \& Essens (1985). The model performs well, making good predictions in 34 of 35 cases. In an analysis we identify some shortcomings of the model and relate model behavior to dynamical properties of relaxation oscillators.},
source={OwnPublication},
sourcetype={Journal},
}
2165
@UNPUBLISHED{eck:rppw2005,
  author     = {Eck, Douglas},
  title      = {Meter and Autocorrelation},
  year       = {2005},
  note       = {{10th Rhythm Perception and Production Workshop (RPPW), Alden Biesen, Belgium}},
  source     = {OwnPublication},
  sourcetype = {Workshop},
}
2174
@TECHREPORT{eck:tr-music2002,
author = {Eck, Douglas and Schmidhuber, J{\"u}rgen},
title = {A First Look at Music Composition using {LSTM} Recurrent Neural Networks},
number = {IDSIA-07-02},
year = {2002},
institution = {IDSIA},
abstract = {In general music composed by recurrent neural networks ({RNN}s) suffers from a lack of global structure. Though networks can learn note-by-note transition probabilities and even reproduce phrases, attempts at learning an entire musical form and using that knowledge to guide composition have been unsuccessful. The reason for this failure seems to be that {RNN}s cannot keep track of temporally distant events that indicate global music structure. Long Short-Term Memory ({LSTM}) has succeeded in similar domains where other {RNN}s have failed, such as timing \& counting and CSL learning. In the current study I show that {LSTM} is also a good mechanism for learning to compose music. I compare this approach to previous attempts, with particular focus on issues of data representation. I present experimental results showing that {LSTM} successfully learns a form of blues music and is able to compose novel (and I believe pleasing) melodies in that style. Remarkably, once the network has found the relevant structure it does not drift from it: {LSTM} is able to play the blues with good timing and proper structure as long as one is willing to listen. {\em Note: This is a more complete version of the 2002 ICANN submission Learning the Long-Term Structure of the Blues.}},
source={OwnPublication},
sourcetype={TechReport},
}
2185
@TECHREPORT{eck:tr-npos2000,
author = {Eck, Douglas},
title = {A Positive-Evidence Model for Classifying Rhythmical Patterns},
number = {IDSIA-09-00},
year = {2000},
institution = {IDSIA},
abstract = {The Normalized Positive (NPOS) model is a novel matching model that predicts downbeat location and pattern complexity in rhythmical patterns. Though similar models report success, the NPOS model is particularly effective at making these predictions while at the same time being theoretically and mathematically simple. In this paper, the details of the model are explored and a comparison is made to existing models. Several datasets are used to examine the complexity predictions of the model. Special attention is paid to the model's ability to account for the effects of musical experience on rhythm perception. {\em Note: See the 2001 Journal of New Music Research paper ``A Positive-Evidence Model for Rhythmical Beat Induction'' for a newer version of this paper.}},
ps={ftp://ftp.idsia.ch/pub/techrep/IDSIA-09-00.ps.gz},
source={OwnPublication},
sourcetype={TechReport},
}
2197
@TECHREPORT{eck:tr-oscnet2001,
author = {Eck, Douglas},
title = {A Network of Relaxation Oscillators that Finds Downbeats in Rhythms},
number = {IDSIA-06-01},
year = {2001},
institution = {IDSIA},
abstract = {A network of relaxation oscillators is used to find downbeats in rhythmical patterns. In this study, a novel model is described in detail. Its behavior is tested by exposing it to patterns having various levels of rhythmic complexity. We analyze the performance of the model and relate its success to previous work dealing with fast synchrony in coupled oscillators. {\em Note: See the 2001 ICANN conference proceeding by the same title for a newer version of this paper.}},
ps={ftp://ftp.idsia.ch/pub/techrep/IDSIA-06-01.ps.gz},
source={OwnPublication},
sourcetype={TechReport},
}
2209
@TECHREPORT{eck:tr-tracking2000,
author = {Eck, Douglas},
title = {Tracking Rhythms with a Relaxation Oscillator},
number = {IDSIA-10-00},
year = {2000},
institution = {IDSIA},
abstract = {A number of biological and mechanical processes are typified by a continued slow accrual and fast release of energy. A nonlinear oscillator exhibiting this slow-fast behavior is called a relaxation oscillator and is used to model, for example, human heartbeat pacemaking and neural action potential. Similar limit cycle oscillators are used to model a wider range of behaviors including predator-prey relationships and synchrony in animal populations such as fireflies. Though nonlinear limit-cycle oscillators have been successfully applied to beat induction, relaxation oscillators have received less attention. In this work we offer a novel and effective relaxation oscillator model of beat induction. We outline the model in detail and provide a perturbation analysis of its response to external stimuli. In a series of simulations we expose the model to patterns from Experiment 1 of Povel \& Essens (1985). We then examine the beat assignments of the model. Although the overall performance of the model is very good, there are shortcomings. We believe that a network of mutually-coupled oscillators will address many of these shortcomings, and we suggest an appropriate course for future research. {\em Note: See the 2001 {\em Psychological Research} article ``Finding Downbeats with a Relaxation Oscillator'' for a revised but less detailed version of this paper.}},
ps={ftp://ftp.idsia.ch/pub/techrep/IDSIA-10-00.ps.gz},
source={OwnPublication},
sourcetype={TechReport},
}
2221
@TECHREPORT{eck:tr-tracking2002,
  author      = {Eck, Douglas},
  title       = {Real-Time Musical Beat Induction with Spiking Neural Networks},
  number      = {IDSIA-22-02},
  year        = {2002},
  institution = {IDSIA},
  abstract    = {Beat induction is best described by analogy to the activities of hand clapping or foot tapping, and involves finding important metrical components in an auditory signal, usually music. Though beat induction is intuitively easy to understand it is difficult to define and still more difficult to perform automatically. We will present a model of beat induction that uses a spiking neural network as the underlying synchronization mechanism. This approach has some advantages over existing methods; it runs online, responds at many levels in the metrical hierarchy, and produces good results on performed music (Beatles piano performances encoded as MIDI). In this paper the model is described in some detail and simulation results are discussed.},
  source      = {OwnPublication},
  sourcetype  = {TechReport},
}
2232
@UNPUBLISHED{eck:verita2002,
author = {Eck, Douglas},
title = {Real Time Beat Induction with Spiking Neurons},
year = {2002},
note = {{Music, Motor Control and the Mind: Symposium at Monte Verita, May}},
abstract = {Beat induction is best described by analogy to the activities of hand clapping or foot tapping, and involves finding important metrical components in an auditory signal, usually music. Though beat induction is intuitively easy to understand it is difficult to define and still more difficult to model. I will discuss an approach to beat induction that uses a network of spiking neurons to synchronize with periodic components in a signal at many timescales. Through a competitive process, groups of oscillators embodying a particular metrical interpretation (e.g. ``4/4'') are selected from the network and used to track the pattern. I will compare this model to other approaches including a traditional symbolic AI system (Dixon 2001), and one based on Bayesian statistics (Cemgil et al, 2001). Finally I will present performance results of the network on a set of MIDI-recorded piano performances of Beatles songs collected by the Music, Mind, Machine Group, NICI, University of Nijmegen (see Cemgil et al, 2001 for more details or http://www.nici.kun.nl/mmm).},
source={OwnPublication},
sourcetype={Workshop},
}
2242
@INPROCEEDINGS{ElHihi+Bengio-nips8,
author = {El Hihi, Salah and Bengio, Yoshua},
title = {Hierarchical Recurrent Neural Networks for Long-Term Dependencies},
year = {1996},
crossref = {NIPS8-shorter},
abstract = {We have already shown that extracting long-term dependencies from sequential data is difficult, both for deterministic dynamical systems such as recurrent networks, and probabilistic models such as hidden {Markov} models ({HMM}s) or input/output hidden {Markov} models ({IOHMM}s). In practice, to avoid this problem, researchers have used domain specific a-priori knowledge to give meaning to the hidden or state variables representing past context. In this paper we propose to use a more general type of a-priori knowledge, namely that the temporal dependencies are structured hierarchically. This implies that long-term dependencies are represented by variables with a long time scale. This principle is applied to a recurrent network which includes delays and multiple time scales. Experiments confirm the advantages of such structures. A similar approach is proposed for {HMM}s and {IOHMM}s.},
topics={LongTerm},
cat={C},
}
2251
@ARTICLE{Erhan+al-2010,
  author   = {Erhan, Dumitru and Bengio, Yoshua and Courville, Aaron and Manzagol, Pierre-Antoine and Vincent, Pascal and Bengio, Samy},
  title    = {Why Does Unsupervised Pre-training Help Deep Learning?},
  volume   = {11},
  year     = {2010},
  pages    = {625--660},
  crossref = {JMLR-shorter},
  abstract = {Much recent research has been devoted to learning algorithms for deep architectures such as Deep Belief Networks and stacks of auto-encoder variants, with impressive results obtained in several areas, mostly on vision and language datasets. The best results obtained on supervised learning tasks involve an unsupervised learning component, usually in an unsupervised pre-training phase. Even though these new algorithms have enabled training deep models, many questions remain as to the nature of this difficult learning problem. The main question investigated here is the following: why does unsupervised pre-training work and why does it work so well? Answering these questions is important if learning in deep architectures is to be further improved. We propose several explanatory hypotheses and test them through extensive simulations. We empirically show the influence of pre-training with respect to architecture depth, model capacity, and number of training examples. The experiments confirm and clarify the advantage of unsupervised pre-training. The results suggest that unsupervised pre-training guides the learning towards basins of attraction of minima that are better in terms of the underlying data distribution; the evidence from these results supports a regularization explanation for the effect of pre-training.},
}
2261
@INPROCEEDINGS{Erhan-aistats-2010,
author = {Erhan, Dumitru and Courville, Aaron and Bengio, Yoshua and Vincent, Pascal},
title = {Why Does Unsupervised Pre-training Help Deep Learning?},
booktitle = {Proceedings of AISTATS 2010},
volume = {9},
year = {2010},
pages = {201--208},
abstract = {Much recent research has been devoted to learning algorithms for deep architectures such as Deep Belief Networks and stacks of auto-encoder variants with impressive results being obtained in several areas, mostly on vision and language datasets. The best results obtained on supervised learning tasks often involve an unsupervised learning component, usually in an unsupervised pre-training phase. The main question investigated here is the following: why does unsupervised pre-training work so well? Through extensive experimentation, we explore several possible explanations discussed in the literature including its action as a regularizer (Erhan et al. 2009) and as an aid to optimization (Bengio et al. 2007). Our results build on the work of Erhan et al. 2009, showing that unsupervised pre-training appears to play predominantly a regularization role in subsequent supervised training. However our results in an online setting, with a virtually unlimited data stream, point to a somewhat more nuanced interpretation of the roles of optimization and regularization in the unsupervised pre-training effect.}
}
2271
@MASTERSTHESIS{Erhan-MSc,
author = {Erhan, Dumitru},
keywords = {Apprentissage multit{\^{a}}che, Filtrage collaboratif, M{\'{e}}thodes {\`{a}} noyaux, QSAR, R{\'{e}}seaux de neurones},
title = {Collaborative filtering techniques for drug discovery},
year = {2006},
school = {Universit{\'{e}} de Montr{\'{e}}al},
abstract = {Cette th{\`{e}}se examine le probl{\`{e}}me d'apprendre plusieurs t{\^{a}}ches simultan{\'{e}}ment,
afin de transf{\'{e}}rer les connaissances apprises {\`{a}} une nouvelle t{\^{a}}che. Si
on suppose que les t{\^{a}}ches partagent une repr{\'{e}}sentation et qu'il est possible de
d{\'{e}}couvrir cette repr{\'{e}}sentation efficacement, cela peut nous servir {\`{a}} construire un
meilleur mod{\`{e}}le de la nouvelle t{\^{a}}che. Il existe plusieurs variantes de
cette m{\'{e}}thode: transfert inductif, apprentissage multit{\^{a}}che, filtrage
collaboratif etc. Nous avons {\'{e}}valu{\'{e}} plusieurs algorithmes d'apprentissage
supervis{\'{e}} pour d{\'{e}}couvrir des repr{\'{e}}sentations partag{\'{e}}es parmi les
t{\^{a}}ches d{\'{e}}finies dans un probl{\`{e}}me de chimie computationnelle. Nous avons
formul{\'{e}} le probl{\`{e}}me dans un cadre d'apprentissage automatique,
fait l'analogie avec les algorithmes standards de filtrage collaboratif et construit les
hypoth{\`{e}}ses g{\'{e}}n{\'{e}}rales qui devraient {\^{e}}tre test{\'{e}}es pour valider l'utilisation des
algorithmes multit{\^{a}}che. Nous avons aussi {\'{e}}valu{\'{e}} la performance des algorithmes
d'apprentissage utilis{\'{e}}s et d{\'{e}}montrons qu'il est, en effet, possible de trouver une
repr{\'{e}}sentation partag{\'{e}}e pour le probl{\`{e}}me consid{\'{e}}r{\'{e}}. Du point de vue
th{\'{e}}orique, notre apport est une modification d'un algorithme
standard---les machines {\`{a}} vecteurs de support---qui produit des r{\'{e}}sultats
comparables aux meilleurs algorithmes disponibles et qui utilise {\`{a}} fond les
concepts de l'apprentissage multit{\^{a}}che. Du point de vue pratique, notre
apport est l'utilisation de notre algorithme par les compagnies
pharmaceutiques dans leur d{\'{e}}couverte de nouveaux m{\'{e}}dicaments.}
}
2300
@INPROCEEDINGS{Erhan2009,
  author   = {Erhan, Dumitru and Manzagol, Pierre-Antoine and Bengio, Yoshua and Bengio, Samy and Vincent, Pascal},
  keywords = {Deep Networks},
  title    = {The Difficulty of Training Deep Architectures and the effect of Unsupervised Pre-Training},
  year     = {2009},
  pages    = {153--160},
  crossref = {xAISTATS2009-shorter},
  abstract = {Whereas theoretical work suggests that deep architectures might be more efficient at representing highly-varying functions, training deep architectures was unsuccessful until the recent advent of algorithms based on unsupervised pretraining. Even though these new algorithms have enabled training deep models, many questions remain as to the nature of this difficult learning problem. Answering these questions is important if learning in deep architectures is to be further improved. We attempt to shed some light on these questions through extensive simulations. The experiments confirm and clarify the advantage of unsupervised pre-training. They demonstrate the robustness of the training procedure with respect to the random initialization, the positive effect of pre-training in terms of optimization and its role as a regularizer. We empirically show the influence of pre-training with respect to architecture depth, model capacity, and number of training examples.},
}
2310
@ARTICLE{gasser+eck+port:1999,
author = {Gasser, M. and Eck, Douglas and Port, Robert},
title = {Meter as Mechanism: A Neural Network Model that Learns Metrical Patterns},
journal = {Connection Science},
volume = {11},
number = {2},
year = {1999},
pages = {187--216},
abstract = {One kind of prosodic structure that apparently underlies both music and some examples of speech production is meter. Yet detailed measurements of the timing of both music and speech show that the nested periodicities that define metrical structure can be quite noisy in time. What kind of system could produce or perceive such variable metrical timing patterns? And what would it take to be able to store and reproduce particular metrical patterns from long-term memory? We have developed a network of coupled oscillators that both produces and perceives patterns of pulses that conform to particular meters. In addition, beginning with an initial state with no biases, it can learn to prefer the particular meter that it has been previously exposed to.},
own={Have},
source={OwnPublication},
sourcetype={Journal},
}
2324
@TECHREPORT{gasser+eck+port:tr-1996,
author = {Gasser, M. and Eck, Douglas and Port, Robert},
title = {Meter as Mechanism: A Neural Network that Learns Metrical Patterns},
number = {180},
year = {1996},
institution = {Indiana University Cognitive Science Program},
source={OwnPublication},
sourcetype={TechReport},
}
2334
@INPROCEEDINGS{gasser+eck:1996,
author = {Gasser, M. and Eck, Douglas},
title = {Representing Rhythmic Patterns in a Network of Oscillators},
booktitle = {{The Proceedings of the International Conference on Music Perception and Cognition}},
number = {4},
year = {1996},
pages = {361--366},
publisher = {Lawrence Erlbaum Associates},
url = {http://www.iro.umontreal.ca/~eckdoug/papers/1996_gasser_icmpc.pdf},
abstract = {This paper describes an evolving computational model of the perception and production of simple rhythmic patterns. The model consists of a network of oscillators of different resting frequencies which couple with input patterns and with each other. Oscillators whose frequencies match periodicities in the input tend to become activated. Metrical structure is represented explicitly in the network in the form of clusters of oscillators whose frequencies and phase angles are constrained to maintain the harmonic relationships that characterize meter. Rests in rhythmic patterns are represented by explicit rest oscillators in the network, which become activated when an expected beat in the pattern fails to appear. The model makes predictions about the relative difficulty of patterns and the effect of deviations from periodicity in the input.},
source={OwnPublication},
sourcetype={Conference},
}
2349
@INPROCEEDINGS{gers+eck+schmidhuber:icann2001,
author = {Gers, F. A. and Eck, Douglas and Schmidhuber, J{\"u}rgen},
editor = {Dorffner, Georg},
title = {Applying {LSTM} to Time Series Predictable Through Time-Window Approaches},
booktitle = {{Artificial Neural Networks -- ICANN 2001 (Proceedings)}},
year = {2001},
pages = {669--676},
publisher = {Springer},
url = {http://www.iro.umontreal.ca/~eckdoug/papers/2001_gers_icann.pdf},
abstract = {Long Short-Term Memory ({LSTM}) is able to solve many time series tasks unsolvable by feed-forward networks using fixed size time windows. Here we find that {LSTM}'s superiority does {\em not} carry over to certain simpler time series tasks solvable by time window approaches: the Mackey-Glass series and the Santa Fe {FIR} laser emission series (Set A). This suggests to use {LSTM} only when simpler traditional approaches fail.},
source={OwnPublication},
sourcetype={Conference},
}
2363
@TECHREPORT{gers+eck+schmidhuber:tr-2000,
author = {Gers, F. A. and Eck, Douglas and Schmidhuber, J{\"u}rgen},
title = {Applying {LSTM} to Time Series Predictable Through Time-Window Approaches},
number = {IDSIA-22-00},
year = {2000},
institution = {IDSIA},
abstract = {Long Short-Term Memory ({LSTM}) is able to solve many time series tasks unsolvable by feed-forward networks using fixed size time windows. Here we find that {LSTM}'s superiority does {\em not} carry over to certain simpler time series tasks solvable by time window approaches: the Mackey-Glass series and the Santa Fe {FIR} laser emission series (Set A). This suggests to use {LSTM} only when simpler traditional approaches fail. {\em Note: See the 2001 ICANN conference proceeding by the same title for a newer version of this paper.}},
ps={ftp://ftp.idsia.ch/pub/techrep/IDSIA-22-00.ps.gz},
source={OwnPublication},
sourcetype={TechReport},
}
2375
2376 @INPROCEEDINGS{gers+perez+eck+schmidhuber:esann2002,
2377 author = {Gers, F. A. and Perez-Ortiz, J. A. and Eck, Douglas and Schmidhuber, Juergen},
2378 title = {{DEKF-LSTM}},
2379 booktitle = {Proceedings of the 10th European Symposium on Artificial Neural Networks, ESANN 2002},
2380 year = {2002},
2381 source={OwnPublication},
2382 sourcetype={Conference},
2383 }
2384
2385 @INPROCEEDINGS{gers+perez+eck+schmidhuber:icannA2002,
2386 author = {Gers, F. A. and Perez-Ortiz, J. A. and Eck, Douglas and Schmidhuber, Juergen},
2387 editor = {Dorronsoro, J.},
2388 title = {Learning Context Sensitive Languages with {LSTM} Trained with {Kalman} Filters},
2389 booktitle = {{Artificial Neural Networks -- ICANN 2002 (Proceedings)}},
2390 year = {2002},
2391 pages = {655--660},
2392 publisher = {Springer},
2393 abstract = {Unlike traditional recurrent neural networks, the Long Short-Term Memory ({LSTM}) model generalizes well when presented with training sequences derived from regular and also simple nonregular languages. Our novel combination of {LSTM} and the decoupled extended Kalman filter, however, learns even faster and generalizes even better, requiring only the 10 shortest exemplars n <= 10 of the context sensitive language a^nb^nc^n to deal correctly with values of n up to 1000 and more. Even when we consider the relatively high update complexity per timestep, in many cases the hybrid offers faster learning than {LSTM} by itself.},
2394 source={OwnPublication},
2395 sourcetype={Conference},
2396 }
2397
2398 @PHDTHESIS{Ghosn-Phd-2003,
2399 author = {Ghosn, Joumana},
2400 title = {Apprentissage multi-t{\^{a}}ches et partage de connaissances},
2401 year = {2003},
2402 school = {Universit{\'{e}} de Montr{\'{e}}al}
2403 }
2404
2405 @INPROCEEDINGS{ghosn97,
2406 author = {Ghosn, Joumana and Bengio, Yoshua},
2407 title = {Multi-Task Learning for Stock Selection},
2408 year = {1997},
2409 pages = {946--952},
2410 publisher = {MIT Press, Cambridge, MA},
2411 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/multitask-nips97.pdf},
2412 crossref = {NIPS9},
2413 abstract = {Artificial Neural Networks can be used to predict future returns of stocks in order to take financial decisions. Should one build a separate network for each stock or share the same network for all the stocks. In this paper we also explore other alternatives, in which some layers are shared and others are not shared. When the prediction of future returns for different stocks are viewed as different tasks, sharing some parameters across stocks is a form of multi-task learning. In a series of experiments with Canadian stocks, we obtain yearly returns that are more than 14\% above various benchmarks.},
2414 topics={MultiTask,Finance},cat={C},
2415 }
2416
2417 @TECHREPORT{Gingras-asynchronous-TR96,
2418 author = {Gingras, Fran{\c c}ois and Bengio, Yoshua},
2419 title = {Handling asynchronous or missing financial data with recurrent networks},
2420 number = {1020},
2421 year = {1996},
2422 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
2423 topics={Finance,Missing},cat={T},
2424 }
2425
2426 @TECHREPORT{Gingras-financial-TR99,
2427 author = {Gingras, Fran{\c c}ois and Bengio, Yoshua and Nadeau, Claude},
2428 title = {On Out-of-Sample Statistics for Financial Time-Series},
2429 number = {2585},
2430 year = {1999},
2431 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
2432 topics={Comparative,Finance},cat={T},
2433 }
2434
2435 @INPROCEEDINGS{gingras2000,
2436 author = {Gingras, Fran{\c c}ois and Bengio, Yoshua and Nadeau, Claude},
2437 title = {On Out-of-Sample Statistics for Time-Series},
2438 booktitle = {Computational Finance 2000},
2439 year = {2000},
2440 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/out-err-cf2000.pdf},
2441 abstract = {This paper studies an out-of-sample statistic for time-series prediction that is analogous to the widely used R2 in-sample statistic. We propose and study methods to estimate the variance of this out-of-sample statistic. We suggest that the out-of-sample statistic is more robust to distributional and asymptotic assumptions behind many tests for in-sample statistics. Furthermore we argue that it may be more important in some cases to choose a model that generalizes as well as possible rather than choose the parameters that are closest to the true parameters. Comparative experiments are performed on a financial time-series (daily and monthly returns of the TSE300 index). The experiments are performed for varying prediction horizons and we study the relation between predictability (out-of-sample R2), variability of the out-of-sample R2 statistic, and the prediction horizon.},
2442 topics={Comparative,Finance},cat={C},
2443 }
2444
2445 @INPROCEEDINGS{GlorotAISTATS2010,
2446 author = {Bengio, Yoshua and Glorot, Xavier},
2447 title = {Understanding the difficulty of training deep feedforward neural networks},
2448 booktitle = {Proceedings of AISTATS 2010},
2449 volume = {9},
2450 year = {2010},
2451 pages = {249--256},
2452 abstract = {Whereas before 2006 it appears that deep multi-layer neural networks were not successfully trained, since then several algorithms have been shown to successfully train them, with experimental results showing the superiority of deeper vs less deep architectures. All these experimental results were obtained with new initialization or training mechanisms. Our objective here is to understand better why standard gradient descent from random initialization is doing so poorly with deep neural networks, to better understand these recent relative successes and help design better algorithms in the future. We first observe the influence of the non-linear activation functions. We find that the logistic sigmoid activation is unsuited for deep networks with random initialization because of its mean value, which can drive especially the top hidden layer into saturation. Surprisingly, we find that saturated units can move out of saturation by themselves, albeit slowly, and explaining the plateaus sometimes seen when training neural networks. We find that a new non-linearity that saturates less can often be beneficial. Finally, we study how activations and gradients vary across layers and during training, with the idea that training may be more difficult when the singular values of the Jacobian associated with each layer are far from 1. Based on these considerations, we propose a new initialization scheme that brings substantially faster convergence.}
2453 }
2454
2455 @INPROCEEDINGS{Gori89,
2456 author = {Gori, Marco and Bengio, Yoshua and De Mori, Renato},
2457 title = {BPS: a learning algorithm for capturing the dynamic nature of speech},
2458 booktitle = {International Joint Conference on Neural Networks},
2459 volume = {2},
2460 year = {1989},
2461 pages = {417--424},
2462 publisher = {IEEE, New York},
2463 topics={Speech},cat={C},
2464 }
2465
2466 @INCOLLECTION{Grandvalet+Bengio-ssl-2006,
2467 author = {Grandvalet, Yves and Bengio, Yoshua},
2468 editor = {Chapelle, Olivier and {Sch{\"{o}}lkopf}, Bernhard and Zien, Alexander},
2469 title = {Entropy Regularization},
2470 booktitle = {Semi-Supervised Learning},
2471 year = {2006},
2472 pages = {151--168},
2473 publisher = {{MIT} Press},
2474 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/entropy_regularization_2006.pdf},
2475 abstract = {The problem of semi-supervised induction consists in learning a decision rule from
2476 labeled and unlabeled data. This task can be undertaken by discriminative methods,
2477 provided that learning criteria are adapted consequently. In this chapter, we motivate the use of entropy regularization as a means to benefit from unlabeled data in
2478 the framework of maximum a posteriori estimation. The learning criterion is derived
2479 from clearly stated assumptions and can be applied to any smoothly parametrized
2480 model of posterior probabilities. The regularization scheme favors low density separation, without any modeling of the density of input features. The contribution
2481 of unlabeled data to the learning criterion induces local optima, but this problem
2482 can be alleviated by deterministic annealing. For well-behaved models of posterior
2483 probabilities, deterministic annealing {EM} provides a decomposition of the learning
2484 problem in a series of concave subproblems. Other approaches to the semi-supervised
2485 problem are shown to be close relatives or limiting cases of entropy regularization.
2486 A series of experiments illustrates the good behavior of the algorithm in terms of
2487 performance and robustness with respect to the violation of the postulated low density separation assumption. The minimum entropy solution benefits from unlabeled
2488 data and is able to challenge mixture models and manifold learning in a number of
2489 situations.},
2490 cat={B},topics={Unsupervised},
2491 }
2492
2493 @INPROCEEDINGS{graves+eck+schmidhuber:bio-adit2004,
2494 author = {Graves, A. and Eck, Douglas and Beringer, N. and Schmidhuber, Juergen},
2495 title = {Biologically Plausible Speech Recognition with {LSTM} Neural Nets},
2496 booktitle = {Proceedings of the First Int'l Workshop on Biologically Inspired Approaches to Advanced Information Technology (Bio-ADIT)},
2497 year = {2004},
2498 pages = {127--136},
2499 url = {http://www.iro.umontreal.ca/~eckdoug/papers/2004_bioadit.pdf},
2500 abstract = {Long Short-Term Memory ({LSTM}) recurrent neural networks ({RNN}s) are local in space and time and closely related to a biological model of memory in the prefrontal cortex. Not only are they more biologically plausible than previous artificial {RNN}s, they also outperformed them on many artificially generated sequential processing tasks. This encouraged us to apply {LSTM} to more realistic problems, such as the recognition of spoken digits. Without any modification of the underlying algorithm, we achieved results comparable to state-of-the-art Hidden {Markov} Model ({HMM}) based recognisers on both the {TIDIGITS} and TI46 speech corpora. We conclude that {LSTM} should be further investigated as a biologically plausible basis for a bottom-up, neural net-based approach to speech recognition.},
2501 source={OwnPublication},
2502 sourcetype={Conference},
2503 }
2504
2505 @TECHREPORT{graves+eck+schmidhuber:tr-digits2003,
2506 author = {Graves, A. and Eck, Douglas and Schmidhuber, Juergen},
2507 title = {Comparing {LSTM} Recurrent Networks and Spiking Recurrent Networks on the Recognition of Spoken Digits},
2508 number = {IDSIA-13-03},
2509 year = {2003},
2510 institution = {IDSIA},
2511 abstract = {One advantage of spiking recurrent neural networks ({SNN}s) is an ability to categorise data using a synchrony-based latching mechanism. This is particularly useful in problems where timewarping is encountered, such as speech recognition. Differentiable recurrent neural networks ({RNN}s) by contrast fail at tasks involving difficult timewarping, despite having sequence learning capabilities superior to {SNN}s. In this paper we demonstrate that Long Short-Term Memory ({LSTM}) is an {RNN} capable of robustly categorizing timewarped speech data, thus combining the most useful features of both paradigms. We compare its performance to {SNN}s on two variants of a spoken digit identification task, using data from an international competition. The first task (described in Nature (Nadis 2003)) required the categorisation of spoken digits with only a single training exemplar, and was specifically designed to test robustness to timewarping. Here {LSTM} performed better than all the {SNN}s in the competition. The second task was to predict spoken digits using a larger training set. Here {LSTM} greatly outperformed an {SNN}-like model found in the literature. These results suggest that {LSTM} has a place in domains that require the learning of large timewarped datasets, such as automatic speech recognition.},
2512 source={OwnPublication},
2513 sourcetype={TechReport},
2514 }
2515
2516 @INPROCEEDINGS{haffner-98,
2517 author = {Haffner, Patrick and Bottou, {L{\'{e}}on} and G. Howard, Paul and Simard, Patrice and Bengio, Yoshua and {LeCun}, Yann},
2518 title = {Browsing through High Quality Document Images with {DjVu}},
2519 booktitle = {Proc. of Advances in Digital Libraries 98},
2520 year = {1998},
2521 pages = {309--318},
2522 publisher = {IEEE},
2523 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/haffner-98.ps.gz},
2524 topics={HighDimensional},cat={C},
2525 }
2526
2527 @INPROCEEDINGS{Hamel+al-2009,
2528 author = {Hamel, Philippe and Wood, Sean and Eck, Douglas},
2529 title = {Automatic Identification of Instrument Classes in Polyphonic and Poly-Instrument Audio},
2530 booktitle = {10th International Society for Music Information Retrieval Conference},
2531 year = {2009},
2532 pages = {399--404},
2533 url = {http://ismir2009.ismir.net/proceedings/PS3-2.pdf},
2534 abstract = {We present and compare several models for automatic identification of instrument classes in polyphonic and poly-instrument audio. The goal is to be able to identify which categories of instrument (Strings, Woodwind, Guitar, Piano, etc.) are present in a given audio example. We use a machine learning approach to solve this task. We constructed a system to generate a large database of musically relevant poly-instrument audio. Our database is generated from hundreds of instruments classified in 7 categories. Musical audio examples are generated by mixing multi-track MIDI files with thousands of instrument combinations. We compare three different classifiers: a Support Vector Machine ({SVM}), a Multilayer Perceptron (MLP) and a Deep Belief Network (DBN). We show that the DBN tends to outperform both the {SVM} and the MLP in most cases.}
2535 }
2536
2537 @MISC{Hugo+al-snowbird-2007,
2538 author = {Larochelle, Hugo and Bengio, Yoshua and Erhan, Dumitru},
2539 title = {Generalization to a zero-data task: an empirical study},
2540 year = {2007},
2541 howpublished = {Talk and poster presented at the Learning Workshop (Snowbird), San Juan, Puerto Rico, 2007}
2542 }
2543
2544 @INPROCEEDINGS{hyper:2000:ijcnn,
2545 author = {Bengio, Yoshua},
2546 title = {Continuous Optimization of Hyper-Parameters},
2547 booktitle = {International Joint Conference on Neural Networks 2000},
2548 volume = {I},
2549 year = {2000},
2550 pages = {305--310},
2551 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/hyper-ijcnn2000.pdf},
2552 abstract = {Many machine learning algorithms can be formulated as the minimization of a training criterion which involves a hyper-parameter. This hyper-parameter is usually chosen by trial and error with a model selection criterion. In this paper we present a methodology to optimize several hyper-parameters, based on the computation of the gradient of a model selection criterion with respect to the hyper-parameters. In the case of a quadratic training criterion, the gradient of the selection criterion with respect to the hyper-parameters is efficiently computed by back-propagating through a Cholesky decomposition. In the more general case, we show that the implicit function theorem can be used to derive a formula for the hyper-parameter gradient involving second derivatives of the training criterion.},
2553 topics={ModelSelection},cat={C},
2554 }
2555
2556 @INPROCEEDINGS{ICML01,
2557 editor = {Brodley, Carla E. and Danyluk, Andrea Pohoreckyj},
2558 title = {Proceedings of the Eighteenth International Conference on Machine Learning (ICML'01)},
2559 booktitle = {Proceedings of the Eighteenth International Conference on Machine Learning (ICML'01)},
2560 year = {-1},
2561 publisher = {Morgan Kaufmann}
2562 }
2563
2564 @INPROCEEDINGS{ICML01-short,
2565 editor = {Brodley, Carla E. and Danyluk, Andrea Pohoreckyj},
2566 title = {Proceedings of the Eighteenth International Conference on Machine Learning (ICML'01)},
2567 booktitle = {ICML'01},
2568 year = {-1},
2569 publisher = {Morgan Kaufmann}
2570 }
2571
2572
2573 @INPROCEEDINGS{ICML02,
2574 editor = {Sammut, Claude and Hoffmann, Achim G.},
2575 title = {Proceedings of the Nineteenth International Conference on Machine Learning (ICML'02)},
2576 booktitle = {Proceedings of the Nineteenth International Conference on Machine Learning (ICML'02)},
2577 year = {-1},
2578 publisher = {Morgan Kaufmann}
2579 }
2580
2581 @INPROCEEDINGS{ICML02-short,
2582 editor = {Sammut, Claude and Hoffmann, Achim G.},
2583 title = {Proceedings of the Nineteenth International Conference on Machine Learning (ICML'02)},
2584 booktitle = {ICML'02},
2585 year = {-1},
2586 publisher = {Morgan Kaufmann}
2587 }
2588
2589
2590 @INPROCEEDINGS{ICML03,
2591 editor = {Fawcett, Tom and Mishra, Nina},
2592 title = {Proceedings of the Twentieth International Conference on Machine Learning (ICML'03)},
2593 booktitle = {Proceedings of the Twentieth International Conference on Machine Learning (ICML'03)},
2594 year = {-1},
2595 publisher = {AAAI Press}
2596 }
2597
2598 @INPROCEEDINGS{ICML03-short,
2599 editor = {Fawcett, Tom and Mishra, Nina},
2600 title = {Proceedings of the Twentieth International Conference on Machine Learning (ICML'03)},
2601 booktitle = {ICML'03},
2602 year = {-1},
2603 publisher = {AAAI Press}
2604 }
2605
2606
2607 @INPROCEEDINGS{ICML04,
2608 editor = {Brodley, Carla E.},
2609 title = {Proceedings of the Twenty-first International Conference on Machine Learning (ICML'04)},
2610 booktitle = {Proceedings of the Twenty-first International Conference on Machine Learning (ICML'04)},
2611 year = {-1},
2612 publisher = {ACM}
2613 }
2614
2615 @INPROCEEDINGS{ICML04-short,
2616 editor = {Brodley, Carla E.},
2617 title = {Proceedings of the Twenty-first International Conference on Machine Learning (ICML'04)},
2618 booktitle = {ICML'04},
2619 year = {-1},
2620 publisher = {ACM}
2621 }
2622
2623
2624 @INPROCEEDINGS{ICML05-short,
2625 editor = {Raedt, Luc De and Wrobel, Stefan},
2626 title = {Proceedings of the Twenty-second International Conference on Machine Learning (ICML'05)},
2627 booktitle = {ICML'05},
2628 year = {-1},
2629 publisher = {ACM}
2630 }
2631
2632
2633 @INPROCEEDINGS{ICML06-short,
2634 editor = {Cohen, William W. and Moore, Andrew},
2635 title = {Proceedings of the Twenty-third International Conference on Machine Learning (ICML'06)},
2636 booktitle = {ICML'06},
2637 year = {-1},
2638 publisher = {ACM}
2639 }
2640
2641
2642 @INPROCEEDINGS{ICML07-short,
2643 editor = {Ghahramani, Zoubin},
2644 title = {Proceedings of the 24th International Conference on Machine Learning (ICML'07)},
2645 booktitle = {ICML'07},
2646 year = {-1},
2647 publisher = {ACM}
2648 }
2649
2650
2651 @INPROCEEDINGS{ICML08-short,
2652 editor = {Cohen, William W. and McCallum, Andrew and Roweis, Sam T.},
2653 title = {Proceedings of the Twenty-fifth International Conference on Machine Learning (ICML'08)},
2654 booktitle = {ICML'08},
2655 year = {-1},
2656 publisher = {ACM}
2657 }
2658
2659
2660 @INPROCEEDINGS{ICML09-short,
2661 editor = {Bottou, {L{\'{e}}on} and Littman, Michael},
2662 title = {Proceedings of the Twenty-sixth International Conference on Machine Learning (ICML'09)},
2663 booktitle = {ICML'09},
2664 year = {-1},
2665 publisher = {ACM}
2666 }
2667
2668
2669 @INPROCEEDINGS{ICML96,
2670 editor = {Saitta, L.},
2671 title = {Proceedings of the Thirteenth International Conference on Machine Learning (ICML'96)},
2672 booktitle = {Proceedings of the Thirteenth International Conference on Machine Learning (ICML'96)},
2673 year = {-1},
2674 publisher = {Morgan Kaufmann}
2675 }
2676
2677 @INPROCEEDINGS{ICML96-short,
2678 editor = {Saitta, L.},
2679 title = {Proceedings of the Thirteenth International Conference on Machine Learning (ICML'96)},
2680 booktitle = {ICML'96},
2681 year = {-1},
2682 publisher = {Morgan Kaufmann}
2683 }
2684
2685
2686 @INPROCEEDINGS{ICML97,
2687 editor = {Fisher, Douglas H.},
2688 title = {Proceedings of the Fourteenth International Conference on Machine Learning (ICML'97)},
2689 booktitle = {Proceedings of the Fourteenth International Conference on Machine Learning (ICML'97)},
2690 year = {-1},
2691 publisher = {Morgan Kaufmann}
2692 }
2693
2694 @INPROCEEDINGS{ICML97-short,
2695 editor = {Fisher, Douglas H.},
2696 title = {Proceedings of the Fourteenth International Conference on Machine Learning (ICML'97)},
2697 booktitle = {ICML'97},
2698 year = {-1},
2699 publisher = {Morgan Kaufmann}
2700 }
2701
2702
2703 @INPROCEEDINGS{ICML98,
2704 editor = {Shavlik, Jude W.},
2705 title = {Proceedings of the Fifteenth International Conference on Machine Learning (ICML'98)},
2706 booktitle = {Proceedings of the Fifteenth International Conference on Machine Learning (ICML'98)},
2707 year = {-1},
2708 publisher = {Morgan Kaufmann}
2709 }
2710
2711 @INPROCEEDINGS{ICML98-short,
2712 editor = {Shavlik, Jude W.},
2713 title = {Proceedings of the Fifteenth International Conference on Machine Learning (ICML'98)},
2714 booktitle = {ICML'98},
2715 year = {-1},
2716 publisher = {Morgan Kaufmann}
2717 }
2718
2719
2720 @INPROCEEDINGS{ICML99,
2721 editor = {Bratko, Ivan and Dzeroski, Saso},
2722 title = {Proceedings of the Sixteenth International Conference on Machine Learning (ICML'99)},
2723 booktitle = {Proceedings of the Sixteenth International Conference on Machine Learning (ICML'99)},
2724 year = {-1},
2725 publisher = {Morgan Kaufmann}
2726 }
2727
2728 @INPROCEEDINGS{ICML99-short,
2729 editor = {Bratko, Ivan and Dzeroski, Saso},
2730 title = {Proceedings of the Sixteenth International Conference on Machine Learning (ICML'99)},
2731 booktitle = {ICML'99},
2732 year = {-1},
2733 publisher = {Morgan Kaufmann}
2734 }
2735
2736
2737 @INCOLLECTION{jaeger+eck:2007,
2738 author = {Jaeger, H. and Eck, Douglas},
2739 title = {Can't get you out of my head: {A} connectionist model of cyclic rehearsal},
2740 booktitle = {Modeling Communications with Robots and Virtual Humans},
2741 series = {{LNCS}},
2742 year = {2007},
2743 publisher = {Springer-Verlag},
2744 url = {http://www.iro.umontreal.ca/~eckdoug/papers/2007_jaeger_eck.pdf},
2745 source={OwnPublication},
2746 sourcetype={Chapter},
2747 }
2748
2749 @MISC{James+al-snowbird-2008,
2750 author = {Bergstra, James and Bengio, Yoshua and Louradour, Jerome},
2751 title = {Image Classification using Higher-Order Neural Models},
2752 year = {2008},
2753 howpublished = {The Learning Workshop (Snowbird, Utah)},
2754 url = {http://snowbird.djvuzone.org/2007/abstracts/161.pdf}
2755 }
2756
2757 @ARTICLE{JMLR-short,
2758 journal = {JMLR},
2759 year = {-1}
2760 }
2761
2762
2763 @INPROCEEDINGS{Kegl+Bertin+Eck-2008,
2764 author = {K{\'{e}}gl, Bal{\'{a}}zs and Bertin-Mahieux, Thierry and Eck, Douglas},
2765 title = {Metropolis-Hastings Sampling in a FilterBoost Music Classifier},
2766 booktitle = {Music and machine learning workshop (ICML08)},
2767 year = {2008}
2768 }
2769
2770 @INPROCEEDINGS{kegl2005b,
2771 author = {K{\'{e}}gl, Bal{\'{a}}zs},
2772 title = {Generalization Error and Algorithmic Convergence of Median Boosting},
2773 year = {2005},
2774 crossref = {NIPS17-shorter},
2775 abstract = {We have recently proposed an extension of ADABOOST to regression that uses the median of the base regressors as the final regressor. In this paper we extend theoretical results obtained for ADABOOST to median boosting and to its localized variant. First, we extend recent results on efficient margin maximizing to show that the algorithm can converge to the maximum achievable margin within a preset precision in a finite number of steps. Then we provide confidence-interval-type bounds on the generalization error.}
2776 }
2777
2778 @ARTICLE{lacoste+eck:eurasip,
2779 author = {Lacoste, Alexandre and Eck, Douglas},
2780 title = {A Supervised Classification Algorithm For Note Onset Detection},
2781 journal = {EURASIP Journal on Applied Signal Processing},
2782 volume = {2007},
2783 number = {ID 43745},
2784 year = {2007},
2785 pages = {1--13},
2786 source={OwnPublication},
2787 sourcetype={Journal},
2788 }
2789
2790 @MASTERSTHESIS{Lajoie2009,
2791 author = {Lajoie, Isabelle},
2792 keywords = {apprentissage non-supervis{\'{e}}, architecture profonde, auto-encodeur d{\'{e}}bruiteur, machine de {Boltzmann} restreinte, r{\'{e}}seau de neurones artificiel},
2793 title = {Apprentissage de repr{\'{e}}sentations sur-compl{\`{e}}tes par entra{\^{\i}}nement d'auto-encodeurs},
2794 year = {2009},
2795 school = {Universit{\'{e}} de Montr{\'{e}}al},
2796 abstract = {Les avanc{\'{e}}s dans le domaine de l’intelligence artificielle, permettent {\`{a}} des syst{\`{e}}mes
2797 informatiques de r{\'{e}}soudre des t{\^{a}}ches de plus en plus complexes li{\'{e}}es par exemple {\`{a}}
2798 la vision, {\`{a}} la compr{\'{e}}hension de signaux sonores ou au traitement de la langue. Parmi
2799 les mod{\`{e}}les existants, on retrouve les R{\'{e}}seaux de Neurones Artificiels (RNA), dont la
2800 popularit{\'{e}} a fait un grand bond en avant avec la d{\'{e}}couverte de Hinton et al. [22], soit
2801 l’utilisation de Machines de {Boltzmann} Restreintes (RBM) pour un pr{\'{e}}-entra{\^{\i}}nement
2802 non-supervis{\'{e}} couche apr{\`{e}}s couche, facilitant grandement l’entra{\^{\i}}nement supervis{\'{e}} du
2803 r{\'{e}}seau {\`{a}} plusieurs couches cach{\'{e}}es (DBN), entra{\^{\i}}nement qui s’av{\'{e}}rait jusqu’alors tr{\`{e}}s
2804 difficile {\`{a}} r{\'{e}}ussir. Depuis cette d{\'{e}}couverte, des chercheurs ont {\'{e}}tudi{\'{e}} l’efficacit{\'{e}} de nouvelles strat{\'{e}}gies de pr{\'{e}}-entra{\^{\i}}nement, telles que l’empilement d’auto-encodeurs traditionnels (SAE) [5, 38], et l’empilement d’auto-encodeur d{\'{e}}bruiteur (SDAE) [44].
2805 C’est dans ce contexte qu’a d{\'{e}}but{\'{e}} la pr{\'{e}}sente {\'{e}}tude. Apr{\`{e}}s un bref passage en revue des notions de base du domaine de l’apprentissage machine et des m{\'{e}}thodes de
2806 pr{\'{e}}-entra{\^{\i}}nement employ{\'{e}}es jusqu’{\`{a}} pr{\'{e}}sent avec les modules RBM, AE et DAE, nous
2807 avons approfondi notre compr{\'{e}}hension du pr{\'{e}}-entra{\^{\i}}nement de type SDAE, explor{\'{e}} ses
2808 diff{\'{e}}rentes propri{\'{e}}t{\'{e}}s et {\'{e}}tudi{\'{e}} des variantes de SDAE comme strat{\'{e}}gie d’initialisation
2809 d’architecture profonde. Nous avons ainsi pu, entre autres choses, mettre en lumi{\`{e}}re
2810 l’influence du niveau de bruit, du nombre de couches et du nombre d’unit{\'{e}}s cach{\'{e}}es
2811 sur l’erreur de g{\'{e}}n{\'{e}}ralisation du SDAE. Nous avons constat{\'{e}} une am{\'{e}}lioration de la
2812 performance sur la t{\^{a}}che supervis{\'{e}}e avec l’utilisation des bruits poivre et sel (PS) et
2813 gaussien (GS), bruits s’av{\'{e}}rant mieux justifi{\'{e}}s que celui utilis{\'{e}} jusqu’{\`{a}} pr{\'{e}}sent, soit le
2814 masque {\`{a}} z{\'{e}}ro (MN). De plus, nous avons d{\'{e}}montr{\'{e}} que la performance profitait d’une
2815 emphase impos{\'{e}}e sur la reconstruction des donn{\'{e}}es corrompues durant l’entra{\^{\i}}nement
2816 des diff{\'{e}}rents DAE. Nos travaux ont aussi permis de r{\'{e}}v{\'{e}}ler que le DAE {\'{e}}tait en mesure d’apprendre, sur des images naturelles, des filtres semblables {\`{a}} ceux retrouv{\'{e}}s dans
2817 les cellules V1 du cortex visuel, soit des filtres d{\'{e}}tecteurs de bordures. Nous aurons par
2818 ailleurs pu montrer que les repr{\'{e}}sentations apprises du SDAE, compos{\'{e}}es des caract{\'{e}}ristiques ainsi extraites, s’av{\'{e}}raient fort utiles {\`{a}} l’apprentissage d’une machine {\`{a}} vecteurs de
2819 support ({SVM}) lin{\'{e}}aire ou {\`{a}} noyau gaussien, am{\'{e}}liorant grandement sa performance de
2820 g{\'{e}}n{\'{e}}ralisation. Aussi, nous aurons observ{\'{e}} que similairement au DBN, et contrairement
2821 au SAE, le SDAE poss{\'{e}}dait une bonne capacit{\'{e}} en tant que mod{\`{e}}le g{\'{e}}n{\'{e}}rateur. Nous
2822 avons {\'{e}}galement ouvert la porte {\`{a}} de nouvelles strat{\'{e}}gies de pr{\'{e}}-entra{\^{\i}}nement et d{\'{e}}couvert le potentiel de l’une d’entre elles, soit l’empilement d’auto-encodeurs rebruiteurs
2823 (SRAE).}
2824 }
2825
2826 @INPROCEEDINGS{lamere+eck:ismir2007,
2827 author = {Lamere, Paul and Eck, Douglas},
2829 title = {Using 3D Visualizations to Explore and Discover Music},
2830 booktitle = {{Proceedings of the 8th International Conference on Music Information Retrieval ({ISMIR} 2007)}},
2831 year = {2007},
2833 source={OwnPublication},
2834 sourcetype={Conference},
2835 }
2836
2837 @ARTICLE{Larochelle+al-2010,
2838 author = {Larochelle, Hugo and Bengio, Yoshua and Turian, Joseph},
2839 title = {Tractable Multivariate Binary Density Estimation and the Restricted {Boltzmann} Forest},
2840 journal = {Neural Computation},
2841 year = {2010},
2842 note = {To appear}
2843 }
2844
2845 @INPROCEEDINGS{Larochelle+Bengio-2008,
2846 author = {Larochelle, Hugo and Bengio, Yoshua},
2847 title = {Classification using Discriminative Restricted {B}oltzmann Machines},
2848 year = {2008},
2849 pages = {536--543},
2850 crossref = {ICML08-shorter},
2851 abstract = {Recently, many applications for Restricted {Boltzmann} Machines (RBMs) have been developed for a large variety of learning problems. However, RBMs are usually used as feature extractors for another learning algorithm or to provide a good initialization
2852 for deep feed-forward neural network classifiers, and are not considered as a standalone solution to classification problems. In
2853 this paper, we argue that RBMs provide a self-contained framework for deriving competitive non-linear classifiers. We present an evaluation of different learning algorithms for
2854 RBMs which aim at introducing a discriminative component to RBM training and improve their performance as classifiers. This
2855 approach is simple in that RBMs are used directly to build a classifier, rather than as a stepping stone. Finally, we demonstrate how discriminative RBMs can also be successfully employed in a semi-supervised setting.}
2856 }
2857
2858 @INPROCEEDINGS{Larochelle-2009,
2859 author = {Larochelle, Hugo and Erhan, Dumitru and Vincent, Pascal},
2860 title = {Deep Learning using Robust Interdependent Codes},
2861 booktitle = {Proceedings of the Twelfth International Conference on Artificial Intelligence and Statistics (AISTATS 2009)},
2862 year = {2009},
2863 pages = {312--319},
2864 date = {April 16--18, 2009},
2865 }
2866
2867 @ARTICLE{Larochelle-jmlr-2009,
2868 author = {Larochelle, Hugo and Bengio, Yoshua and Louradour, Jerome and Lamblin, Pascal},
2869 title = {Exploring Strategies for Training Deep Neural Networks},
2870 volume = {10},
2871 year = {2009},
2872 pages = {1--40},
2873 crossref = {JMLR-shorter},
2874 abstract = {Deep multi-layer neural networks have many levels of non-linearities allowing them to compactly represent highly non-linear and highly-varying functions. However, until recently it was not clear how to train such deep networks, since gradient-based optimization starting from random initialization often appears to get stuck in poor solutions. Hinton et al. recently proposed a greedy layer-wise unsupervised learning procedure relying on the training algorithm of restricted {Boltzmann} machines (RBM) to initialize the parameters of a deep belief network (DBN), a generative model with many layers of hidden causal variables. This was followed by the proposal of another greedy layer-wise procedure, relying on the usage of autoassociator networks. In the context of the above optimization problem, we study these algorithms empirically to better understand their success. Our experiments confirm the hypothesis that the greedy layer-wise unsupervised training strategy helps the optimization by initializing weights in a region near a good local minimum, but also implicitly acts as a sort of regularization that brings better generalization and encourages internal distributed representations that are high-level abstractions of the input. We also present a series of experiments aimed at evaluating the link between the performance of deep neural networks and practical aspects of their topology, for example, demonstrating cases where the addition of more depth helps. Finally, we empirically explore simple variants of these training algorithms, such as the use of different RBM input unit distributions, a simple way of combining gradient estimators to improve performance, as well as on-line versions of those algorithms.}
2875 }
2876
2877 @PHDTHESIS{Larochelle-PhD-2009,
2878 author = {Larochelle, Hugo},
2879 keywords = {apprentissage non-supervis{\'{e}}, architecture profonde, autoassociateur, autoencodeur, machine de {Boltzmann} restreinte, r{\'{e}}seau de neurones artificiel},
2880 title = {{\'{E}}tude de techniques d'apprentissage non-supervis{\'{e}} pour l'am{\'{e}}lioration de l'entra{\^{\i}}nement supervis{\'{e}} de mod{\`{e}}les connexionnistes},
2881 year = {2009},
school = {Universit{\'{e}} de Montr{\'{e}}al},
2883 abstract = {Le domaine de l'intelligence artificielle a pour objectif le d{\'{e}}veloppement de syst{\`{e}}mes informatiques capables de simuler des comportements normalement associ{\'{e}}s {\`{a}} l'intelligence humaine. On aimerait entre autres pouvoir construire une machine qui puisse
2884 r{\'{e}}soudre des t{\^{a}}ches li{\'{e}}es {\`{a}} la vision (e.g., la reconnaissance d'objet), au traitement de la langue (e.g., l'identification du sujet d'un texte) ou au traitement de signaux sonores (e.g., la reconnaissance de la parole).
2885 Une approche d{\'{e}}velopp{\'{e}}e afin de r{\'{e}}soudre ce genre de t{\^{a}}ches est bas{\'{e}}e sur l'apprentissage automatique de mod{\`{e}}les {\`{a}} partir de donn{\'{e}}es {\'{e}}tiquet{\'{e}}es refl{\'{e}}tant le comportement intelligent {\`{a}} {\'{e}}muler. Entre autre, il a {\'{e}}t{\'{e}} propos{\'{e}} de mod{\'{e}}liser le calcul n{\'{e}}cessaire {\`{a}} la
2886 r{\'{e}}solution d'une t{\^{a}}che {\`{a}} l'aide d'un r{\'{e}}seau de neurones artificiel, dont il est possible d'adapter le comportement {\`{a}} l'aide de la r{\'{e}}tropropagation [99, 131] d'un gradient informatif sur les erreurs commises par le r{\'{e}}seau. Populaire durant les ann{\'{e}}es 80, cette
2887 approche sp{\'{e}}cifique a depuis perdu partiellement de son attrait, suite au d{\'{e}}veloppement des m{\'{e}}thodes {\`{a}} noyau. Celles-ci sont souvent plus stables, plus faciles {\`{a}} utiliser et leur performance est souvent au moins aussi {\'{e}}lev{\'{e}}e pour une vaste gamme de probl{\`{e}}mes.
2888 Les m{\'{e}}thodes d'apprentissage automatique ont donc progress{\'{e}} dans leur fonctionnement, mais aussi dans la complexit{\'{e}} des probl{\`{e}}mes auxquels elles se sont attaqu{\'{e}}. Ainsi, plus r{\'{e}}cemment, des travaux [12, 15] ont commenc{\'{e}} {\`{a}} {\'{e}}mettre des doutes sur la capacit{\'{e}} des machines {\`{a}} noyau {\`{a}} pouvoir efficacement r{\'{e}}soudre des probl{\`{e}}mes de la complexit{\'{e}} requise par l'intelligence artificielle. Parall{\`{e}}lement, Hinton et al. [81] faisaient une perc{\'{e}}e dans l'apprentissage automatique de r{\'{e}}seaux de neurones, en proposant une proc{\'{e}}dure permettant l'entra{\^{\i}}nement de r{\'{e}}seaux de neurones d'une plus grande complexit{\'{e}} (i.e., avec plus de couches de neurones cach{\'{e}}es) qu'il n'{\'{e}}tait possible auparavant.
2889 C'est dans ce contexte qu'ont {\'{e}}t{\'{e}} conduits les travaux de cette th{\`{e}}se. Cette th{\`{e}}se d{\'{e}}bute par une exposition des principes de base de l'apprentissage automatique (chapitre 1) et une discussion des obstacles {\`{a}} l'obtention d'un mod{\`{e}}le ayant une bonne performance
2890 de g{\'{e}}n{\'{e}}ralisation (chapitre 2). Puis, sont pr{\'{e}}sent{\'{e}}es les contributions apport{\'{e}}es dans le cadre de cinq articles, contributions qui sont toutes bas{\'{e}}es sur l'utilisation d'une certaine
2891 forme d'apprentissage non-supervis{\'{e}}.
2892 Le premier article (chapitre 4) propose une m{\'{e}}thode d'entra{\^{\i}}nement pour un type sp{\'{e}}cifique de r{\'{e}}seau {\`{a}} une seule couche cach{\'{e}}e (la machine de {Boltzmann} restreinte) bas{\'{e}}e sur une combinaison des apprentissages supervis{\'{e}} et non-supervis{\'{e}}. Cette m{\'{e}}thode permet d'obtenir une meilleure performance de g{\'{e}}n{\'{e}}ralisation qu'un r{\'{e}}seau de neurones standard ou qu'une machine {\`{a}} vecteurs de support {\`{a}} noyau, et met en {\'{e}}vidence de fa{\c c}on
2893 explicite les b{\'{e}}n{\'{e}}fices qu'apporte l'apprentissage non-supervis{\'{e}} {\`{a}} l'entra{\^{\i}}nement d'un r{\'{e}}seau de neurones.
2894 Ensuite, dans le second article (chapitre 6), on {\'{e}}tudie et {\'{e}}tend la proc{\'{e}}dure d'entra{\^{\i}}nement propos{\'{e}}e par Hinton et al. [81]. Plus sp{\'{e}}cifiquement, on y propose une approche diff{\'{e}}rente mais plus flexible pour initialiser un r{\'{e}}seau {\`{a}} plusieurs couches cach{\'{e}}es, bas{\'{e}}e sur un r{\'{e}}seau autoassociateur. On y explore aussi l'impact du nombre de couches et de neurones par couche sur la performance d'un r{\'{e}}seau et on y d{\'{e}}crit diff{\'{e}}rentes variantes mieux adapt{\'{e}}es {\`{a}} l'apprentissage en ligne ou pour donn{\'{e}}es {\`{a}} valeurs continues.
2895 Dans le troisi{\`{e}}me article (chapitre 8), on explore plut{\^{o}}t la performance de r{\'{e}}seaux profonds sur plusieurs probl{\`{e}}mes de classification diff{\'{e}}rents. Les probl{\`{e}}mes choisis ont la propri{\'{e}}t{\'{e}} d'avoir {\'{e}}t{\'{e}} g{\'{e}}n{\'{e}}r{\'{e}}s {\`{a}} partir de plusieurs facteurs de variation. Cette propri{\'{e}}t{\'{e}}, qui caract{\'{e}}rise les probl{\`{e}}mes li{\'{e}}s {\`{a}} l'intelligence artificielle, pose difficult{\'{e}} aux machines {\`{a}} noyau, tel que confirm{\'{e}} par les exp{\'{e}}riences de cet article.
2896 Le quatri{\`{e}}me article (chapitre 10) pr{\'{e}}sente une am{\'{e}}lioration de l'approche bas{\'{e}}e sur les r{\'{e}}seaux autoassociateurs. Cette am{\'{e}}lioration applique une modification simple {\`{a}} la proc{\'{e}}dure d'entra{\^{\i}}nement d'un r{\'{e}}seau autoassociateur, en « bruitant » les entr{\'{e}}es du r{\'{e}}seau afin que celui-ci soit forc{\'{e}} {\`{a}} la d{\'{e}}bruiter.
2897 Le cinqui{\`{e}}me et dernier article (chapitre 12) apporte une autre am{\'{e}}lioration aux r{\'{e}}seaux autoassociateurs, en permettant des interactions d'inhibition ou d'excitation entre les neurones cach{\'{e}}s de ces r{\'{e}}seaux. On y d{\'{e}}montre que de telles interactions peuvent
2898 {\^{e}}tre apprises et sont b{\'{e}}n{\'{e}}fiques {\`{a}} la performance d'un r{\'{e}}seau profond.}
2899 }
2900
2901 @INPROCEEDINGS{Larochelle2008,
2902 author = {Larochelle, Hugo and Erhan, Dumitru and Bengio, Yoshua},
2903 title = {Zero-data Learning of New Tasks},
2904 booktitle = {AAAI Conference on Artificial Intelligence},
2905 year = {2008},
2906 url = {http://www-etud.iro.umontreal.ca/~larocheh/publications/aaai2008_zero-data.pdf},
2907 abstract = {Recently, many applications for Restricted {Boltzmann} Machines (RBMs) have been developed for a large variety of learning problems. However, RBMs are usually used as feature extractors for another learning algorithm or to provide a good initialization
2908 for deep feed-forward neural network classifiers, and are not considered as a standalone solution to classification problems. In
2909 this paper, we argue that RBMs provide a self-contained framework for deriving competitive non-linear classifiers. We present an evaluation of different learning algorithms for
2910 RBMs which aim at introducing a discriminative component to RBM training and improve their performance as classifiers. This
2911 approach is simple in that RBMs are used directly to build a classifier, rather than as a stepping stone. Finally, we demonstrate how discriminative RBMs can also be successfully employed in a semi-supervised setting.}
2912 }
2913
2914 @INPROCEEDINGS{LarochelleH2007,
2915 author = {Larochelle, Hugo and Erhan, Dumitru and Courville, Aaron and Bergstra, James and Bengio, Yoshua},
2916 title = {An Empirical Evaluation of Deep Architectures on Problems with Many Factors of Variation},
2917 year = {2007},
2918 pages = {473--480},
2919 crossref = {ICML07-shorter},
2920 abstract = {Recently, several learning algorithms relying on models with deep architectures have been proposed. Though they have demonstrated impressive performance, to date, they have only been evaluated on relatively simple problems such as digit recognition in a controlled environment, for which many machine learning algorithms already report reasonable results. Here, we present a series of experiments which indicate that these models show promise in solving harder learning problems that exhibit many factors of variation. These models are compared with well-established algorithms such as Support Vector Machines and single hidden-layer feed-forward neural networks.}
2921 }
2922
@MASTERSTHESIS{Latendresse-MSc,
author = {Latendresse, Simon},
title = {L'utilisation d'hyper-param{\`{e}}tres pour la s{\'{e}}lection de variables},
year = {1999},
school = {Universit{\'{e}} de Montr{\'{e}}al, Dept. IRO},
note = {(in French)}
}
2930
@MASTERSTHESIS{Lauzon99,
author = {Lauzon, Vincent-Philippe},
title = {Mod{\'{e}}les statistiques comme algorithmes d'apprentissage et {MMCC}s; pr{\'{e}}diction de s{\'{e}}ries financi{\`{e}}res},
year = {1999},
school = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
crossref = {DIRO}
}
2938
@INPROCEEDINGS{lecun-93,
author = {{LeCun}, Yann and Bengio, Yoshua and Henderson, Donnie and Weisbuch, A. and Weissman, H. and Jackel, L.},
title = {On-line handwriting recognition with neural networks: spatial representation versus temporal representation},
booktitle = {Proc. International Conference on handwriting and drawing},
year = {1993},
publisher = {{\'{E}}cole Nationale Sup{\'{e}}rieure des T{\'{e}}l{\'{e}}communications},
url = {http://www.iro.umontreal.ca/~lisa/pointeurs/lecun-93.ps.gz},
topics={PriorKnowledge,Speech},cat={C},
}
2948
@INPROCEEDINGS{lecun-99,
author = {{LeCun}, Yann and Haffner, Patrick and Bottou, {L{\'{e}}on} and Bengio, Yoshua},
editor = {Forsyth, D.},
title = {Object Recognition with Gradient-Based Learning},
booktitle = {Shape, Contour and Grouping in Computer Vision},
year = {1999},
pages = {319--345},
publisher = {Springer},
url = {orig/lecun-99.ps.gz},
topics={PriorKnowledge,Speech},cat={B},
}
2960
2961 @TECHREPORT{lecun-99b,
2962 author = {{LeCun}, Yann and Haffner, Patrick and Bottou, {L{\'{e}}on} and Bengio, Yoshua},
2963 title = {Gradient-Based Learning for Object Detection, Segmentation and Recognition},
2964 year = {1999},
2965 institution = {AT\&T Labs},
2966 url = {orig/lecun-99b.ps.gz},
2967 topics={Speech},cat={T},
2968 }
2969
2970 @INPROCEEDINGS{lecun-bengio-94,
2971 author = {{LeCun}, Yann and Bengio, Yoshua},
2972 title = {Word-level training of a handwritten word recognizer based on convolutional neural networks},
2973 booktitle = {Proc. of the International Conference on Pattern Recognition},
2974 volume = {II},
2975 year = {1994},
2976 pages = {88--92},
2977 publisher = {IEEE},
2978 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/icpr-word.ps},
abstract = {We introduce a new approach for on-line recognition of handwritten words written in unconstrained mixed style. Words are represented by low resolution ``annotated images'' where each pixel contains information about trajectory direction and curvature. The recognizer is a convolution network which can be spatially replicated. From the network output, a hidden {Markov} model produces word scores. The entire system is globally trained to minimize word-level errors.},
2980 topics={Speech},cat={C},
2981 }
2982
2983 @INPROCEEDINGS{lecun-bengio-95a,
2984 author = {{LeCun}, Yann and Bengio, Yoshua},
2985 editor = {Arbib, M. A.},
2986 title = {Convolutional Networks for Images, Speech, and Time-Series},
2987 booktitle = {The Handbook of Brain Theory and Neural Networks},
2988 year = {1995},
2989 pages = {255--257},
2990 publisher = {MIT Press},
2991 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/handbook-convo.pdf},
2992 topics={PriorKnowledge,Speech},cat={C},
2993 }
2994
2995 @INCOLLECTION{lecun-bengio-95b,
2996 author = {{LeCun}, Yann and Bengio, Yoshua},
2997 editor = {Arbib, M. A.},
2998 title = {Pattern Recognition and Neural Networks},
2999 booktitle = {The Handbook of Brain Theory and Neural Networks},
3000 year = {1995},
3001 pages = {711--714},
3002 publisher = {MIT Press},
3003 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/handbook-patrec.pdf},
3004 topics={PriorKnowledge,Speech},cat={B},
3005 }
3006
3007 @ARTICLE{LeCun98,
3008 author = {{LeCun}, Yann and Bottou, {L{\'{e}}on} and Bengio, Yoshua and Haffner, Patrick},
3009 title = {Gradient-Based Learning Applied to Document Recognition},
3010 journal = {Proceedings of the IEEE},
3011 volume = {86},
3012 number = {11},
3013 year = {1998},
3014 pages = {2278--2324},
3015 abstract = {Multilayer Neural Networks trained with the backpropagation algorithm constitute the best example of a successful Gradient-Based Learning technique. Given an appropriate network architecture, Gradient-Based Learning algorithms can be used to synthesize a complex decision surface that can classify high-dimensional patterns such as handwritten characters, with minimal preprocessing. This paper reviews various methods applied to handwritten character recognition and compares them on a standard handwritten digit recognition task. Convolutional Neural Networks, that are specifically designed to deal with the variability of 2D shapes, are shown to outperform all other techniques.
Real-life document recognition systems are composed of multiple modules including field extraction, segmentation, recognition, and language modeling. A new learning paradigm, called Graph Transformer Networks (GTN), allows such multi-module systems to be trained globally using Gradient-Based methods so as to minimize an overall performance measure.
3017 Two systems for on-line handwriting recognition are described. Experiments demonstrate the advantage of global training, and the flexibility of Graph Transformer Networks.
A Graph Transformer Network for reading bank check is also described. It uses Convolutional Neural Network character recognizers combined with global training techniques to provide record accuracy on business and personal checks. It is deployed commercially and reads several million checks per day.},
3019 topics={PriorKnowledge,Speech},cat={C},
3020 }
3021
3022 @INPROCEEDINGS{Lecun_icassp97,
3023 author = {{LeCun}, Yann and Bottou, {L{\'{e}}on} and Bengio, Yoshua},
3024 title = {Reading Checks with graph transformer networks},
3025 booktitle = {International Conference on Acoustics, Speech and Signal Processing},
3026 volume = {1},
3027 year = {1997},
3028 pages = {151--154},
3029 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/lecun-bottou-bengio-97.ps.gz},
3030 topics={Speech},cat={C},
3031 }
3032
3033 @ARTICLE{LeRoux+Bengio-2010,
3034 author = {Le Roux, Nicolas and Bengio, Yoshua},
3035 title = {Deep Belief Networks are Compact Universal Approximators},
3036 journal = {Neural Computation},
3037 year = {2010},
3038 note = {To appear}
3039 }
3040
3041 @TECHREPORT{LeRoux-Bengio-2007-TR,
3042 author = {Le Roux, Nicolas and Bengio, Yoshua},
3043 title = {Representational Power of Restricted {B}oltzmann Machines and Deep Belief Networks},
3044 number = {1294},
3045 year = {2007},
3046 institution = {D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
3047 abstract = {Deep Belief Networks (DBN) are generative neural network models with
3048 many layers of hidden explanatory factors, recently introduced by Hinton et al.,
3049 along with a greedy layer-wise unsupervised learning algorithm. The building
3050 block of a DBN is a probabilistic model called a Restricted {Boltzmann} Machine
3051 (RBM), used to represent one layer of the model. Restricted {Boltzmann} Machines
3052 are interesting because inference is easy in them, and because they have been
3053 successfully used as building blocks for training deeper models.
3054 We first prove that adding hidden units yields strictly improved modeling
3055 power, while a second theorem shows that RBMs are universal approximators of
3056 discrete distributions. We then study the question of whether DBNs with more
3057 layers are strictly more powerful in terms of representational power. This
3058 suggests a new and less greedy criterion for training RBMs within DBNs.}
3059 }
3060
3061 @ARTICLE{LeRoux-Bengio-2008,
3062 author = {Le Roux, Nicolas and Bengio, Yoshua},
3063 title = {Representational Power of Restricted {B}oltzmann Machines and Deep Belief Networks},
3064 journal = {Neural Computation},
3065 volume = {20},
3066 number = {6},
3067 year = {2008},
3068 pages = {1631--1649},
3069 abstract = {Deep Belief Networks (DBN) are generative neural network models with many layers of hidden explanatory factors, recently introduced by Hinton et al., along with a greedy layer-wise unsupervised learning algorithm. The building block of a DBN is a probabilistic model called a Restricted {Boltzmann} Machine (RBM), used to represent one layer of the model. Restricted {Boltzmann} Machines are interesting because inference is easy in them, and because they have been successfully used as building blocks for training deeper models. We first prove that adding hidden units yields strictly improved modelling power, while a second theorem shows that RBMs are universal approximators of discrete distributions. We then study the question of whether DBNs with more layers are strictly more powerful in terms of representational power. This suggests a new and less greedy criterion for training RBMs within DBNs.}
3070 }
3071
@INPROCEEDINGS{LeRoux-continuous,
author = {Le Roux, Nicolas and Bengio, Yoshua},
title = {Continuous Neural Networks},
booktitle = {Proceedings of the Eleventh International Conference on Artificial Intelligence and Statistics (AISTATS'07)},
year = {2007},
publisher = {Omnipress},
abstract = {This article extends neural networks to the case of an uncountable number of hidden units, in several ways. In the first approach proposed, a finite parametrization is possible, allowing gradient-based learning. While having the same number of parameters as an ordinary neural network, its internal structure suggests that it can represent some smooth functions much more compactly. Under mild assumptions, we also find better error bounds than with ordinary neural networks. Furthermore, this parametrization may help reducing the problem of saturation of the neurons. In a second approach, the input-to-hidden weights are fully non-parametric, yielding a kernel machine for which we demonstrate a simple kernel formula. Interestingly, the resulting kernel machine can be made hyperparameter-free and still generalizes in spite of an absence of explicit regularization.}
}
3080
3081 @PHDTHESIS{LeRoux-PhD-2008,
3082 author = {Le Roux, Nicolas},
3083 title = {Avanc{\'{e}}es th{\'{e}}oriques sur la repr{\'{e}}sentation et l'optimisation des r{\'{e}}seaux de neurones},
3084 year = {2008},
3085 school = {Universit{\'{e}} de Montr{\'{e}}al},
3086 abstract = {Les r{\'{e}}seaux de neurones artificiels ont {\'{e}}t{\'{e}} abondamment utilis{\'{e}}s dans la communaut{\'{e}} de l'apprentissage machine depuis les ann{\'{e}}es 80. Bien qu'ils aient {\'{e}}t{\'{e}} {\'{e}}tudi{\'{e}}s pour la premi{\`{e}}re fois il y a cinquante ans par Rosenblatt [68], ils ne furent r{\'{e}}ellement populaires qu'apr{\`{e}}s l'apparition de la r{\'{e}}tropropagation du gradient, en 1986 [71].
3087 En 1989, il a {\'{e}}t{\'{e}} prouv{\'{e}} [44] qu'une classe sp{\'{e}}cifique de r{\'{e}}seaux de neurones (les r{\'{e}}seaux de neurones {\`{a}} une couche cach{\'{e}}e) {\'{e}}tait suffisamment puissante pour pouvoir approximer presque n'importe quelle fonction avec une pr{\'{e}}cision arbitraire : le th{\'{e}}or{\`{e}}me d'approximation universelle. Toutefois, bien que ce th{\'{e}}or{\`{e}}me e{\^{u}}t pour cons{\'{e}}quence un int{\'{e}}r{\^{e}}t accru pour les r{\'{e}}seaux de neurones, il semblerait qu'aucun effort n'ait {\'{e}}t{\'{e}} fait pour profiter de cette propri{\'{e}}t{\'{e}}.
3088 En outre, l'optimisation des r{\'{e}}seaux de neurones {\`{a}} une couche cach{\'{e}}e n'est pas convexe. Cela a d{\'{e}}tourn{\'{e}} une grande partie de la communaut{\'{e}} vers d'autres algorithmes, comme par exemple les machines {\`{a}} noyau (machines {\`{a}} vecteurs de support et r{\'{e}}gression
3089 {\`{a}} noyau, entre autres).
3090 La premi{\`{e}}re partie de cette th{\`{e}}se pr{\'{e}}sentera les concepts d'apprentissage machine g{\'{e}}n{\'{e}}raux n{\'{e}}cessaires {\`{a}} la compr{\'{e}}hension des algorithmes utilis{\'{e}}s. La deuxi{\`{e}}me partie se focalisera plus sp{\'{e}}cifiquement sur les m{\'{e}}thodes {\`{a}} noyau et les r{\'{e}}seaux de neurones. La troisi{\`{e}}me partie de ce travail visera ensuite {\`{a}} {\'{e}}tudier les limitations des machines {\`{a}} noyaux et {\`{a}} comprendre les raisons pour lesquelles elles sont inadapt{\'{e}}es {\`{a}} certains probl{\`{e}}mes que nous avons {\`{a}} traiter.
3091 La quatri{\`{e}}me partie pr{\'{e}}sente une technique permettant d'optimiser les r{\'{e}}seaux de neurones {\`{a}} une couche cach{\'{e}}e de mani{\`{e}}re convexe. Bien que cette technique s'av{\`{e}}re difficilement exploitable pour des probl{\`{e}}mes de grande taille, une version approch{\'{e}}e permet d'obtenir une bonne solution dans un temps raisonnable.
3092 La cinqui{\`{e}}me partie se concentre sur les r{\'{e}}seaux de neurones {\`{a}} une couche cach{\'{e}}e infinie. Cela leur permet th{\'{e}}oriquement d'exploiter la propri{\'{e}}t{\'{e}} d'approximation universelle et ainsi d'approcher facilement une plus grande classe de fonctions.
3093 Toutefois, si ces deux variations sur les r{\'{e}}seaux de neurones {\`{a}} une couche cach{\'{e}}e leur conf{\`{e}}rent des propri{\'{e}}t{\'{e}}s int{\'{e}}ressantes, ces derniers ne peuvent extraire plus que des concepts de bas niveau. Les m{\'{e}}thodes {\`{a}} noyau souffrant des m{\^{e}}mes limites, aucun de
3094 ces deux types d'algorithmes ne peut appr{\'{e}}hender des probl{\`{e}}mes faisant appel {\`{a}} l'apprentissage de concepts de haut niveau.
3095 R{\'{e}}cemment sont apparus les Deep Belief Networks [39] qui sont des r{\'{e}}seaux de neurones {\`{a}} plusieurs couches cach{\'{e}}es entra{\^{\i}}n{\'{e}}s de mani{\`{e}}re efficace. Cette profondeur leur permet d'extraire des concepts de haut niveau et donc de r{\'{e}}aliser des t{\^{a}}ches hors
3096 de port{\'{e}}e des algorithmes conventionnels. La sixi{\`{e}}me partie {\'{e}}tudie des propri{\'{e}}t{\'{e}}s de ces r{\'{e}}seaux profonds.
3097 Les probl{\`{e}}mes que l'on rencontre actuellement n{\'{e}}cessitent non seulement des algorithmes capables d'extraire des concepts de haut niveau, mais {\'{e}}galement des m{\'{e}}thodes d'optimisation capables de traiter l'immense quantit{\'{e}} de donn{\'{e}}es parfois disponibles, si possible en temps r{\'{e}}el. La septi{\`{e}}me partie est donc la pr{\'{e}}sentation d'une nouvelle technique permettant une optimisation plus rapide.}
3098 }
3099
3100 @ARTICLE{lheureux-04,
3101 author = {{L'Heureux}, Pierre-Jean and Carreau, Julie and Bengio, Yoshua and Delalleau, Olivier and Yue, Shi Yi},
3102 title = {Locally Linear Embedding for dimensionality reduction in {QSAR}},
3103 journal = {Journal of Computer-Aided Molecular Design},
3104 volume = {18},
3105 year = {2004},
3106 pages = {475--482},
3107 abstract = {Current practice in Quantitative Structure Activity Relationship (QSAR) methods usually involves generating a great number of chemical descriptors and then cutting them back with variable selection techniques. Variable selection is an effective method to reduce the dimensionality but may discard some valuable information. This paper introduces Locally Linear Embedding ({LLE}), a local non-linear dimensionality reduction technique, that can statistically discover a low-dimensional representation of the chemical data. {LLE} is shown to create more stable representations than other non-linear dimensionality
3108 reduction algorithms, and to be capable of capturing non-linearity in chemical data.},
3109 topics={Bioinformatic},cat={J},
3110 }
3111
3112 @TECHREPORT{lm-TR00,
3113 author = {Bengio, Yoshua and Ducharme, R{\'{e}}jean and Vincent, Pascal},
3114 title = {A Neural Probabilistic Language Model},
3115 number = {1178},
3116 year = {2000},
3117 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
3118 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TR1178.pdf},
abstract = {A goal of statistical language modeling is to learn the joint probability function of sequences of words in a language. This is intrinsically difficult because of the curse of dimensionality: a word sequence on which the model will be tested is likely to be different from all the word sequences seen during training. Traditional but very successful approaches based on n-grams obtain generalization by concatenating very short overlapping sequences seen in the training set. We propose to fight the curse of dimensionality by learning a distributed representation for words which allows each training sentence to inform the model about an exponential number of semantically neighboring sentences. The model learns simultaneously (1) a distributed representation for each word along with (2) the probability function for word sequences, expressed in terms of these representations. Generalization is obtained because a sequence of words that has never been seen before gets high probability if it is made of words that are similar (in the sense of having a nearby representation) to words forming an already seen sentence. Training such large models (with millions of parameters) within a reasonable time is itself a significant challenge. We report on experiments using neural networks for the probability function, showing on two text corpora that the proposed approach very significantly improves on a state-of-the-art trigram model, and that the proposed approach allows to take advantage of much longer contexts.},
3120 topics={Markov,Unsupervised,Language},cat={T},
3121 }
3122
@INPROCEEDINGS{Maillet+al-2009,
author = {Maillet, Fran{\c c}ois and Eck, Douglas and Desjardins, Guillaume and Lamere, Paul},
title = {Steerable Playlist Generation by Learning Song Similarity from Radio Station Playlists},
booktitle = {Proceedings of the 10th International Conference on Music Information Retrieval},
year = {2009},
url = {http://www-etud.iro.umontreal.ca/~mailletf/papers/ismir09-playlist.pdf},
abstract = {This paper presents an approach to generating steerable playlists. We first demonstrate a method for learning song transition probabilities from audio features extracted from songs played in professional radio station playlists. We then show that by using this learnt similarity function as a prior, we are able to generate steerable playlists by choosing the next song to play not simply based on that prior, but on a tag cloud that the user is able to manipulate to express the high-level characteristics of the music he wishes to listen to.}
}
3131
3132 @INPROCEEDINGS{manzagol+bertinmahieux+eck:ismir2008,
3133 author = {Manzagol, Pierre-Antoine and Bertin-Mahieux, Thierry and Eck, Douglas},
3134 title = {On the Use of Sparse Time-Relative Auditory Codes for Music},
3135 booktitle = {{Proceedings of the 9th International Conference on Music Information Retrieval ({ISMIR} 2008)}},
3136 year = {2008},
3137 abstract = {Many if not most audio features used in MIR research are inspired by work done in speech recognition and are variations on the spectrogram. Recently, much attention has been given to new representations of audio that are sparse and time-relative. These representations are efficient and able to avoid the time-frequency trade-off of a spectrogram. Yet little work with music streams has been conducted and these features remain mostly unused in the MIR community. In this paper we further explore the use of these features for musical signals. In particular, we investigate their use on realistic music examples (i.e. released commercial music) and their use as input features for supervised learning. Furthermore, we identify three specific issues related to these features which will need to be further addressed in order to obtain the full benefit for MIR applications.},
3138 source={OwnPublication},
3139 sourcetype={Conference},
3140 }
3141
3142 @MASTERSTHESIS{Manzagol-Msc-2007,
3143 author = {Manzagol, Pierre-Antoine},
keywords = {Algorithme d'apprentissage, m{\'{e}}thode de second ordre, gradient naturel, approximation stochastique},
3145 title = {TONGA - Un algorithme de gradient naturel pour les probl{\`{e}}mes de grande taille},
3146 year = {2007},
3147 school = {Universit{\'{e}} de Montr{\'{e}}al},
abstract = {Les syst{\`{e}}mes adaptatifs sont confront{\'{e}}s {\`{a}} des donn{\'{e}}es qui {\'{e}}voluent rapidement en quantit{\'{e}} et en complexit{\'{e}}. Les avanc{\'{e}}es mat{\'{e}}rielles de l'informatique ne suffisent pas {\`{a}} compenser cet essor. Une mise {\`{a}} l'{\'{e}}chelle des techniques d'apprentissage est n{\'{e}}cessaire. D'une part, les mod{\`{e}}les doivent gagner en capacit{\'{e}} de repr{\'{e}}sentation. De l'autre, les algorithmes d'apprentissage doivent devenir plus efficaces.
Nos travaux se situent dans ce contexte des probl{\`{e}}mes de grande taille et portent sur l'am{\'{e}}lioration des algorithmes d'apprentissage. Deux {\'{e}}l{\'{e}}ments de r{\'{e}}ponse sont d{\'{e}}j{\`{a}} connus. Il s'agit des m{\'{e}}thodes de second ordre et de l'approximation stochastique. Or, les m{\'{e}}thodes de second ordre poss{\`{e}}dent des complexit{\'{e}}s en calculs et en m{\'{e}}moire qui sont prohibitives dans le cadre des probl{\`{e}}mes de grande taille. {\'{E}}galement, il est notoirement difficile de concilier ces m{\'{e}}thodes avec l'approximation stochastique. TONGA est un algorithme d'apprentissage con{\c c}u pour faire face {\`{a}} ces difficult{\'{e}}s. Il s'agit d'une implantation stochastique et adapt{\'{e}}e aux probl{\`{e}}mes de grande taille d'une m{\'{e}}thode de second ordre, le gradient naturel. Dans ce m{\'{e}}moire, nous examinons de pr{\`{e}}s ce nouvel algorithme d'apprentissage en le comparant sur plusieurs probl{\`{e}}mes au gradient stochastique, la technique d'optimisation commun{\'{e}}ment utilis{\'{e}}e dans le cadre des probl{\`{e}}mes de grande taille. Nos exp{\'{e}}riences montrent que TONGA est au moins tout aussi efficace que le gradient stochastique, ce qui est un accomplissement en soi. Dans certains cas, TONGA offre une convergence nettement sup{\'{e}}rieure {\`{a}} celle du gradient stochastique.}
3150 }
3151
3152 @INPROCEEDINGS{matic-94,
3153 author = {Matic, N. and Henderson, Donnie and {LeCun}, Yann and Bengio, Yoshua},
3154 title = {Pen-based visitor registration system (PENGUIN)},
3155 booktitle = {Conference Record of the Twenty-Eighth Asilomar Conference on Signals, Systems and Computers},
3156 year = {1994},
3157 publisher = {IEEE},
3158 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/matic-94.tiff},
abstract = {We describe a new electronic pen-based visitors registration system (PENGUIN) whose goal is to expand and modernize the visitor sign-in procedure at Bell Laboratories. The system uses a pen-interface (i.e. tablet-display) in what is essentially a form filling application. Our pen-interface is coupled with a powerful and accurate on-line handwriting recognition module. A database of AT\&T employees (the visitors' hosts) and country names is used to check the recognition module outputs, in order to find the best match. The system provides assistance to the guard at one of the guard stations in routing visitors to their hosts. All the entered data are stored electronically. Initial testing shows that PENGUIN system performs reliably and with high accuracy. It retrieves the correct host name with 97\% accuracy and the correct visitors citizenship with 99\% accuracy. The system is robust and easy to use for both visitors and guards},
3160 topics={Speech},cat={C},
3161 }
3162
@unpublished{mirex2005artist,
  author     = {Bergstra, James and Casagrande, Norman and Eck, Douglas},
  title      = {Artist Recognition: A Timbre- and Rhythm-Based Multiresolution Approach},
  year       = {2005},
  note       = {{MIREX} artist recognition contest},
  source     = {OwnPublication},
  sourcetype = {Other},
}

@unpublished{mirex2005genre,
  author     = {Bergstra, James and Casagrande, Norman and Eck, Douglas},
  title      = {Genre Classification: Timbre- and Rhythm-Based Multiresolution Audio Classification},
  year       = {2005},
  note       = {{MIREX} genre classification contest},
  source     = {OwnPublication},
  sourcetype = {Other},
}

@unpublished{mirex2005note,
  author     = {Lacoste, Alexandre and Eck, Douglas},
  title      = {Onset Detection with Artificial Neural Networks},
  year       = {2005},
  note       = {{MIREX} note onset detection contest},
  source     = {OwnPublication},
  sourcetype = {Other},
}

@unpublished{mirex2005tempo,
  author     = {Eck, Douglas and Casagrande, Norman},
  title      = {A Tempo-Extraction Algorithm Using an Autocorrelation Phase Matrix and Shannon Entropy},
  year       = {2005},
  note       = {{MIREX} tempo extraction contest (www.music-ir.org/\-evaluation/\-mirex-results)},
  source     = {OwnPublication},
  sourcetype = {Other},
}
3198
@inproceedings{mitacs-insurance01,
  author    = {Bengio, Yoshua and Chapados, Nicolas and Dugas, Charles and Ghosn, Joumana and Takeuchi, Ichiro and Vincent, Pascal},
  title     = {High-Dimensional Data Inference for Automobile Insurance Premia Estimation},
  booktitle = {Presented at the 2001 MITACS Annual Meeting},
  year      = {2001},
  url       = {http://www.iro.umontreal.ca/~lisa/pointeurs/mitacs_insurance.ps},
  topics={HighDimensional,Mining},cat={C},
}

@inproceedings{Morin+al-2005,
  author    = {Morin, Frederic and Bengio, Yoshua},
  editor    = {Cowell, Robert G. and Ghahramani, Zoubin},
  title     = {Hierarchical Probabilistic Neural Network Language Model},
  booktitle = {Proceedings of the Tenth International Workshop on Artificial Intelligence and Statistics},
  year      = {2005},
  pages     = {246--252},
  publisher = {Society for Artificial Intelligence and Statistics},
  url       = {http://www.iro.umontreal.ca/~lisa/pointeurs/hierarchical-nnlm-aistats05.pdf},
  abstract  = {In recent years, variants of a neural network architecture for statistical language modeling have been proposed and successfully applied, e.g. in the language modeling component of speech recognizers. The main advantage of these architectures is that they learn an embedding for words (or other symbols) in a continuous space that helps to smooth the language model and provide good generalization even when the number of training examples is insufficient. However, these models are extremely slow in comparison to the more commonly used n-gram models, both for training and recognition. As an alternative to an importance sampling method proposed to speed-up training, we introduce a hierarchical decomposition of the conditional probabilities that yields a speed-up of about 200 both during training and recognition. The hierarchical decomposition is a binary hierarchical clustering constrained by the prior knowledge extracted from the WordNet semantic hierarchy.},
  topics={Language},cat={C},
}
3220
@techreport{Nadeau-inference-TR99,
  author      = {Nadeau, Claude and Bengio, Yoshua},
  title       = {Inference and the Generalization Error},
  number      = {99s-45},
  year        = {1999},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
  url         = {http://www.iro.umontreal.ca/~lisa/pointeurs/techrep.pdf},
  abstract    = {We perform a theoretical investigation of the variance of the cross-validation estimate of the generalization error that takes into account the variability due to the choice of training sets and test examples. This allows us to propose two new estimators of this variance. We show, via simulations, that these new statistics perform well relative to the statistics considered in (Dietterich, 1998). In particular, tests of hypothesis based on these don't tend to be too liberal like other tests currently available, and have good power.},
  topics={Comparative},cat={T},
}
3231
@inproceedings{nadeau:2000:nips,
  author   = {Nadeau, Claude and Bengio, Yoshua},
  title    = {Inference for the Generalization Error},
  year     = {2000},
  pages    = {307--313},
  crossref = {NIPS12-shorter},
  abstract = {In order to compare learning algorithms, experimental results reported in the machine learning literature often use statistical tests of significance. Unfortunately, most of these tests do not take into account the variability due to the choice of training set. We perform a theoretical investigation of the variance of the cross-validation estimate of the generalization error that takes into account the variability due to the choice of training sets. This allows us to propose two new ways to estimate this variance. We show, via simulations, that these new statistics perform well relative to the statistics considered by Dietterich (Dietterich, 1998).},
  topics={Comparative},cat={C},
}
3241
@article{nadeau:2001,
  author   = {Nadeau, Claude and Bengio, Yoshua},
  title    = {Inference for the Generalization Error},
  journal  = {Machine Learning},
  year     = {2001},
  abstract = {In order to compare learning algorithms, experimental results reported in the machine learning literature often use statistical tests of significance to support the claim that a new learning algorithm generalizes better. Such tests should take into account the variability due to the choice of training set and not only that due to the test examples, as is often the case. This could lead to gross underestimation of the variance of the cross-validation estimator, and to the wrong conclusion that the new algorithm is significantly better when it is not. We perform a theoretical investigation of the variance of a cross-validation estimator of the generalization error that takes into account the variability due to the randomness of the training set as well as test examples. Our analysis shows that all the variance estimators that are based only on the results of the cross-validation experiment must be biased. This analysis allows us to propose new estimators of this variance. We show, via simulations, that tests of hypothesis about the generalization error using those new variance estimators have better properties than tests involving variance estimators currently in use and listed in (Dietterich, 1998). In particular, the new tests have correct size and good power. That is, the new tests do not reject the null hypothesis too often when the hypothesis is true, but they tend to frequently reject the null hypothesis when the latter is false.},
  topics={Comparative},cat={J},
}

@article{NC06,
  author   = {Bengio, Yoshua and Monperrus, Martin and Larochelle, Hugo},
  title    = {Nonlocal Estimation of Manifold Structure},
  journal  = {Neural Computation},
  volume   = {18},
  year     = {2006},
  pages    = {2509--2528},
  abstract = {We claim and present arguments to the effect that a large class of manifold
learning algorithms that are essentially local and can be framed as
kernel learning algorithms will suffer from the curse of dimensionality, at
the dimension of the true underlying manifold. This observation suggests
to explore non-local manifold learning algorithms which attempt to discover
shared structure in the tangent planes at different positions. A criterion for
such an algorithm is proposed and experiments estimating a tangent plane
prediction function are presented, showing its advantages with respect to
local manifold learning algorithms: it is able to generalize very far from
training data (on learning handwritten character image rotations), where a
local non-parametric method fails.},
  topics={HighDimensional,Kernel,Unsupervised},cat={J},
}
3271
@inproceedings{NIPS1-short,
  editor    = {Touretzky, D. S.},
  title     = {Advances in Neural Information Processing Systems 1 (NIPS'88)},
  booktitle = {NIPS 1},
  year      = {-1},
  publisher = {Morgan Kaufmann},
}

@inproceedings{NIPS10-short,
  editor    = {Jordan, M.I. and Kearns, M.J. and Solla, S.A.},
  title     = {Advances in Neural Information Processing Systems 10 (NIPS'97)},
  booktitle = {NIPS 10},
  year      = {-1},
  publisher = {MIT Press},
}

@inproceedings{NIPS11,
  editor    = {Kearns, M.J. and Solla, S.A.},
  title     = {Advances in Neural Information Processing Systems 11 (NIPS'98)},
  booktitle = {Advances in Neural Information Processing Systems 11 (NIPS'98)},
  year      = {-1},
  publisher = {MIT Press},
}

@inproceedings{NIPS11-short,
  editor    = {Kearns, M.J. and Solla, S.A.},
  title     = {Advances in Neural Information Processing Systems 11 (NIPS'98)},
  booktitle = {NIPS 11},
  year      = {-1},
  publisher = {MIT Press},
}

@inproceedings{NIPS12-short,
  editor    = {Solla, S.A. and Leen, T. K.},
  title     = {Advances in Neural Information Processing Systems 12 (NIPS'99)},
  booktitle = {NIPS 12},
  year      = {-1},
  publisher = {MIT Press},
}

@inproceedings{NIPS13-short,
  editor    = {Leen, T. K. and Dietterich, T.G.},
  title     = {Advances in Neural Information Processing Systems 13 (NIPS'00)},
  booktitle = {NIPS 13},
  year      = {-1},
  publisher = {MIT Press},
}

@inproceedings{NIPS14,
  editor    = {Dietterich, T.G. and Becker, S. and Ghahramani, Zoubin},
  title     = {Advances in Neural Information Processing Systems 14 (NIPS'01)},
  booktitle = {Advances in Neural Information Processing Systems 14 (NIPS'01)},
  year      = {-1},
  publisher = {MIT Press},
}

@inproceedings{NIPS14-short,
  editor    = {Dietterich, T.G. and Becker, S. and Ghahramani, Zoubin},
  title     = {Advances in Neural Information Processing Systems 14 (NIPS'01)},
  booktitle = {NIPS 14},
  year      = {-1},
  publisher = {MIT Press},
}

@inproceedings{NIPS15-short,
  editor    = {Becker, S. and Thrun, Sebastian},
  title     = {Advances in Neural Information Processing Systems 15 (NIPS'02)},
  booktitle = {NIPS 15},
  year      = {-1},
  publisher = {MIT Press},
}

@inproceedings{NIPS16-short,
  editor    = {Becker, S. and Saul, L. and {Sch{\"{o}}lkopf}, Bernhard},
  title     = {Advances in Neural Information Processing Systems 16 (NIPS'03)},
  booktitle = {NIPS 16},
  year      = {-1},
}

@inproceedings{NIPS17-short,
  editor    = {Saul, Lawrence K. and Weiss, Yair and Bottou, {L{\'{e}}on}},
  title     = {Advances in Neural Information Processing Systems 17 (NIPS'04)},
  booktitle = {NIPS 17},
  year      = {-1},
}

@inproceedings{NIPS18-short,
  editor    = {Weiss, Yair and {Sch{\"{o}}lkopf}, Bernhard and Platt, John},
  title     = {Advances in Neural Information Processing Systems 18 (NIPS'05)},
  booktitle = {NIPS 18},
  year      = {-1},
  publisher = {MIT Press},
}

@inproceedings{NIPS19-short,
  editor    = {{Sch{\"{o}}lkopf}, Bernhard and Platt, John and Hoffman, Thomas},
  title     = {Advances in Neural Information Processing Systems 19 (NIPS'06)},
  booktitle = {NIPS 19},
  year      = {-1},
  publisher = {MIT Press},
}

@inproceedings{NIPS2-short,
  editor    = {Touretzky, D. S.},
  title     = {Advances in Neural Information Processing Systems 2 (NIPS'89)},
  booktitle = {NIPS 2},
  year      = {-1},
  publisher = {Morgan Kaufmann},
}

@inproceedings{NIPS20-short,
  editor    = {Platt, John and Koller, D. and Singer, Yoram and Roweis, S.},
  title     = {Advances in Neural Information Processing Systems 20 (NIPS'07)},
  booktitle = {NIPS 20},
  year      = {-1},
  publisher = {MIT Press},
}
3401
3402
@inproceedings{NIPS2003_AA65,
  author    = {Bengio, Yoshua and Grandvalet, Yves},
  keywords  = {cross validation, error bars, generalization error inference, k-fold cross-validation, model selection, statistical comparison of algorithms, variance estimate},
  title     = {No Unbiased Estimator of the Variance of K-Fold Cross-Validation},
  year      = {2004},
  publisher = {MIT Press},
  url       = {http://www.iro.umontreal.ca/~lisa/pointeurs/var-kfold-part1-nips.pdf},
  crossref  = {NIPS16-shorter},
  abstract  = {Most machine learning researchers perform quantitative experiments to estimate generalization error and compare algorithm performances. In order to draw statistically convincing conclusions, it is important to estimate the uncertainty of such estimates. This paper studies the estimation of uncertainty around the K-fold cross-validation estimator. The main theorem shows that there exists no universal unbiased estimator of the variance of K-fold cross-validation. An analysis based on the eigendecomposition of the covariance matrix of errors helps to better understand the nature of the problem and shows that naive estimators may grossly underestimate variance, as confirmed by numerical experiments.},
  topics={Comparative},cat={C},
}
3414
@incollection{NIPS2005_424,
  author   = {Bengio, Yoshua and Delalleau, Olivier and Le Roux, Nicolas},
  title    = {The Curse of Highly Variable Functions for Local Kernel Machines},
  year     = {2006},
  pages    = {107--114},
  crossref = {NIPS18-shorter},
  abstract = {We present a series of theoretical arguments supporting the claim that a
large class of modern learning algorithms that rely solely on the smoothness
prior -- with similarity between examples expressed with a local
kernel -- are sensitive to the curse of dimensionality, or more precisely
to the variability of the target. Our discussion covers supervised, semisupervised
and unsupervised learning algorithms. These algorithms are
found to be local in the sense that crucial properties of the learned function
at x depend mostly on the neighbors of x in the training set. This
makes them sensitive to the curse of dimensionality, well studied for
classical non-parametric statistical learning. We show in the case of the
Gaussian kernel that when the function to be learned has many variations,
these algorithms require a number of training examples proportional to
the number of variations, which could be large even though there may exist
short descriptions of the target function, i.e. their Kolmogorov complexity
may be low. This suggests that there exist non-local learning
algorithms that at least have the potential to learn about such structured
but apparently complex functions (because locally they have many variations),
while not using very specific prior domain knowledge.},
  topics={HighDimensional,Kernel,Unsupervised},cat={C},
}
3441
@inproceedings{NIPS2005_456,
  author   = {K{\'{e}}gl, Bal{\'{a}}zs and Wang, Ligen},
  title    = {Boosting on Manifolds: Adaptive Regularization of Base Classifiers},
  year     = {2005},
  pages    = {665--672},
  crossref = {NIPS17-shorter},
  abstract = {In this paper we propose to combine two powerful ideas, boosting and manifold learning. On the one hand, we improve ADABOOST by incorporating knowledge on the structure of the data into base classifier design and selection. On the other hand, we use ADABOOST's efficient learning mechanism to significantly improve supervised and semi-supervised algorithms proposed in the context of manifold learning. Beside the specific manifold-based penalization, the resulting algorithm also accommodates the boosting of a large family of regularized learning algorithms.},
  topics={Boosting},cat={C},
}
3451
@incollection{NIPS2005_519,
  author   = {Grandvalet, Yves and Bengio, Yoshua},
  title    = {Semi-supervised Learning by Entropy Minimization},
  year     = {2005},
  pages    = {529--536},
  crossref = {NIPS17-shorter},
  abstract = {We consider the semi-supervised learning problem, where a decision rule is to be learned from labeled and unlabeled data. In this framework, we motivate minimum entropy regularization, which enables to incorporate unlabeled data in the standard supervised learning. Our approach includes other approaches to the semi-supervised problem as particular or limiting cases. A series of experiments illustrates that the proposed solution benefits from unlabeled data. The method challenges mixture models when the data are sampled from the distribution class spanned by the generative model. The performances are definitely in favor of minimum entropy regularization when generative models are misspecified, and the weighting of unlabeled data provides robustness to the violation of the ``cluster assumption''. Finally, we also illustrate that the method can also be far superior to manifold learning in high dimension spaces.},
  topics={Unsupervised},cat={C},
}
3461
@inproceedings{NIPS2005_539,
  author   = {Bengio, Yoshua and Larochelle, Hugo and Vincent, Pascal},
  title    = {Non-Local Manifold Parzen Windows},
  year     = {2006},
  crossref = {NIPS18-shorter},
  abstract = {To escape from the curse of dimensionality, we claim that one can learn
non-local functions, in the sense that the value and shape of the learned
function at x must be inferred using examples that may be far from x.
With this objective, we present a non-local non-parametric density estimator.
It builds upon previously proposed Gaussian mixture models with
regularized covariance matrices to take into account the local shape of
the manifold. It also builds upon recent work on non-local estimators of
the tangent plane of a manifold, which are able to generalize in places
with little training data, unlike traditional, local, non-parametric models.},
  topics={HighDimensional,Kernel,Unsupervised},cat={C},
}

@inproceedings{NIPS2005_583,
  author   = {Bengio, Yoshua and Le Roux, Nicolas and Vincent, Pascal and Delalleau, Olivier and Marcotte, Patrice},
  title    = {Convex Neural Networks},
  year     = {2006},
  pages    = {123--130},
  crossref = {NIPS18-shorter},
  abstract = {Convexity has recently received a lot of attention in the machine learning
community, and the lack of convexity has been seen as a major disadvantage
of many learning algorithms, such as multi-layer artificial neural
networks. We show that training multi-layer neural networks in which the
number of hidden units is learned can be viewed as a convex optimization
problem. This problem involves an infinite number of variables, but can be
solved by incrementally inserting a hidden unit at a time, each time finding
a linear classifier that minimizes a weighted sum of errors.},
  topics={Boosting},cat={C},
}

@inproceedings{NIPS2005_663,
  author   = {Rivest, Fran{\c c}ois and Bengio, Yoshua and Kalaska, John},
  title    = {Brain Inspired Reinforcement Learning},
  year     = {2005},
  pages    = {1129--1136},
  crossref = {NIPS17-shorter},
  abstract = {Successful application of reinforcement learning algorithms often involves considerable hand-crafting of the necessary non-linear features to reduce the complexity of the value functions and hence to promote convergence of the algorithm. In contrast, the human brain readily and autonomously finds the complex features when provided with sufficient training. Recent work in machine learning and neurophysiology has demonstrated the role of the basal ganglia and the frontal cortex in mammalian reinforcement learning. This paper develops and explores new reinforcement learning algorithms inspired by neurological evidence that provides potential new approaches to the feature construction problem. The algorithms are compared and evaluated on the Acrobot task.},
  topics={BioRules},cat={C},
}

@incollection{NIPS2005_691,
  author   = {Bengio, Yoshua and Monperrus, Martin},
  title    = {Non-Local Manifold Tangent Learning},
  year     = {2005},
  pages    = {129--136},
  crossref = {NIPS17-shorter},
  abstract = {We claim and present arguments to the effect that a large class of manifold learning algorithms that are essentially local and can be framed as kernel learning algorithms will suffer from the curse of dimensionality, at the dimension of the true underlying manifold. This observation suggests to explore non-local manifold learning algorithms which attempt to discover shared structure in the tangent planes at different positions. A criterion for such an algorithm is proposed and experiments estimating a tangent plane prediction function are presented, showing its advantages with respect to local manifold learning algorithms: it is able to generalize very far from training data (on learning handwritten character image rotations), where a local non-parametric method fails.},
  topics={HighDimensional,Unsupervised},cat={C},
}

@inproceedings{NIPS2005_874,
  author   = {K{\'{e}}gl, Bal{\'{a}}zs},
  title    = {Generalization Error and Algorithmic Convergence of Median Boosting},
  year     = {2005},
  pages    = {657--664},
  crossref = {NIPS17-shorter},
  abstract = {We have recently proposed an extension of ADABOOST to regression that uses the median of the base regressors as the final regressor. In this paper we extend theoretical results obtained for ADABOOST to median boosting and to its localized variant. First, we extend recent results on efficient margin maximizing to show that the algorithm can converge to the maximum achievable margin within a preset precision in a finite number of steps. Then we provide confidence-interval-type bounds on the generalization error.},
  topics={Boosting},cat={C},
}
3525
@inproceedings{NIPS2007-56,
  author   = {Le Roux, Nicolas and Manzagol, Pierre-Antoine and Bengio, Yoshua},
  title    = {Topmoumoute online natural gradient algorithm},
  year     = {2008},
  crossref = {NIPS20-shorter},
  abstract = {Guided by the goal of obtaining an optimization algorithm that is both fast and yielding good generalization, we study the descent direction maximizing the decrease in generalization error or the probability of not increasing generalization error. The surprising result is that from both the Bayesian and frequentist perspectives this can yield the natural gradient direction. Although that direction can be very expensive to compute we develop an efficient, general, online approximation to the natural gradient descent which is suited to large scale problems. We report experimental results showing much faster convergence in computation time and in number of iterations with TONGA (Topmoumoute Online natural Gradient Algorithm) than with stochastic gradient descent, even on very large datasets.},
}

@inproceedings{NIPS2007-812,
  author   = {Chapados, Nicolas and Bengio, Yoshua},
  title    = {Augmented Functional Time Series Representation and Forecasting with Gaussian Processes},
  year     = {2008},
  pages    = {265--272},
  crossref = {NIPS20-shorter},
  abstract = {We introduce a functional representation of time series which allows forecasts to be performed over an unspecified horizon with progressively-revealed information sets. By virtue of using Gaussian processes, a complete covariance matrix between forecasts at several time-steps is available. This information is put to use in an application to actively trade price spreads between commodity futures contracts. The approach delivers impressive out-of-sample risk-adjusted returns after transaction costs on a portfolio of 30 spreads.},
}
3542
@inproceedings{NIPS2007-925,
  author   = {Le Roux, Nicolas and Bengio, Yoshua and Lamblin, Pascal and Joliveau, Marc and K{\'{e}}gl, Bal{\'{a}}zs},
  title    = {Learning the 2-D Topology of Images},
  year     = {2008},
  pages    = {841--848},
  crossref = {NIPS20-shorter},
  abstract = {We study the following question: is the two-dimensional structure of images a very strong prior or is it something that can be learned with a few examples of natural images? If someone gave us a learning task involving images for which the two-dimensional topology of pixels was not known, could we discover it automatically and exploit it? For example suppose that the pixels had been permuted in a fixed but unknown way, could we recover the relative two-dimensional location of pixels on images? The surprising result presented here is that not only the answer is yes but that about as few as a thousand images are enough to approximately recover the relative locations of about a thousand pixels. This is achieved using a manifold learning algorithm applied to pixels associated with a measure of distributional similarity between pixel intensities. We compare different topology-extraction approaches and show how having the two-dimensional topology can be exploited.},
}
3551
@inproceedings{NIPS21,
  editor    = {Koller, D. and Schuurmans, Dale and Bengio, Yoshua and Bottou, {L{\'{e}}on}},
  title     = {Advances in Neural Information Processing Systems 21 (NIPS'08)},
  booktitle = {Advances in Neural Information Processing Systems 21 (NIPS'08)},
  year      = {-1},
  publisher = {NIPS Foundation (http://books.nips.cc)},
}

@inproceedings{NIPS21-short,
  editor    = {Koller, D. and Schuurmans, Dale and Bengio, Yoshua and Bottou, {L{\'{e}}on}},
  title     = {Advances in Neural Information Processing Systems 21 (NIPS'08)},
  booktitle = {NIPS 21},
  year      = {-1},
  publisher = {NIPS Foundation (http://books.nips.cc)},
}
3567
3568
@inproceedings{NIPS22-short,
  editor    = {Bengio, Yoshua and Schuurmans, Dale and Williams, Christopher and Lafferty, John and Culotta, Aron},
  title     = {Advances in Neural Information Processing Systems 22 (NIPS'09)},
  booktitle = {NIPS 22},
  year      = {-1},
}
3575
3576
@inproceedings{NIPS3,
  editor    = {Lippmann, R. P. and Moody, J. E. and Touretzky, D. S.},
  title     = {Advances in Neural Information Processing Systems 3 (NIPS'90)},
  booktitle = {Advances in Neural Information Processing Systems 3 (NIPS'90)},
  year      = {-1},
  publisher = {Morgan Kaufmann},
}

@inproceedings{NIPS3-short,
  editor    = {Lippmann, R. P. and Moody, J. E. and Touretzky, D. S.},
  title     = {Advances in Neural Information Processing Systems 3 (NIPS'90)},
  booktitle = {NIPS 3},
  year      = {-1},
  publisher = {Morgan Kaufmann},
}
3592
3593
@inproceedings{NIPS4-short,
  editor    = {Moody, J. E. and Hanson, S. J. and Lippmann, R. P.},
  title     = {Advances in Neural Information Processing Systems 4 (NIPS'91)},
  booktitle = {NIPS 4},
  year      = {-1},
  publisher = {Morgan Kaufmann},
}
3601
3602
@inproceedings{NIPS5,
  editor    = {Giles, C.L. and Hanson, S. J. and Cowan, J. D.},
  title     = {Advances in Neural Information Processing Systems 5 (NIPS'92)},
  booktitle = {Advances in Neural Information Processing Systems 5 (NIPS'92)},
  year      = {-1},
  publisher = {Morgan Kaufmann},
}

@inproceedings{NIPS5-short,
  editor    = {Giles, C.L. and Hanson, S. J. and Cowan, J. D.},
  title     = {Advances in Neural Information Processing Systems 5 (NIPS'92)},
  booktitle = {NIPS 5},
  year      = {-1},
  publisher = {Morgan Kaufmann},
}

@inproceedings{NIPS6-short,
  editor    = {Cowan, J. D. and Tesauro, G. and Alspector, J.},
  title     = {Advances in Neural Information Processing Systems 6 (NIPS'93)},
  booktitle = {NIPS 6},
  year      = {-1},
  publisher = {MIT Press},
}

@inproceedings{NIPS7-short,
  editor    = {Tesauro, G. and Touretzky, D. S. and Leen, T. K.},
  title     = {Advances in Neural Information Processing Systems 7 (NIPS'94)},
  booktitle = {NIPS 7},
  year      = {-1},
  publisher = {MIT Press},
}

@inproceedings{NIPS8-short,
  editor    = {Touretzky, D. S. and Mozer, M. and Hasselmo, M.E.},
  title     = {Advances in Neural Information Processing Systems 8 (NIPS'95)},
  booktitle = {NIPS 8},
  year      = {-1},
  publisher = {MIT Press},
}

@inproceedings{NIPS9-short,
  editor    = {Mozer, M. and Jordan, M.I. and Petsche, T.},
  title     = {Advances in Neural Information Processing Systems 9 (NIPS'96)},
  booktitle = {NIPS 9},
  year      = {-1},
  publisher = {MIT Press},
}
3654
3655
@inproceedings{nnlm:2001:nips,
  author   = {Bengio, Yoshua and Ducharme, R{\'{e}}jean and Vincent, Pascal},
  title    = {A Neural Probabilistic Language Model},
  year     = {2001},
  crossref = {NIPS13-shorter},
  abstract = {A goal of statistical language modeling is to learn the joint probability function of sequences of words. This is intrinsically difficult because of the curse of dimensionality: we propose to fight it with its own weapons. In the proposed approach one learns simultaneously (1) a distributed representation for each word (i.e. a similarity between words) along with (2) the probability function for word sequences, expressed with these representations. Generalization is obtained because a sequence of words that
has never been seen before gets high probability if it is made of words that are similar to words forming an already seen sentence. We report on experiments using neural networks for the probability function, showing on two text corpora that the proposed approach very significantly improves on a state-of-the-art trigram model.},
  topics={Markov,Unsupervised,Language},cat={C},
}

@inproceedings{nsvn:2000:ijcnn,
  author    = {Vincent, Pascal and Bengio, Yoshua},
  title     = {A Neural Support Vector Network Architecture with Adaptive Kernels},
  booktitle = {International Joint Conference on Neural Networks 2000},
  volume    = {V},
  year      = {2000},
  pages     = {187--192},
  url       = {http://www.iro.umontreal.ca/~lisa/pointeurs/nsvn.pdf},
  abstract  = {In the Support Vector Machines ({SVM}) framework, the positive-definite kernel can be seen as representing a fixed similarity measure between two patterns, and a discriminant function is obtained by taking a linear combination of the kernels computed at training examples called support vectors. Here we investigate learning architectures in which the kernel functions can be replaced by more general similarity measures that can have arbitrary internal parameters. The training criterion used in {SVM}s is not appropriate for this purpose so we adopt the simple criterion that is generally used when training neural networks for classification tasks. Several experiments are performed which show that such Neural Support Vector Networks perform similarly to {SVM}s while requiring significantly fewer support vectors, even when the similarity measure has no internal parameters.},
  topics={Kernel},cat={C},
}
3677
3678 @INPROCEEDINGS{Ouimet+al-2005,
3679 author = {Ouimet, Marie and Bengio, Yoshua},
3680 editor = {Cowell, Robert G. and Ghahramani, Zoubin},
3681 title = {Greedy Spectral Embedding},
3682 booktitle = {Proceedings of the Tenth International Workshop on Artificial Intelligence and Statistics},
3683 year = {2005},
3684 pages = {253--260},
3685 publisher = {Society for Artificial Intelligence and Statistics},
3686 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/greedy-kernel-aistats05.pdf},
3687 abstract = {Spectral dimensionality reduction methods and spectral clustering methods require computation of the principal eigenvectors of an n X n matrix where n is the number of examples. Following up on previously proposed techniques to speed-up kernel methods by focusing on a subset of m examples, we study a greedy selection procedure for this subset, based on the feature space distance between a candidate example and the span of the previously chosen ones. In the case of kernel {PCA} or spectral clustering this reduces computation to O(m^2 n). For the same computational complexity, we can also compute the feature space projection of the non-selected examples on the subspace spanned by the selected examples, to estimate the embedding function based on all the data, which yields considerably better estimation of the embedding function. This algorithm can be formulated in an online setting and we can bound the error on the approximation of the Gram matrix.},
3688 topics={HighDimensional,kernel},cat={C},
3689 }
3690
3691 @MASTERSTHESIS{Ouimet-Msc-2004,
3692 author = {Ouimet, Marie},
3693 keywords = {algorithmes voraces., apprentissage non-supervis{\'{e}}, m{\'{e}}thodes spectrales, noyaux, r{\'{e}}duction de dimensionnalit{\'{e}}},
3694 title = {R{\'{e}}duction de dimensionnalit{\'{e}} non lin{\'{e}}aire et vorace},
3695 year = {2004},
3696 school = {Universit{\'{e}} de Montr{\'{e}}al},
3697 abstract = {Les m{\'{e}}thodes spectrales de r{\'{e}}duction de dimensionnalit{\'{e}} et les m{\'{e}}thodes de segmentation spectrale exigent le calcul des vecteurs propres principaux d'une matrice de taille n x n o{\`{u}} n est le nombre d'exemples. Des techniques ont {\'{e}}t{\'{e}} propos{\'{e}}es dans la litt{\'{e}}rature pour acc{\'{e}}l{\'{e}}rer les m{\'{e}}thodes {\`{a}} noyau en se concentrant sur un sous-ensemble de m exemples. Nous proposons une proc{\'{e}}dure vorace pour la s{\'{e}}lection de ce sous-ensemble, qui est bas{\'{e}}e sur la distance dans l'espace des caract{\'{e}}ristiques entre un exemple candidat et le sous-espace g{\'{e}}n{\'{e}}r{\'{e}} par les exemples pr{\'{e}}c{\'{e}}demment choisis. Dans le cas de l'ACP {\`{a}} noyau ou de la segmentation spectrale, nous obtenons un algorithme en O(m*m*n), o{\`{u}} m << n, qui, contrairement aux techniques pr{\'{e}}c{\'{e}}demment propos{\'{e}}es, peut se formuler de fa{\c c}on en-ligne. Pour la m{\^{e}}me complexit{\'{e}} en temps, nous pouvons {\'{e}}galement calculer la projection des exemples non choisis sur le sous-espace engendr{\'{e}} par les exemples choisis dans l'espace des caract{\'{e}}ristiques. En repr{\'{e}}sentant ainsi les exemples par leur projection nous obtenons une approximation de plus faible rang de la matrice de Gram sur toutes les donn{\'{e}}es. Nous pouvons {\'{e}}galement borner l'erreur correspondant {\`{a}} cette approximation de la matrice de Gram.}
3698 }
3699
3700 @ARTICLE{paiement+bengio+eck:aij,
3701 author = {Paiement, Jean-Fran{\c c}ois and Bengio, Samy and Eck, Douglas},
3702 title = {Probabilistic Models for Melodic Prediction},
3703 journal = {Artificial Intelligence Journal},
3704 volume = {173},
3705 year = {2009},
3706 pages = {1266-1274},
3707 source={OwnPublication},
3708 sourcetype={Journal},
3709 }
3710
3711 @INPROCEEDINGS{paiement+eck+bengio+barber:icml2005,
3712 author = {Paiement, Jean-Fran{\c c}ois and Eck, Douglas and Bengio, Samy and Barber, D.},
3713 title = {A graphical model for chord progressions embedded in a psychoacoustic space},
3714 year = {2005},
3715 pages = {641--648},
3716 publisher = {ACM Press},
3717 crossref = {ICML05},
3718 source={OwnPublication},
3719 sourcetype={Conference},
3720 }
3721
3722 @INPROCEEDINGS{paiement+eck+bengio:ccai2006,
3723 author = {Paiement, Jean-Fran{\c c}ois and Eck, Douglas and Bengio, Samy},
3724 editor = {Lamontagne, Luc and Marchand, Mario},
3725 title = {Probabilistic Melodic Harmonization},
3726 booktitle = {Canadian Conference on AI},
3727 series = {Lecture Notes in Computer Science},
3728 volume = {4013},
3729 year = {2006},
3730 pages = {218-229},
3731 publisher = {Springer},
3732 source={OwnPublication},
3733 sourcetype={Conference},
3734 }
3735
3736 @INPROCEEDINGS{paiement+eck+bengio:ismir2005,
3737 author = {Paiement, Jean-Fran{\c c}ois and Eck, Douglas and Bengio, Samy},
3738 title = {A Probabilistic Model for Chord Progressions},
3739 booktitle = {{Proceedings of the 6th International Conference on Music Information Retrieval ({ISMIR} 2005)}},
3740 year = {2005},
3741 pages = {312-319},
3742 source={OwnPublication},
3743 sourcetype={Conference},
3744 }
3745
3746 @INPROCEEDINGS{paiement+grandvalet+bengio+eck:icml2008,
3747 author = {Paiement, Jean-Fran{\c c}ois and Grandvalet, Yves and Bengio, Samy and Eck, Douglas},
3748 title = {A generative model for rhythms},
3749 year = {2008},
3751 crossref = {ICML06-shorter},
3752 source={OwnPublication},
3753 sourcetype={Conference},
3754 }
3755
3756 @UNPUBLISHED{paiement+grandvalet+bengio+eck:nipsworkshop2007,
3757 author = {Paiement, Jean-Fran{\c c}ois and Grandvalet, Yves and Bengio, Samy and Eck, Douglas},
3758 title = {A generative model for rhythms},
3759 year = {2007},
3760 note = {NIPS 2007 Workshop on Music, Brain and Cognition},
3761 source={OwnPublication},
3762 sourcetype={Workshop},
3763 optkey={""},
3764 optmonth={""},
3765 optannote={""},
3766 }
3767
3768 @MASTERSTHESIS{Paiement-Msc-2003,
3769 author = {Paiement, Jean-Fran{\c c}ois},
3770 keywords = {algorithmes, apprentissage, apprentissage non supervis{\'{e}}, forage de donn{\'{e}}es, noyaux, r{\'{e}}duction de dimensions, statistique, Statistiques},
3771 title = {G{\'{e}}n{\'{e}}ralisation d'algorithmes de r{\'{e}}duction de dimension},
3772 year = {2003},
3773 school = {Universit{\'{e}} de Montr{\'{e}}al},
3774 abstract = {On pr{\'{e}}sente tout d'abord la notion de vari{\'{e}}t{\'{e}} comme r{\'{e}}gion de faible dimension contenant des observations situ{\'{e}}es dans un espace de haute dimension. Cette d{\'{e}}finition justifie l'{\'{e}}laboration d'algorithmes permettant d'exprimer les donn{\'{e}}es dans un syst{\`{e}}me de coordonn{\'{e}}es de dimensions {\'{e}}gale {\`{a}} celle de la vari{\'{e}}t{\'{e}} sur laquelle les donn{\'{e}}es sont approximativement situ{\'{e}}es.
3775 La notion de noyau comme mesure de similarit{\'{e}} est par la suite formalis{\'{e}}e. On constate que l'application d'un noyau {\`{a}} deux observations correspond {\`{a}} l'{\'{e}}valuation d'un produit scalaire dans un espace de Hilbert appel{\'{e}} espace de caract{\'{e}}ristiques.
3776 Une m{\'{e}}thode de r{\'{e}}duction de dimension lin{\'{e}}aire est expos{\'{e}}e ainsi que ses limites. Des algorithmes non lin{\'{e}}aires de r{\'{e}}duction de dimension et de segmentation permettent de s'affranchir de ces limites. Ces derniers ne fournissent cependant pas d'extension directe {\`{a}} des points hors {\'{e}}chantillon.
3777 L'{\'{e}}tape fondamentale au sein des algorithmes pr{\'{e}}sent{\'{e}}s est la solution d'un syst{\`{e}}me de vecteurs propres d'une matrice sym{\'{e}}trique cr{\'{e}}{\'{e}}e {\`{a}} partir d'un noyau d{\'{e}}pendant des donn{\'{e}}es. On con{\c c}oit ce probl{\`{e}}me comme le fait de trouver les fonctions propres d'un op{\'{e}}rateur lin{\'{e}}aire d{\'{e}}fini {\`{a}} partir du m{\^{e}}me noyau. On utilise alors la formulation de Nystr{\"{o}}m, pr{\'{e}}sente dans l'espace en composantes principales {\`{a}} noyaux, afin de r{\'{e}}duire la dimension des points hors {\'{e}}chantillon sur la base des plongements obtenus {\`{a}} l'aide des algorithmes d{\'{e}}j{\`{a}} mentionn{\'{e}}s.
3778 La qualit{\'{e}} de la projection g{\'{e}}n{\'{e}}r{\'{e}}e est compar{\'{e}}e {\`{a}} la perturbation intrins{\`{e}}que des algorithmes si on substitue certaine observations par d'autres tir{\'{e}}es de la m{\^{e}}me distribution.}
3779 }
3780
3781 DISABLED duplicate of perez+gers+schmidhuber+eck:2003 (key/year mismatch, missing pages) -- ARTICLE{perez+gers+schmidhuber+eck:2002,
3782 author = {Perez-Ortiz, J. A. and Gers, F. A. and Eck, Douglas and Schmidhuber, Juergen},
3783 title = {{K}alman filters improve {LSTM} network performance in problems unsolvable by traditional recurrent nets},
3784 journal = {Neural Networks},
3785 volume = {16},
3786 number = {2},
3787 year = {2003},
3788 abstract = {The Long Short-Term Memory ({LSTM}) network trained by gradient descent solves difficult problems which traditional recurrent neural networks in general cannot. We have recently observed that the decoupled extended Kalman filter training algorithm allows for even better performance, reducing significantly the number of training steps when compared to the original gradient descent training algorithm. In this paper we present a set of experiments which are unsolvable by classical recurrent networks but which are solved elegantly and robustly and quickly by {LSTM} combined with Kalman filters.},
3789 source={OwnPublication},
3790 sourcetype={Journal},
3791 }
3792
3793 @ARTICLE{perez+gers+schmidhuber+eck:2003,
3794 author = {Perez-Ortiz, J. A. and Gers, F. A. and Eck, Douglas and Schmidhuber, Juergen},
3795 title = {{K}alman filters improve {LSTM} network performance in problems unsolvable by traditional recurrent nets},
3796 journal = {Neural Networks},
3797 volume = {16},
3798 number = {2},
3799 year = {2003},
3800 pages = {241--250},
3801 abstract = {The Long Short-Term Memory ({LSTM}) network trained by gradient descent solves difficult problems which traditional recurrent neural networks in general cannot. We have recently observed that the decoupled extended Kalman filter training algorithm allows for even better performance, reducing significantly the number of training steps when compared to the original gradient descent training algorithm. In this paper we present a set of experiments which are unsolvable by classical recurrent networks but which are solved elegantly and robustly and quickly by {LSTM} combined with Kalman filters.},
3802 source={OwnPublication},
3803 sourcetype={Journal},
3804 }
3805
3806 @INPROCEEDINGS{perez+schmidhuber+gers+eck:icannB2002,
3807 author = {Perez-Ortiz, J. A. and Schmidhuber, Juergen and Gers, F. A. and Eck, Douglas},
3808 editor = {Dorronsoro, J.},
3809 title = {Improving Long-Term Online Prediction with {Decoupled Extended Kalman Filters}},
3810 booktitle = {{Artificial Neural Networks -- ICANN 2002 (Proceedings)}},
3811 year = {2002},
3812 pages = {1055--1060},
3813 publisher = {Springer},
3814 abstract = {Long Short-Term Memory ({LSTM}) recurrent neural networks ({RNN}s) outperform traditional {RNN}s when dealing with sequences involving not only short-term but also long-term dependencies. The decoupled extended Kalman filter learning algorithm ({DEKF}) works well in online environments and reduces significantly the number of training steps when compared to the standard gradient-descent algorithms. Previous work on {LSTM}, however, has always used a form of gradient descent and has not focused on true online situations. Here we combine {LSTM} with {DEKF} and show that this new hybrid improves upon the original learning algorithm when applied to online processing.},
3815 source={OwnPublication},
3816 sourcetype={Conference},
3817 }
3818
3819 @TECHREPORT{Pigeon-Bengio-96-aH-TR,
3820 author = {Pigeon, Steven and Bengio, Yoshua},
3821 title = {A Memory-Efficient Huffman Adaptive Coding Algorithm for Very Large Sets of Symbols},
3822 number = {\#1081},
3823 year = {1997},
3824 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
3825 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/HuffAdapt.pdf},
3826 abstract = {The problem of computing the minimum redundancy codes as we observe symbols one by one has received a lot of attention. However, existing algorithm implicitly assumes that either we have a small alphabet — quite typically 256 symbols — or that we have an arbitrary amount of memory at our disposal for the creation of the tree. In real life applications one may need to encode symbols coming from a much larger alphabet, for e.g. coding integers. We now have to deal not with hundreds of symbols but possibly with millions of symbols. While other algorithms use a space proportional to the number of observed symbol, we here propose one that uses space proportional to the number of frequency classes, which is, quite interestingly, always smaller or equal to the number of observed symbols.},
3827 topics={Compression},cat={T},
3828 }
3829
3830 @INPROCEEDINGS{Pigeon-dcc98,
3831 author = {Pigeon, Steven and Bengio, Yoshua},
3832 editor = {{IEEE Computer Society}},
3833 title = {A Memory-Efficient Adaptive Huffman Coding Algorithm for Very Large Sets of Symbols},
3834 booktitle = {Data Compression Conference},
3835 year = {1998},
3836 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/dcc98.pdf},
3837 abstract = {The problem of computing the minimum redundancy codes as we observe symbols one by one has received a lot of attention. However, existing algorithms implicitly assumes that either we have a small alphabet — quite typically 256 symbols — or that we have an arbitrary amount of memory at our disposal for the creation of the tree. In real life applications one may need to
3838 encode symbols coming from a much larger alphabet, for e.g. coding integers. We now have to deal not with hundreds of symbols but possibly with millions of symbols. While other algorithms use a space proportional to the number of observed symbols, we here propose one that uses space proportional to the number of frequency classes, which is, quite interestingly, always smaller or equal to the size of the alphabet.},
3839 topics={Compression},cat={C},
3840 }
3841
3842 @INPROCEEDINGS{Pigeon-dcc99,
3843 author = {Pigeon, Steven and Bengio, Yoshua},
3844 editor = {{IEEE Computer Society}},
3845 title = {Binary Pseudowavelets and Applications to Bilevel Image Processing},
3846 booktitle = {Data Compression Conference},
3847 year = {1999},
3848 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/dcc99.pdf},
3849 abstract = {This paper shows the existance of binary pseudowavelets, bases on the binary domain that exhibit some of the properties of wavelets, such as multiresolution reconstruction and compact support. The binary pseudowavelets are defined on _n (binary vectors of length n) and are operated upon with the binary operators logical and and exclusive or. The forward transform, or analysis, is the decomposition of a binary vector into its constituant binary pseudowavelets. Binary pseudowavelets allow multiresolution, progressive reconstruction of binary vectors by using progressively more coefficients in the inverse transform. Binary pseudowavelets bases, being sparse matrices, also provide for fast transforms; moreover pseudowavelets rely on hardware-friendly operations for efficient software and hardware implementation.},
3850 topics={Compression},cat={C},
3851 }
3852
3853 @TECHREPORT{Pigeon-Huffman-TR98,
3854 author = {Pigeon, Steven and Bengio, Yoshua},
3855 title = {A Memory-Efficient Adaptive Huffman Coding for Very Large Sets of Symbols revisited},
3856 number = {1095},
3857 year = {1998},
3858 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
3859 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TechRep_AdaptativeHuffman2.pdf},
3860 abstract = {While algorithm M (presented in A Memory-Efficient Huffman Adaptive Coding Algorithm for Very Large Sets of Symbols, by Steven Pigeon & Yoshua Bengio, Universit{\'{e}} de Montr{\'{e}}al technical report #1081 [1]) converges to the entropy of the signal, it also assumes that the characteristics of the signal are stationary, that is, that they do not change over time and that successive adjustments, ever decreasing in their magnitude, will lead to a reasonable approximation of the entropy. While this is true for some data, it is clearly not true for some other. We present here a modification of the M algorithm that allows negative updates. Negative updates are used to maintain a window over the source. Symbols enter the window at its right and will leave it at its left, after w steps (the window width). The algorithm presented here allows us to update correctly the weights of the symbols in the symbol tree. Here, we will also have negative migration or demotion, while we only had positive migration or promotion in M. This algorithm will be called M+.},
3861 topics={Compression},cat={T},
3862 }
3863
3864 @PHDTHESIS{Pigeon-Phd-2001,
3865 author = {Pigeon, Steven},
3866 keywords = {algorithmes, codes adaptatifs, codes de Golomb, codes universels, Compression de donn{\'{e}}es, compression LZ78, LZW, ondelettes, pseudo-ondelettes},
3867 title = {Contributions {\`{a}} la compression de donn{\'{e}}es},
3868 year = {2001},
3869 school = {Universit{\'{e}} de Montr{\'{e}}al},
3870 abstract = {L'objectif de cette th{\`{e}}se est de pr{\'{e}}senter nos contributions {\`{a}} la compression de donn{\'{e}}es. Le texte entier n'est pas consacr{\'{e}} {\`{a}} nos seules contributions. Une large part est consacr{\'{e}}e au mat{\'{e}}riel introductif et {\`{a}} la recension de litt{\'{e}}rature sur les sujets qui sont pertinents {\`{a}} nos contributions. Le premier chapitre de contribution, le chapitre "Contribution au codage des entiers" se concentre sur le probl{\`{e}}me de la g{\'{e}}n{\'{e}}ration de codes efficaces pour les entiers. Le chapitre "Codage Huffman Adaptatif" pr{\'{e}}sente deux nouveaux algorithmes pour la g{\'{e}}n{\'{e}}ration dynamique de codes structur{\'{e}}s en arbre, c'est-{\`{a}}-dire des codes de type Huffman. Le chapitre "LZW avec une perte" explore le probl{\`{e}}me de la compression d'images comportant un petit nombre de couleurs distinctes et propose une extension avec perte d'un algorithme originalement sans perte, LZW. Enfin, le dernier chapitre de contribution, le chapitre "Les pseudo-ondelettes binaires" pr{\'{e}}sente une solution original au probl{\`{e}}me de l'analyse multir{\'{e}}solution des images monochromes, c'est-{\`{a}}-dire des images n'ayant que deux couleurs, conventionnellement noir et blanc. Ce type d'image correspond par exemple aux images textuelles telle que produites par un processus de transmission de type facsimil{\'{e}}.}
3871 }
3872
3873 @ARTICLE{Pigeon98,
3874 author = {Pigeon, Steven and Bengio, Yoshua},
3875 title = {Memory-Efficient Adaptive Huffman Coding},
3876 journal = {Dr. Dobb's Journal},
3877 volume = {290},
3878 year = {1998},
3879 pages = {131--135},
3880 topics={Compression},cat={J},
3881 }
3882
3883 @INPROCEEDINGS{probnn:2000:ijcnn,
3884 author = {Bengio, Yoshua},
3885 title = {Probabilistic Neural Network Models for Sequential Data},
3886 booktitle = {International Joint Conference on Neural Networks 2000},
3887 volume = {V},
3888 year = {2000},
3889 pages = {79--84},
3890 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/81_01.PDF},
3891 abstract = {It has already been shown how Artificial Neural Networks ({ANN}s) can be incorporated into probabilistic models.
3892 In this paper we review some of the approaches which have been proposed to incorporate them into probabilistic
3893 models of sequential data, such as Hidden {Markov} Models ({HMM}s). We also discuss new developments and new
3894 ideas in this area, in particular how {ANN}s can be used to model high-dimensional discrete and continuous data to
3895 deal with the curse of dimensionality, and how the ideas proposed in these models could be applied to statistical
3896 language modeling to represent longer-term context than allowed by trigram models, while keeping word-order
3897 information.},
3898 topics={Markov},cat={C},
3899 }
3900
3901 @UNPUBLISHED{pugin+burgoyne+eck+fujinaga:nipsworkshop2007,
3902 author = {Pugin, L. and Burgoyne, J. A. and Eck, Douglas and Fujinaga, I.},
3903 title = {Book-adaptive and book-dependant models to accelerate digitalization of early music},
3904 year = {2007},
3905 note = {NIPS 2007 Workshop on Music, Brain and Cognition},
3906 source={OwnPublication},
3907 sourcetype={Workshop},
3908 optkey={""},
3909 optmonth={""},
3910 optannote={""},
3911 }
3912
3913 @INPROCEEDINGS{Rahim-97,
3914 author = {Rahim, Mazin and Bengio, Yoshua and {LeCun}, Yann},
3915 title = {Discriminative feature and model design for automatic speech recognition},
3916 booktitle = {Proceedings of Eurospeech 1997},
3917 year = {1997},
3918 pages = {75--78},
3919 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/rahim-bengio-lecun-97.ps.gz},
3920 abstract = {A system for discriminative feature and model design is presented for automatic speech recognition. Training based on minimum classification error with a single objective function is applied for designing a set of parallel networks performing feature transformation and a set of hidden {Markov} models performing speech recognition. This paper compares the use of linear and non-linear functional transformations when applied to conventional recognition features, such as spectrum or cepstrum. It also provides a framework for integrated feature and model training when using class-specific transformations. Experimental results on telephone-based connected digit recognition are presented.},
3921 topics={Speech},cat={C},
3922 }
3923
3924 @ARTICLE{Rivest-2009,
3925 author = {Rivest, Fran{\c c}ois and Kalaska, John and Bengio, Yoshua},
3926 title = {Alternative Time Representations in Dopamine Models},
3927 journal = {Journal of Computational Neuroscience},
3928 volume = {28},
3929 number = {1},
3930 year = {2009},
3931 pages = {107--130},
3932 abstract = {Dopaminergic neuron activity has been modeled during learning and appetitive behavior, most commonly using the temporal-difference (TD) algorithm. However, a proper representation of elapsed time and of the exact task is usually required for the model to work. Most models use timing elements such as delay-line representations of time that are not biologically realistic for intervals in the range of seconds. The interval-timing literature provides several alternatives. One of them is that timing could emerge from general network dynamics, instead of coming from a dedicated circuit. Here, we present a general rate-based learning model based on long short-term memory ({LSTM}) networks that learns a time representation when needed. Using a na{\"{\i}}ve network learning its environment in conjunction with TD, we reproduce dopamine activity in appetitive trace conditioning with a constant CS-US interval, including probe trials with unexpected delays. The proposed model learns a representation of the environment dynamics in an adaptive biologically plausible framework, without recourse to delay lines or other special-purpose circuits. Instead, the model predicts that the task-dependent representation of time is learned by experience, is encoded in ramp-like changes in single-neuron activity distributed across small neural networks, and reflects a temporal integration mechanism resulting from the inherent dynamics of recurrent loops within the network. The model also reproduces the known finding that trace conditioning is more difficult than delay conditioning and that the learned representation of the task can be highly dependent on the types of trials experienced during training. Finally, it suggests that the phasic dopaminergic signal could facilitate learning in the cortex.}
3933 }
3934
3935 @ARTICLE{schmidhuber+gers+eck:2002,
3936 author = {Schmidhuber, Juergen and Gers, F. A. and Eck, Douglas},
3937 title = {Learning Nonregular Languages: A Comparison of Simple Recurrent Networks and {LSTM}},
3938 journal = {Neural Computation},
3939 volume = {14},
3940 number = {9},
3941 year = {2002},
3942 pages = {2039--2041},
3943 abstract = {In response to Rodriguez' recent article (Rodriguez 2001) we compare the performance of simple recurrent nets and {\em ``Long Short-Term Memory''} ({LSTM}) recurrent nets on context-free and context-sensitive languages.},
3944 source={OwnPublication},
3945 sourcetype={Journal},
3946 }
3947
3948 @TECHREPORT{Schwenk-Bengio-97-TR,
3949 author = {Schwenk, Holger and Bengio, Yoshua},
3950 title = {Adaptive Boosting of Neural Networks for Character Recognition},
3951 number = {\#1072},
3952 year = {1997},
3953 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
3954 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/AdaBoostTR.pdf},
3955 abstract = {``Boosting'' is a general method for improving the performance of any learning algorithm that consistently generates classifiers which need to perform only slightly better than random guessing. A recently proposed and very promising boosting algorithm is AdaBoost [5]. It has been applied with great success to several benchmark machine learning problems using rather simple learning algorithms [4], in particular decision trees [1, 2, 6]. In this paper we use AdaBoost to improve the performances of neural networks applied to character recognition tasks. We compare training methods based on sampling the training set and weighting the cost function. Our system achieves about 1.4\% error on a data base of online handwritten digits from more than 200 writers. Adaptive boosting of a multi-layer network achieved 2\% error on the UCI Letters offline characters data set.},
3956 topics={Boosting,Speech},cat={T},
3957 }
3958
3959 @INPROCEEDINGS{Schwenk-nips10,
3960 author = {Schwenk, Holger and Bengio, Yoshua},
3961 title = {Training Methods for Adaptive Boosting of Neural Networks for Character Recognition},
3962 year = {1998},
3963 crossref = {NIPS10-shorter},
3964 abstract = {``Boosting'' is a general method for improving the performance of any learning algorithm that consistently generates classifiers which need to perform only slightly better than random guessing. A recently proposed and very promising boosting algorithm is AdaBoost [5]. It has been applied with great success to several benchmark machine learning problems using rather simple learning algorithms [4], in particular decision trees [1, 2, 6]. In this paper we use AdaBoost to improve the performances of neural networks applied to character recognition tasks. We compare training methods based on sampling the training set and weighting the cost function. Our system achieves about 1.4\% error on a data base of online handwritten digits from more than 200 writers. Adaptive boosting of a multi-layer network achieved 2\% error on the UCI Letters offline characters data set.},
3965 topics={Boosting,Speech},cat={C},
3966 }
3967
3968 @ARTICLE{Schwenk2000,
3969 author = {Schwenk, Holger and Bengio, Yoshua},
3970 title = {Boosting Neural Networks},
3971 journal = {Neural Computation},
3972 volume = {12},
3973 number = {8},
3974 year = {2000},
3975 pages = {1869--1887},
3976 abstract = {``Boosting'' is a general method for improving the performance of learning algorithms. A recently proposed boosting algorithm is AdaBoost. It has been applied with great success to several benchmark machine learning problems using mainly decision trees as base classifiers. In this paper we investigate whether AdaBoost also works as well with neural networks, and we discuss the advantages and drawbacks of different versions of the AdaBoost algorithm. In particular, we compare training methods based on sampling the training set and weighting the cost function. The results suggest that random resampling of the training data is not the main explanation of the success of the improvements brought by AdaBoost. This is in contrast to Bagging which directly aims at reducing variance and for which random resampling is essential to obtain the reduction in generalization error. Our system achieves about 1.4\% error on a data set of online handwritten digits from more than 200 writers. A boosted multi-layer network achieved 1.5\% error on the UCI Letters and 8.1\% error on the UCI satellite data set, which is significantly better than boosted decision trees.},
3977 topics={Boosting},cat={J},
3978 }
3979
3980 @INPROCEEDINGS{secondorder:2001:nips,
3981 author = {Dugas, Charles and Bengio, Yoshua and B{\'{e}}lisle, Fran{\c c}ois and Nadeau, Claude and Garcia, Ren{\'{e}}},
3982 title = {Incorporating Second-Order Functional Knowledge for Better Option Pricing},
3983 year = {2001},
3984 crossref = {NIPS13-shorter},
3985 abstract = {Incorporating prior knowledge of a particular task into the architecture of a learning algorithm can greatly improve generalization performance. We study here a case where we know that the function to be learned is non-decreasing in two of its arguments and convex in one of them. For this purpose we propose a class of functions similar to multi-layer neural networks but (1) that has those properties, (2) is a universal approximator of continuous functions with these and other properties. We apply this new class of functions to the task of modeling the price of call options. Experiments show improvements on regressing the price of call options using the new types of function classes that incorporate the a priori constraints.},
3986 topics={Finance},cat={C},
3987 }
3988
3989 @ARTICLE{Sonnenburg+al-2007,
3990 author = {Sonnenburg, Soeren and et al. and Vincent, Pascal},
3991 title = {The Need for Open Source Software in Machine Learning.},
3992 year = {2007},
3993 note = {institution: Fraunhofer Publica [http://publica.fraunhofer.de/oai.har] (Germany)},
3994 crossref = {JMLR-shorter},
3995 abstract = {all authors: Sonnenburg, S. and Braun, M.L. and Ong, C.S. and Bengio, S. and Bottou, L. and Holmes, G. and {LeCun}, Y. and M{\"{u}}ller, K.-R. and Pereira, F. and Rasmussen, C.E. and R{\"{a}}tsch, G. and Sch{\"{o}}lkopf, B. and Smola, A. and Vincent, P. and Weston, J. and Williamson, R.C.
3996
3997 Open source tools have recently reached a level of maturity which makes them suitable for building large-scale real-world systems. At the same time, the field of machine learning has developed a large body of powerful learning algorithms for diverse applications. However, the true potential of these methods is not used, since existing implementations are not openly shared, resulting in software with low usability, and weak interoperability. We argue that this situation can be significantly improved by increasing incentives for researchers to publish their software under an open source model. Additionally, we outline the problems authors are faced with when trying to publish algorithmic implementations of machine learning methods. We believe that a resource of peer reviewed software accompanied by short articles would be highly valuable to both the machine learning and the general scientific community.}
3998 }
3999
4000 @ARTICLE{Takeuchi-Bengio-Kanamori-2002,
4001 author = {Takeuchi, Ichiro and Bengio, Yoshua and Kanamori, Takafumi},
4002 title = {Robust Regression with Asymmetric Heavy-Tail Noise Distributions},
4003 journal = {Neural Computation},
4004 volume = {14},
4005 number = {10},
4006 year = {2002},
4007 pages = {2469--2496},
4008 abstract = {In the presence of a heavy-tail noise distribution, regression becomes much more difficult. Traditional robust regression methods assume that the noise distribution is symmetric and they down-weight the influence of so-called outliers. When the noise distribution is asymmetric these methods yield biased regression estimators. Motivated by data-mining problems for the insurance industry, we propose in this paper a new approach to robust regression that is tailored to deal with the case where the noise distribution is asymmetric. The main idea is to learn most of the parameters of the model using conditional quantile estimators (which are biased but robust estimators of the regression), and to learn a few remaining parameters to combine and correct these estimators, to unbiasedly minimize the average squared error. Theoretical analysis and experiments show the clear advantages of the approach. Results are on artificial data as well as real insurance data, using both linear and neural-network predictors.},
4009 topics={Mining},cat={J},
4010 }
4011
% NOTE(review): replaced Unicode curly quotes and straight quotes with TeX ``...'' — this file is a classic (8-bit) BibTeX export, where raw Unicode punctuation can garble output.
4012 @ARTICLE{Thierry+al-2008,
4013 author = {Bertin-Mahieux, Thierry and Eck, Douglas and Maillet, Fran{\c c}ois and Lamere, Paul},
4014 title = {Autotagger: A Model For Predicting Social Tags from Acoustic Features on Large Music Databases},
4015 journal = {Journal of New Music Research},
4016 year = {2008},
4017 abstract = {Social tags are user-generated keywords associated with some resource on the Web. In the case of music, social tags have become an important component of ``Web 2.0'' recommender systems, allowing users to generate playlists based on use-dependent terms such as chill or jogging that have been applied to particular songs. In this paper, we propose a method for predicting these social tags directly from MP3 files. Using a set of 360 classifiers trained using the online ensemble learning algorithm FilterBoost, we map audio features onto social tags collected from the Web. The resulting automatic tags (or autotags) furnish information about music that is otherwise untagged or poorly tagged, allowing for insertion of previously unheard music into a social recommender. This avoids the ``cold-start problem'' common in such systems. Autotags can also be used to smooth the tag space from which similarities and
4018 recommendations are made by providing a set of comparable baseline tags for all tracks in a recommender system. Because the words we learn are the same as those used by people who label their music collections, it is easy to integrate our predictions into existing similarity and prediction methods based on web data.}
4019 }
4020
% NOTE(review): fixed author-name punctuation (J.-P., trailing period on O.), page range en-dash (375--390), and the spelling ``Michaelis-Menten'' in the abstract.
4021 @ARTICLE{Thivierge+al-2007,
4022 author = {Thivierge, J.-P. and Rivest, Fran{\c c}ois and Monchi, O.},
4023 title = {Spiking Neurons, Dopamine, and Plasticity: Timing Is Everything, But Concentration Also Matters},
4024 journal = {Synapse},
4025 volume = {61},
4026 year = {2007},
4027 pages = {375--390},
4028 abstract = {While both dopamine (DA) fluctuations and spike-timing-dependent plasticity (STDP) are known to influence long-term corticostriatal plasticity, little attention has been devoted to the interaction between these two fundamental mechanisms. Here, a theoretical framework is proposed to account for experimental results specifying the role of presynaptic activation, postsynaptic activation, and concentrations of extracellular DA in synaptic plasticity. Our starting point was an explicitly-implemented multiplicative rule linking STDP to Michaelis-Menten equations that models the dynamics of extracellular DA fluctuations. This rule captures a wide range of results on conditions leading to long-term potentiation and depression in simulations that manipulate the frequency of induced corticostriatal stimulation and DA release. A well-documented biphasic function relating DA concentrations to synaptic plasticity emerges naturally from simulations involving a multiplicative rule linking DA and neural activity. This biphasic function is found consistently across different neural coding schemes employed (voltage-based vs. spike-based models). By comparison, an additive rule fails to capture these results. The proposed framework is the first to generate testable predictions on the dual influence of DA concentrations and STDP on long-term plasticity, suggesting a way in which the biphasic influence of DA concentrations can modulate the direction and magnitude of change induced by STDP, and raising the possibility that DA concentrations may inverse the LTP/LTD components of the STDP rule.}
4029 }
4030
% TONGA: efficient online approximation to natural gradient descent for large-scale problems.
4031 @TECHREPORT{tonga-tr,
4032 author = {Le Roux, Nicolas and Manzagol, Pierre-Antoine and Bengio, Yoshua},
4033 title = {Topmoumoute online natural gradient algorithm},
4034 number = {1299},
4035 year = {2007},
4036 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4037 abstract = {Guided by the goal of obtaining an optimization algorithm that is
4038 both fast and yielding good generalization, we study the descent direction maximizing the decrease in generalization error or the probability of not increasing generalization error. The surprising result is that from both the Bayesian and frequentist perspectives this can yield the natural gradient direction. Although that direction can be very expensive to compute we develop an efficient, general, online approximation to the natural gradient descent which is suited to large scale problems. We report experimental results showing much faster convergence in computation time and in number of iterations with TONGA (Topmoumoute Online natural Gradient Algorithm) than with stochastic gradient descent, even on very large datasets.}
4039 }
4040
% Modified K-nearest-neighbor algorithms (local hyperplane / convex distance) aiming at SVM-level accuracy.
4041 @TECHREPORT{TR1197,
4042 author = {Vincent, Pascal and Bengio, Yoshua},
4043 title = {K-Local Hyperplane and Convex Distance Nearest Neighbor Algorithms},
4044 number = {1197},
4045 year = {2001},
4046 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4047 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TR1197.pdf},
4048 abstract = {Guided by an initial idea of building a complex (non linear) decision surface with maximal local margin in input space, we give a possible geometrical intuition as to why K-Nearest Neighbor ({KNN}) algorithms often perform more poorly than {SVM}s on classification tasks. We then propose modified K-Nearest Neighbor algorithms to overcome the perceived problem. The approach is similar in spirit to Tangent Distance, but with invariances inferred from the local neighborhood rather than prior knowledge. Experimental results on real world classification tasks suggest that the modified {KNN} algorithms often give a dramatic improvement over standard {KNN} and perform as well or better than {SVM}s.},
4049 topics={Kernel},cat={T},
4050 }
4051
% Robust regression for asymmetric heavy-tail noise via conditional quantile estimators.
4052 @TECHREPORT{TR1198,
4053 author = {Takeuchi, Ichiro and Bengio, Yoshua and Kanamori, Takafumi},
4054 title = {Robust Regression with Asymmetric Heavy-Tail Noise},
4055 number = {1198},
4056 year = {2001},
4057 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4058 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TR1198.pdf},
4059 abstract = {In the presence of a heavy-tail noise distribution, regression becomes much more difficult. Traditional robust regression methods assume that the noise distribution is symmetric and they downweight the influence of so-called outliers. When the noise distribution is asymmetric these methods yield strongly biased regression estimators. Motivated by data-mining problems for the insurance industry, we propose in this paper a new approach to robust regression that is tailored to deal with the case where the noise distribution is asymmetric. The main idea is to learn most of the parameters of the model using conditional quantile estimators (which are biased but robust estimators of the regression), and to learn a few remaining parameters to combine and correct these estimators, to minimize the average squared error. Theoretical analysis and experiments show the clear advantages of the approach. Results are on artificial data as well as real insurance data, using both linear and neural-network predictors.},
4060 topics={Mining},cat={T},
4061 }
4062
% Mixture-of-specialized-neural-networks method for car insurance premium estimation.
4063 @TECHREPORT{TR1199,
4064 author = {Chapados, Nicolas and Bengio, Yoshua and Vincent, Pascal and Ghosn, Joumana and Dugas, Charles and Takeuchi, Ichiro and Meng, Linyan},
4065 title = {Estimating Car Insurance Premia: a Case Study in High-Dimensional Data Inference},
4066 number = {1199},
4067 year = {2001},
4068 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4069 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TR1199.pdf},
4070 abstract = {Estimating insurance premia from data is a difficult regression problem for several reasons: the large number of variables, many of which are discrete, and the very peculiar shape of the noise distribution, asymmetric with fat tails, with a large majority zeros and a few unreliable and very large values. We introduce a methodology for estimating insurance premia that has been applied in the car insurance industry. It is based on mixtures of specialized neural networks, in order to reduce the effect of outliers on the estimation. Statistical comparisons with several different alternatives, including decision trees and generalized linear models show that the proposed method is significantly more precise, allowing to identify the least and most risky contracts, and reducing the median premium by charging more to the most risky customers.},
4071 topics={HighDimensional,Mining},cat={T},
4072 }
4073
% NOTE(review): added the missing ``~'' in the URL path (every sibling entry points at /~lisa/pointeurs/) and replaced Unicode curly quotes with TeX quotes for 8-bit BibTeX.
4074 @TECHREPORT{TR1200,
4075 author = {Bengio, Yoshua and Chapados, Nicolas},
4076 title = {Extending Metric-Based Model Selection and Regularization in the Absence of Unlabeled Data},
4077 number = {1200},
4078 year = {2001},
4079 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4080 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TR1200.ps},
4081 abstract = {Metric-based methods have recently been introduced for model selection and regularization, often yielding very significant improvements over all the alternatives tried (including cross-validation). However, these methods require a large set of unlabeled data, which is not always available in many applications. In this paper we extend these methods (TRI, ADJ and ADA) to the case where no unlabeled data is available. The extended methods (xTRI, xADJ, xADA) use a model of the input density directly estimated from the training set. The intuition is that the main reason why the above methods work well is that they make sure that the learned function behaves similarly on the training points and on ``neighboring'' points. The experiments are based on estimating a simple non-parametric density model. They show that the extended methods perform comparably to the originals even though no unlabeled data is used.},
4082 topics={ModelSelection,Finance},cat={T},
4083 }
4084
% Proposals (mostly untested, per the abstract) for new distributed probabilistic language models.
4085 @TECHREPORT{TR1215,
4086 author = {Bengio, Yoshua},
4087 title = {New Distributed Probabilistic Language Models},
4088 number = {1215},
4089 year = {2002},
4090 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4091 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TR1215.ps},
4092 abstract = {Our previous work on statistical modeling introduced the use of probabilistic feedforward neural networks with shared parameters in order to help dealing with the curse of dimensionality. This work started with the motivation to speed up the above model and to take advantage of prior knowledge e.g., in WordNet or in syntactically labeled data sets, and to better deal with polysemy. With the objective of reaching these goals, we present here a series of new statistical language models, most of which are yet untested.},
4093 topics={Markov,Language,Unsupervised},cat={T},
4094 }
4095
% NOTE(review): replaced Unicode curly quotes with TeX ``...'' for 8-bit classic-BibTeX compatibility; no other changes.
4096 @TECHREPORT{TR1216,
4097 author = {Bengio, Yoshua and S{\'{e}}n{\'{e}}cal, Jean-S{\'{e}}bastien},
4098 title = {Quick Training of Probabilistic Neural Nets by Importance Sampling},
4099 number = {1216},
4100 year = {2002},
4101 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4102 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/tr1216.ps},
4103 abstract = {Our previous work on statistical language modeling introduced the use of probabilistic feedforward neural networks to help dealing with the curse of dimensionality. Training this model by maximum likelihood however requires for each example to perform as many network passes as there are words in the vocabulary. Inspired by the contrastive divergence model, we proposed and evaluate sampling-based methods which require network passes only for the observed ``positive example'' and a few sampled negative example words. A very significant speed-up is obtained with an adaptive importance sampling.},
4104 topics={Markov,Language,Unsupervised},cat={T},
4105 }
4106
% NOTE(review): escaped the three raw ampersands (``\&'' — a bare & is a LaTeX alignment character and breaks compilation), replaced Unicode apostrophes with ASCII, and restored the missing space in ``in WordNet's''.
4107 @TECHREPORT{TR1231,
4108 author = {Bengio, Yoshua and Kermorvant, Christopher},
4109 title = {Extracting Hidden Sense Probabilities from Bitexts},
4110 number = {1231},
4111 year = {2003},
4112 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4113 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TR1231.pdf},
4114 abstract = {We propose a probabilistic model that is inspired by Diab \& Resnik's algorithm to extract disambiguation information from aligned bilingual texts. Like Diab \& Resnik's, the proposed model uses WordNet and the fact that word ambiguities are not always the same in the two languages. The generative model introduces a dependency between two translated words through a common ancestor in WordNet's ontology. Unlike Diab \& Resnik's algorithm it does not suppose that the translation in the source language has a single meaning.},
4115 topics={Language},cat={T},
4116 }
4117
% Equivalence between spectral clustering and kernel PCA as learning principal eigenfunctions of a kernel.
4118 @TECHREPORT{TR1232,
4119 author = {Bengio, Yoshua and Vincent, Pascal and Paiement, Jean-Fran{\c c}ois},
4120 title = {Learning Eigenfunctions of Similarity: Linking Spectral Clustering and Kernel {PCA}},
4121 number = {1232},
4122 year = {2003},
4123 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4124 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TR1232.pdf},
4125 abstract = {In this paper, we show a direct equivalence between spectral clustering and kernel {PCA}, and how both are special cases of a more general learning problem, that of learning the principal eigenfunctions of a kernel, when the functions are from a Hilbert space whose inner product is defined with respect to a density model. This suggests a new approach to unsupervised learning in which abstractions (such as manifolds and clusters) that represent the main features of the data density are extracted. Abstractions discovered at one level can be used to build higher-level abstractions. This paper also discusses how these abstractions can be used to recover a quantitative model of the input density, which is at least useful for evaluative and comparative purposes.},
4126 topics={HighDimensional,Kernel,Unsupervised},cat={T},
4127 }
4128
% NOTE(review): fixed the title typo ``Cros-Validation'' -> ``Cross-Validation'' and replaced the Unicode apostrophe in ``don't'' with ASCII for 8-bit BibTeX.
4129 @TECHREPORT{TR1234,
4130 author = {Bengio, Yoshua and Grandvalet, Yves},
4131 title = {No Unbiased Estimator of the Variance of K-Fold Cross-Validation},
4132 number = {1234},
4133 year = {2003},
4134 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4135 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TR1234.pdf},
4136 abstract = {Most machine learning researchers perform quantitative experiments to estimate generalization error and compare the performance of different algorithms (in particular, their proposed algorithm). In order to be able to draw statistically convincing conclusions, it is important for them to also estimate the uncertainty around the error (or error difference) estimate. This paper studies the very commonly used K-fold cross-validation estimator of generalization performance. The main theorem shows that there exists no universal (valid under all distributions) unbiased estimator of the variance of K-fold cross-validation. The analysis that accompanies this result is based on the eigen-decomposition of the covariance matrix of errors, which has only three different eigenvalues corresponding to three degrees of freedom of the matrix and three components of the total variance. This analysis helps to better understand the nature of the problem and how it can make na{\"{\i}}ve estimators (that don't take into account the error correlations due to the overlap between training and test sets) grossly underestimate variance. This is confirmed by numerical experiments in which the three components of the variance are compared when the difficulty of the learning problem and the number of folds are varied.},
4137 topics={Comparative},cat={T},
4138 }
4139
% Out-of-sample extensions for LLE, Isomap, MDS, Laplacian Eigenmaps and spectral clustering via kernel eigenfunctions.
4140 @TECHREPORT{tr1238,
4141 author = {Bengio, Yoshua and Paiement, Jean-Fran{\c c}ois and Vincent, Pascal},
4142 title = {Out-of-Sample Extensions for {LLE}, {I}somap, {MDS}, {E}igenmaps, and Spectral Clustering},
4143 number = {1238},
4144 year = {2003},
4145 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4146 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/tr1238.pdf},
4147 abstract = {Several unsupervised learning algorithms based on an eigendecomposition provide either an embedding or a clustering only for given training points, with no straightforward extension for out-of-sample examples short of recomputing eigenvectors. This paper provides algorithms for such an extension for Local Linear Embedding ({LLE}), Isomap, Laplacian Eigenmaps, Multi-Dimensional Scaling (all algorithms which provide lower-dimensional embedding for dimensionality reduction) as well as for Spectral Clustering (which performs non-Gaussian clustering). These extensions stem from a unified framework in which these algorithms are seen as learning eigenfunctions of a kernel. {LLE} and Isomap pose special challenges as the kernel is training-data dependent. Numerical experiments on real data show that the generalizations performed have a level of error comparable to the variability of the embedding algorithms to the choice of training data.},
4148 topics={HighDimensional,Kernel,Unsupervised},cat={T},
4149 }
4150
% Spectral clustering and kernel PCA framed as eigenfunction learning; defines generalization for embedding algorithms.
4151 @TECHREPORT{tr1239,
4152 author = {Bengio, Yoshua and Vincent, Pascal and Paiement, Jean-Fran{\c c}ois and Delalleau, Olivier and Ouimet, Marie and Le Roux, Nicolas},
4153 title = {Spectral Clustering and Kernel {PCA} are Learning Eigenfunctions},
4154 number = {1239},
4155 year = {2003},
4156 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4157 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/tr1239.pdf},
4158 abstract = {In this paper, we show a direct equivalence between spectral clustering and kernel {PCA}, and how both are special cases of a more general learning problem, that of learning the principal eigenfunctions of a kernel, when the functions are from a function space whose scalar product is defined with respect to a density model. This defines a natural mapping for new data points, for methods that only provided an embedding, such as spectral clustering and Laplacian eigenmaps. The analysis hinges on a notion of generalization for embedding algorithms based on the estimation of underlying eigenfunctions, and suggests ways to improve this generalization by smoothing the data empirical distribution.},
4159 topics={HighDimensional,Kernel,Unsupervised},cat={T},
4160 }
4161
% Locally weighted full-covariance Gaussian density estimation with an asymptotic convergence proof.
4162 @TECHREPORT{tr1240,
4163 author = {Vincent, Pascal and Bengio, Yoshua},
4164 title = {Locally Weighted Full Covariance Gaussian Density Estimation},
4165 number = {1240},
4166 year = {2003},
4167 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4168 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/tr1240.pdf},
4169 abstract = {We describe an interesting application of the principle of local learning to density estimation. Locally weighted fitting of a Gaussian with a regularized full covariance matrix yields a density estimator which displays improved behavior in the case where much of the probability mass is concentrated along a low dimensional manifold. While the proposed estimator is not guaranteed to integrate to 1 with a finite sample size, we prove asymptotic convergence to the true density. Experimental results illustrating the advantages of this estimator over classic non-parametric estimators are presented.},
4170 topics={HighDimensional,Kernel,Unsupervised},cat={T},
4171 }
4172
% NOTE(review): moved O(n^3) and m << n into math mode — a bare ``^'' in LaTeX text mode is a compile error, and ``<<'' misrenders outside math.
4173 @TECHREPORT{tr1247,
4174 author = {Bengio, Yoshua and Delalleau, Olivier and Le Roux, Nicolas},
4175 title = {Efficient Non-Parametric Function Induction in Semi-Supervised Learning},
4176 number = {1247},
4177 year = {2004},
4178 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4179 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/tr1247.pdf},
4180 abstract = {There has been an increase of interest for semi-supervised learning recently, because of the many datasets with large amounts of unlabeled examples and only a few labeled ones. This paper follows up on proposed non-parametric algorithms which provide an estimated continuous label for the given unlabeled examples. It extends them to function induction algorithms that correspond to the minimization of a regularization criterion applied to an out-of-sample example, and happens to have the form of a Parzen windows regressor. The advantage of the extension is that it allows predicting the label for a new example without having to solve again a linear system of dimension n (the number of unlabeled and labeled training examples), which can cost $O(n^3)$. Experiments show that the extension works well, in the sense of predicting a label close to the one that would have been obtained if the test example had been included in the unlabeled set. This relatively efficient function induction procedure can also be used when n is large to approximate the solution by writing it only in terms of a kernel expansion with $m \ll n$ terms, and reducing the linear system to m equations in m unknowns.},
4181 topics={Kernel,Unsupervised},cat={T},
4182 }
4183
% Non-local manifold learning: predicting tangent planes with parameters shared across space.
4184 @TECHREPORT{tr1250,
4185 author = {Bengio, Yoshua and Monperrus, Martin},
4186 title = {Discovering shared structure in manifold learning},
4187 number = {1250},
4188 year = {2004},
4189 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4190 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/tr-tangent.pdf},
4191 abstract = {We claim and present arguments to the effect that a large class of manifold learning algorithms that are essentially local will suffer from at least four generic problems associated with (1) noise in the data, (2) curvature of the manifold, (3) dimensionality of the manifold, and (4) the presence of many manifolds with little data per manifold. This analysis suggests non-local manifold learning algorithms which attempt to discover shared structure in the tangent planes at different positions. A criterion for such an algorithm is proposed and experiments estimating a tangent plane prediction function are presented. The function has parameters that are shared across space rather than estimated based on the local neighborhood, as in current non-parametric manifold learning algorithms. The results show clearly the advantages of this approach with respect to local manifold learning algorithms.},
4192 topics={HighDimensional,Kernel,Unsupervised},cat={T},
4193 }
4194
% NOTE(review): replaced Unicode right-single-quote apostrophes with ASCII ' (8-bit BibTeX safety) and fixed the French spelling ``d'exemples'' in the abstract.
4195 @TECHREPORT{tr1252,
4196 author = {Bengio, Yoshua and Larochelle, Hugo},
4197 title = {Implantation et analyse d'un mod{\`{e}}le graphique {\`{a}} entra{\^{\i}}nement supervis{\'{e}}, semi-supervis{\'{e}} et non-supervis{\'{e}} pour la d{\'{e}}sambigu{\"{\i}}sation s{\'{e}}mantique},
4198 number = {1252},
4199 year = {2004},
4200 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4201 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/tr1252.pdf},
4202 abstract = {La d{\'{e}}sambigu{\"{\i}}sation s{\'{e}}mantique est un sujet qui suscite beaucoup d'int{\'{e}}r{\^{e}}t dans la communaut{\'{e}} scientifique en apprentissage automatique. Quoique cette t{\^{a}}che ait {\'{e}}t{\'{e}} abord{\'{e}}e depuis les d{\'{e}}buts du traitement automatique de la langue, peu de progr{\`{e}}s ont {\'{e}}t{\'{e}} accomplis jusqu'{\`{a}} maintenant. Nous pr{\'{e}}sentons ici une application de d{\'{e}}sambigu{\"{\i}}sation bas{\'{e}}e sur un mod{\`{e}}le graphique probabiliste. Ce mod{\`{e}}le a {\'{e}}t{\'{e}} appris sur des donn{\'{e}}es {\'{e}}tiquet{\'{e}}es, non-{\'{e}}tiquet{\'{e}}es, et sur la hi{\'{e}}rarchie WordNet. Avec peu d'exemples d'apprentissage, ses performances sont comparables {\`{a}} celles de l'algorithme de Bayes na{\"{\i}}f. Il pourrait {\'{e}}ventuellement {\^{e}}tre adapt{\'{e}} {\`{a}} des corpus bi-textes.},
4203 topics={Unsupervised,Language},cat={T},
4204 }
4205
% Neural networks with an uncountable number of hidden units; includes a hyperparameter-free kernel-machine formulation.
4206 @TECHREPORT{tr1281,
4207 author = {Le Roux, Nicolas and Bengio, Yoshua},
4208 title = {Continuous Neural Networks},
4209 number = {1281},
4210 year = {2006},
4211 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4212 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/continuous_nnet_tr1281.pdf},
4213 abstract = {This article extends neural networks to the case of an uncountable number of hidden units, in several
4214 ways. In the first approach proposed, a finite parametrization is possible, allowing gradient-based
4215 learning. While having the same number of parameters as an ordinary neural network, its internal
4216 structure suggests that it can represent some smooth functions much more compactly. Under mild
4217 assumptions, we also find better error bounds than with ordinary neural networks. Furthermore, this
4218 parametrization may help reducing the problem of saturation of the neurons. In a second approach, the
4219 input-to-hidden weights are fully non-parametric, yielding a kernel machine for which we demonstrate
4220 a simple kernel formula. Interestingly, the resulting kernel machine can be made hyperparameter-free
4221 and still generalizes in spite of an absence of explicit regularization.},
4222 cat={T},topics={Kernel,HighDimensional},
4223 }
4224
% Greedy layer-wise training of deep networks: empirical study and extensions of Hinton et al.'s DBN algorithm.
4225 @TECHREPORT{tr1282,
4226 author = {Bengio, Yoshua and Lamblin, Pascal and Popovici, Dan and Larochelle, Hugo},
4227 title = {Greedy Layer-Wise Training of Deep Networks},
4228 number = {1282},
4229 year = {2006},
4230 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4231 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/dbn_supervised_tr1282.pdf},
4232 abstract = {Deep multi-layer neural networks have many levels of non-linearities, which allows them to potentially
4233 represent very compactly highly non-linear and highly-varying functions. However, until recently it
4234 was not clear how to train such deep networks, since gradient-based optimization starting from random
4235 initialization appears to often get stuck in poor solutions. Hinton et al. recently introduced a greedy
4236 layer-wise unsupervised learning algorithm for Deep Belief Networks (DBN), a generative model with
4237 many layers of hidden causal variables. In the context of the above optimization problem, we study
4238 this algorithm empirically and explore variants to better understand its success and extend it to cases
4239 where the inputs are continuous or where the structure of the input distribution is not revealing enough
4240 about the variable to be predicted in a supervised task.},
4241 cat={T},topics={HighDimensional,Unsupervised},
4242 }
4243
% Hybrid Gaussian/generalized-Pareto mixture for conditional density estimation with asymmetric fat tails.
4244 @TECHREPORT{tr1283,
4245 author = {Carreau, Julie and Bengio, Yoshua},
4246 title = {A Hybrid {Pareto} Model for Asymmetric Fat-Tail Data},
4247 number = {1283},
4248 year = {2006},
4249 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4250 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/fat_tails_tr1283.pdf},
4251 abstract = {We propose an estimator for the conditional density p(Y |X) that can adapt for asymmetric heavy tails
4252 which might depend on X. Such estimators have important applications in finance and insurance. We
4253 draw from Extreme Value Theory the tools to build a hybrid unimodal density having a parameter
4254 controlling the heaviness of the upper tail. This hybrid is a Gaussian whose upper tail has been
4255 replaced by a generalized {Pareto} tail. We use this hybrid in a multi-modal mixture in order to obtain
4256 a nonparametric density estimator that can easily adapt for heavy tailed data. To obtain a conditional
4257 density estimator, the parameters of the mixture estimator can be seen as functions of X and these
4258 functions learned. We show experimentally that this approach better models the conditional density in
4259 terms of likelihood than compared competing algorithms: conditional mixture models with other types
4260 of components and multivariate nonparametric models.},
4261 cat={T},topics={Unsupervised,Mining},
4262 }
4263
% Predicting distributed word representations from low-level features to handle new (out-of-vocabulary) words; POS-tagging experiments.
4264 @TECHREPORT{tr1284,
4265 author = {Larochelle, Hugo and Bengio, Yoshua},
4266 title = {Distributed Representation Prediction for Generalization to New Words},
4267 number = {1284},
4268 year = {2006},
4269 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4270 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/dist_rep_pred_tr1284.pdf},
4271 abstract = {Learning distributed representations of symbols (e.g. words) has been used in several Natural Language Processing
4272 systems. Such representations can capture semantic or syntactic similarities between words, which permit to fight
4273 the curse of dimensionality when considering sequences of such words. Unfortunately, because these representations
4274 are learned only for a previously determined vocabulary of words, it is not clear how to obtain representations
4275 for new words. We present here an approach which gets around this problem by considering the distributed representations
4276 as predictions from low-level or domain-knowledge features of words. We report experiments on a Part
4277 Of Speech tagging task, which demonstrates the success of this approach in learning meaningful representations and
4278 in providing improved accuracy, especially for new words.},
4279 cat={T},topics={HighDimensional,Language},
4280 }
4281
% Variance estimates for K-fold cross-validation under restrictive assumptions, plus a conservative correlation-aware t-test.
4282 @TECHREPORT{tr1285,
4283 author = {Grandvalet, Yves and Bengio, Yoshua},
4284 title = {Hypothesis Testing for Cross-Validation},
4285 number = {1285},
4286 year = {2006},
4287 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4288 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/xv_rho_stat_tr1285.pdf},
4289 abstract = {K-fold cross-validation produces variable estimates, whose variance cannot be estimated unbiasedly. However, in practice, one would like to
4290 provide a figure related to the variability of this estimate. The first part
4291 of this paper lists a series of restrictive assumptions (on the distribution of
4292 cross-validation residuals) that allow to derive unbiased estimates. We exhibit three such estimates, corresponding to differing assumptions. Their
4293 similar expressions however entail almost identical empirical behaviors.
4294 Then, we look for a conservative test for detecting significant differences
4295 in performances between two algorithms. Our proposal is based on the
4296 derivation of the form of a t-statistic parametrized by the correlation of
4297 residuals between each validation set. Its calibration is compared to the
4298 usual t-test. While the latter is overconfident in concluding that differences are indeed significant, our test is bound to be more skeptical, with
4299 smaller type-I error.},
4300 cat={T},topics={ModelSelection,Comparative},
4301 }
4302
% Multi-task learning for a zero-data task in computational chemistry / virtual screening, including an SVM modification.
4303 @TECHREPORT{tr1286,
4304 author = {Erhan, Dumitru and Bengio, Yoshua and {L'Heureux}, Pierre-Jean and Yue, Shi Yi},
4305 title = {Generalizing to a Zero-Data Task: a Computational Chemistry Case Study},
4306 number = {1286},
4307 year = {2006},
4308 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4309 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/mt_qsar_tr1286.pdf},
4310 abstract = {We investigate the problem of learning several tasks simultaneously in order to transfer the acquired
4311 knowledge to a completely new task for which no training data are available. Assuming that the tasks
4312 share some representation that we can discover efficiently, such a scenario should lead to a better model of
4313 the new task, as compared to the model that is learned by only using the knowledge of the new task. We
4314 have evaluated several supervised learning algorithms in order to discover shared representations among
4315 the tasks defined in a computational chemistry/drug discovery problem. We have cast the problem from
4316 a statistical learning point of view and set up the general hypotheses that have to be tested in order
4317 to validate the multi-task learning approach. We have then evaluated the performance of the learning
4318 algorithms and showed that it is indeed possible to learn a shared representation of the tasks that allows
4319 to generalize to a new task for which no training data are available. From a theoretical point of view,
4320 our contribution also comprises a modification to the Support Vector Machine algorithm, which can
4321 produce state-of-the-art results using multi-task learning concepts at its core. From a practical point
4322 of view, our contribution is that this algorithm can be readily used by pharmaceutical companies for
4323 virtual screening campaigns.},
4324 cat={T},topics={MultiTask,Kernel,Bioinformatic},
4325 }
4326
% Conference paper, NAACL HLT 2009.
% NOTE(review): no pages/publisher recorded for this entry - TODO add if available.
@INPROCEEDINGS{Turian+al-2009,
author = {Turian, Joseph and Bergstra, James and Bengio, Yoshua},
title = {Quadratic Features and Deep Architectures for Chunking},
booktitle = {North American Chapter of the Association for Computational Linguistics - Human Language Technologies (NAACL HLT)},
year = {2009},
abstract = {We experiment with several chunking models. Deeper architectures achieve better generalization. Quadratic filters, a simplification of theoretical model of V1 complex cells, reliably increase accuracy. In fact, logistic regression with quadratic filters outperforms a standard single hidden layer neural network. Adding quadratic filters to logistic regression is almost as effective as feature engineering. Despite predicting each output label independently, our model is competitive with ones that use previous decisions.}
}
4334
% NIPS workshop paper.
% FIX(review): escaped the raw "&" in the abstract ("Mnih \& Hinton") - an
% unescaped ampersand is a LaTeX error when the abstract is typeset.
% NOTE(review): citation key says 2010 but year = {2009}; the workshop year is
% 2009 - key left unchanged because other files may cite it.
@INPROCEEDINGS{Turian+al-2010,
author = {Turian, Joseph and Ratinov, Lev and Bengio, Yoshua and Roth, Dan},
title = {A preliminary evaluation of word representations for named-entity recognition},
booktitle = {NIPS Workshop on Grammar Induction, Representation of Language and Language Learning},
year = {2009},
url = {http://www.iro.umontreal.ca/~lisa/pointeurs/wordrepresentations-ner.pdf},
abstract = {We use different word representations as word features for a named-entity recognition (NER) system with a linear model. This work is part of a larger empirical survey, evaluating different word representations on different NLP tasks. We evaluate Brown clusters, Collobert and Weston (2008) embeddings, and HLBL (Mnih \& Hinton, 2009) embeddings of words. All three representations improve accuracy on NER, with the Brown clusters providing a larger improvement than the two embeddings, and the HLBL embeddings more than the Collobert and Weston (2008) embeddings. We also discuss some of the practical issues in using embeddings as features. Brown clusters are simpler than embeddings because they require less hyperparameter tuning.}
}
4343
% ACL 2010 conference paper.
% FIX(review): added the missing space in the booktitle
% ("Linguistics(ACL2010)" -> "Linguistics (ACL2010)").
@INPROCEEDINGS{Turian+Ratinov+Bengio-2010,
author = {Turian, Joseph and Ratinov, Lev and Bengio, Yoshua},
title = {Word representations: A simple and general method for semi-supervised learning},
booktitle = {Association for Computational Linguistics (ACL2010)},
year = {2010}
}
4350
% NIPS 15 paper; venue fields are inherited through crossref = {NIPS15-shorter}.
% The parent entry appears later in this file, as classic BibTeX requires
% (crossref targets must come after all entries that reference them).
@INPROCEEDINGS{Vincent-Bengio-2003,
author = {Vincent, Pascal and Bengio, Yoshua},
title = {Manifold Parzen Windows},
year = {2003},
pages = {825--832},
crossref = {NIPS15-shorter},
abstract = {The similarity between objects is a fundamental element of many learning algorithms. Most non-parametric methods take this similarity to be fixed, but much recent work has shown the advantages of learning it, in particular to exploit the local invariances in the data or to capture the possibly non-linear manifold on which most of the data lies. We propose a new non-parametric kernel density estimation method which captures the local structure of an underlying manifold through the leading eigenvectors of regularized local covariance matrices. Experiments in density estimation show significant improvements with respect to Parzen density estimators. The density estimators can also be used within Bayes classifiers, yielding classification rates similar to {SVM}s and much superior to the Parzen classifier.},
topics={HighDimensional,Kernel,Unsupervised},cat={C},
}
4360
% Tech report version (DIRO 1316) of the ICML 2008 paper VincentPLarochelleH2008;
% same title and author list, kept as a separate entry with its own key.
@TECHREPORT{Vincent-TR1316,
author = {Vincent, Pascal and Larochelle, Hugo and Bengio, Yoshua and Manzagol, Pierre-Antoine},
title = {Extracting and Composing Robust Features with Denoising Autoencoders},
number = {1316},
year = {2008},
institution = {D{\'{e}}partement d'Informatique et Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
url = {http://www.iro.umontreal.ca/~vincentp/Publications/denoising_autoencoders_tr1316.pdf},
abstract = {Previous work has shown that the difficulties in learning deep generative or discriminative models can be overcome by an initial unsupervised learning step that maps inputs to useful intermediate representations. We introduce and motivate a new training principle for unsupervised learning of a representation based on the idea of making the learned representations robust to partial corruption of the input pattern. This approach can be used to train autoencoders, and these denoising autoencoders can be stacked to initialize deep architectures. The algorithm can be motivated from a manifold learning and information theoretic perspective or from a generative model perspective. Comparative experiments clearly show the surprising advantage of corrupting the input of autoencoders on a pattern classification benchmark suite.}
}
4370
% PhD thesis (title in French; accents written as classic-BibTeX special
% characters, which keeps sorting correct with 8-bit BibTeX).
@PHDTHESIS{Vincent2003,
author = {Vincent, Pascal},
title = {Mod{\`{e}}les {\`{a}} Noyaux {\`{a}} Structure Locale},
year = {2003},
school = {D{\'{e}}partement d'Informatique et Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
}
4377
% Journal article.
% FIX(review): typo in abstract "deriveMDL" -> "derive MDL".
% NOTE(review): no volume/pages recorded; publisher records list this Machine
% Learning article with full bibliographic data - TODO confirm and add.
@ARTICLE{vincent:2001,
author = {Vincent, Pascal and Bengio, Yoshua},
title = {Kernel Matching Pursuit},
journal = {Machine Learning},
year = {2001},
abstract = {We show how Matching Pursuit can be used to build kernel-based solutions to machine-learning problems while keeping control of the sparsity of the solution, and how it can be extended to use non-squared error loss functions. We also derive MDL motivated generalization bounds for this type of algorithm. Finally, links to boosting algorithms and {RBF} training procedures, as well as extensive experimental comparison with {SVM}s are given, showing comparable results with typically sparser models.},
topics={HighDimensional,Kernel},cat={J},
}
4386
% ICML 2008 paper; venue inherited via crossref = {ICML08-shorter}.
% FIX(review): the abstract previously stored here described discriminative RBM
% classifiers - it belongs to a different ICML'08 paper and contradicts this
% title. Replaced with the denoising-autoencoder abstract, identical to the
% one in the tech-report version of the same work (entry Vincent-TR1316).
@INPROCEEDINGS{VincentPLarochelleH2008,
author = {Vincent, Pascal and Larochelle, Hugo and Bengio, Yoshua and Manzagol, Pierre-Antoine},
title = {Extracting and Composing Robust Features with Denoising Autoencoders},
year = {2008},
pages = {1096--1103},
crossref = {ICML08-shorter},
abstract = {Previous work has shown that the difficulties in learning deep generative or discriminative models can be overcome by an initial unsupervised learning step that maps inputs to useful intermediate representations. We introduce and motivate a new training principle for unsupervised learning of a representation based on the idea of making the learned representations robust to partial corruption of the input pattern. This approach can be used to train autoencoders, and these denoising autoencoders can be stacked to initialize deep architectures. The algorithm can be motivated from a manifold learning and information theoretic perspective or from a generative model perspective. Comparative experiments clearly show the surprising advantage of corrupting the input of autoencoders on a pattern classification benchmark suite.}
}
4399
% Tech report 1341, University of Montreal.
% FIX(review): added the missing sentence-final period at the end of the abstract.
% NOTE(review): institution given in English here but in French for the other
% DIRO reports in this file - verify which form the lab prefers.
@TECHREPORT{visualization_techreport,
author = {Erhan, Dumitru and Bengio, Yoshua and Courville, Aaron and Vincent, Pascal},
title = {Visualizing Higher-Layer Features of a Deep Network},
number = {1341},
year = {2009},
institution = {University of Montreal},
abstract = {Deep architectures have demonstrated state-of-the-art results in a variety of
settings, especially with vision datasets. Beyond the model definitions and the quantitative analyses, there is a need for qualitative comparisons of the solutions learned by various deep architectures. The goal of this paper is to find good qualitative interpretations of high level features represented by such models. To this end, we contrast and compare several techniques applied on Stacked Denoising Autoencoders and Deep Belief Networks, trained on several vision datasets. We show that, perhaps counter-intuitively, such interpretation is possible at the unit level, that it is simple to accomplish and that the results are consistent across various techniques. We hope that such techniques will allow researchers in deep architectures to understand more of how and why deep architectures work.}
}
4409
% Crossref parent (short form) for AISTATS 2009 citations; title and booktitle
% are duplicated because classic BibTeX does not map title -> booktitle on
% crossref inheritance.
@INPROCEEDINGS{xAISTATS2009-short,
title = {Proc. AISTATS'2009},
booktitle = {Proc. AISTATS'2009},
year = {2009}
}
4415
4416
4417 @MISC{Yoshua+al-snowbird-2008,
4418 author = {Bengio, Yoshua and Larochelle, Hugo and Turian, Joseph},
4419 title = {Deep Woods},
4420 year = {2008},
4421 howpublished = {Poster presented at the Learning@Snowbird Workshop, Snowbird, USA, 2008}
4422 }
4423
% Journal article, Chemistry & Biology 12(9), 2005.
% FIX(review): corrected the senior author's surname "Saragori" -> "Saragovi"
% (H. Uri Saragovi) - verify against the published article.
% NOTE(review): "Boon, Hong" looks like a mis-split name (possibly
% "Lee, Hong Boon") - left unchanged, TODO verify against the author list.
@ARTICLE{Zaccaro-et-al-2005,
author = {Zaccaro, Maria Clara and Boon, Hong and Pattarawarapan, Mookda and Xia, Zebin and Caron, Antoine and {L'Heureux}, Pierre-Jean and Bengio, Yoshua and Burgess, Kevin and Saragovi, H. Uri},
title = {Selective Small Molecule Peptidomimetic Ligands of TrkC and TrkA Receptors Afford Discrete or Complete Neurotrophic Activities},
journal = {Chemistry \& Biology},
volume = {12},
number = {9},
year = {2005},
pages = {1015--1028}
}
4433
4434
4435
4436 crossreferenced publications:
% ---------------------------------------------------------------------------
% Crossref parent entries. They are placed after every entry that references
% them, as classic BibTeX requires. year = {-1} is a sentinel used throughout
% this file so that no year is printed from the parent (the child supplies it).
% booktitle duplicates title because classic BibTeX does not map
% title -> booktitle when inheriting through crossref.
% ---------------------------------------------------------------------------
@INPROCEEDINGS{ICML09,
editor = {Bottou, {L{\'{e}}on} and Littman, Michael},
title = {Proceedings of the Twenty-sixth International Conference on Machine Learning (ICML'09)},
booktitle = {Proceedings of the Twenty-sixth International Conference on Machine Learning (ICML'09)},
year = {-1},
publisher = {ACM}
}

@INPROCEEDINGS{NIPS7,
editor = {Tesauro, G. and Touretzky, D. S. and Leen, T. K.},
title = {Advances in Neural Information Processing Systems 7 (NIPS'94)},
booktitle = {Advances in Neural Information Processing Systems 7 (NIPS'94)},
year = {-1},
publisher = {MIT Press}
}

@INPROCEEDINGS{NIPS6,
editor = {Cowan, J. D. and Tesauro, G. and Alspector, J.},
title = {Advances in Neural Information Processing Systems 6 (NIPS'93)},
booktitle = {Advances in Neural Information Processing Systems 6 (NIPS'93)},
year = {-1},
publisher = {MIT Press}
}

@INPROCEEDINGS{NIPS8,
editor = {Touretzky, D. S. and Mozer, M. and Hasselmo, M.E.},
title = {Advances in Neural Information Processing Systems 8 (NIPS'95)},
booktitle = {Advances in Neural Information Processing Systems 8 (NIPS'95)},
year = {-1},
publisher = {MIT Press}
}

@ARTICLE{JMLR,
journal = {Journal of Machine Learning Research},
year = {-1}
}

@INPROCEEDINGS{NIPS19,
editor = {{Sch{\"{o}}lkopf}, Bernhard and Platt, John and Hoffman, Thomas},
title = {Advances in Neural Information Processing Systems 19 (NIPS'06)},
booktitle = {Advances in Neural Information Processing Systems 19 (NIPS'06)},
year = {-1},
publisher = {MIT Press}
}

@INPROCEEDINGS{NIPS10,
editor = {Jordan, M.I. and Kearns, M.J. and Solla, S.A.},
title = {Advances in Neural Information Processing Systems 10 (NIPS'97)},
booktitle = {Advances in Neural Information Processing Systems 10 (NIPS'97)},
year = {-1},
publisher = {MIT Press}
}

@INPROCEEDINGS{NIPS1,
editor = {Touretzky, D. S.},
title = {Advances in Neural Information Processing Systems 1 (NIPS'88)},
booktitle = {Advances in Neural Information Processing Systems 1 (NIPS'88)},
year = {-1},
publisher = {Morgan Kaufmann}
}

@INPROCEEDINGS{NIPS2,
editor = {Touretzky, D. S.},
title = {Advances in Neural Information Processing Systems 2 (NIPS'89)},
booktitle = {Advances in Neural Information Processing Systems 2 (NIPS'89)},
year = {-1},
publisher = {Morgan Kaufmann}
}
4505
% Crossref parent (year = {-1} sentinel, see note at top of this section).
% FIX(review): corrected editor surname "Lipmann" -> "Lippmann"
% (Richard P. Lippmann, co-editor of the NIPS 4 volume).
@INPROCEEDINGS{NIPS4,
editor = {Moody, J. E. and Hanson, S. J. and Lippmann, R. P.},
title = {Advances in Neural Information Processing Systems 4 (NIPS'91)},
booktitle = {Advances in Neural Information Processing Systems 4 (NIPS'91)},
year = {-1},
publisher = {Morgan Kaufmann}
}
4513
% Crossref parent (year = {-1} sentinel, see note at top of this section).
% FIX(review): added the missing third volume editor K.-R. Mueller; the NIPS 12
% volume lists Solla, Leen and Mueller - verify against the published volume.
@INPROCEEDINGS{NIPS12,
editor = {Solla, S.A. and Leen, T. K. and M{\"{u}}ller, K.-R.},
title = {Advances in Neural Information Processing Systems 12 (NIPS'99)},
booktitle = {Advances in Neural Information Processing Systems 12 (NIPS'99)},
year = {-1},
publisher = {MIT Press}
}
4521
% More crossref parents (year = {-1} sentinel, see note at top of this section).
% NOTE(review): NIPS16, NIPS17 and NIPS22 record no publisher - TODO add if
% needed. xAISTATS2009 uses a real year = {2009}, unlike its siblings.
@INPROCEEDINGS{NIPS16,
editor = {Becker, S. and Saul, L. and {Sch{\"{o}}lkopf}, Bernhard},
title = {Advances in Neural Information Processing Systems 16 (NIPS'03)},
booktitle = {Advances in Neural Information Processing Systems 16 (NIPS'03)},
year = {-1}
}

@INPROCEEDINGS{NIPS22,
editor = {Bengio, Yoshua and Schuurmans, Dale and Williams, Christopher and Lafferty, John and Culotta, Aron},
title = {Advances in Neural Information Processing Systems 22 (NIPS'09)},
booktitle = {Advances in Neural Information Processing Systems 22 (NIPS'09)},
year = {-1}
}

@INPROCEEDINGS{NIPS20,
editor = {Platt, John and Koller, D. and Singer, Yoram and Roweis, S.},
title = {Advances in Neural Information Processing Systems 20 (NIPS'07)},
booktitle = {Advances in Neural Information Processing Systems 20 (NIPS'07)},
year = {-1},
publisher = {MIT Press}
}

@INPROCEEDINGS{xAISTATS2009,
title = {Proceedings of the Twelfth International Conference on Artificial Intelligence and Statistics (AISTATS 2009)},
booktitle = {Proceedings of the Twelfth International Conference on Artificial Intelligence and Statistics (AISTATS 2009)},
year = {2009},
}

@INPROCEEDINGS{NIPS9,
editor = {Mozer, M. and Jordan, M.I. and Petsche, T.},
title = {Advances in Neural Information Processing Systems 9 (NIPS'96)},
booktitle = {Advances in Neural Information Processing Systems 9 (NIPS'96)},
year = {-1},
publisher = {MIT Press}
}

@INPROCEEDINGS{NIPS17,
editor = {Saul, Lawrence K. and Weiss, Yair and Bottou, {L{\'{e}}on}},
title = {Advances in Neural Information Processing Systems 17 (NIPS'04)},
booktitle = {Advances in Neural Information Processing Systems 17 (NIPS'04)},
year = {-1}
}

@INPROCEEDINGS{ICML08,
editor = {Cohen, William W. and McCallum, Andrew and Roweis, Sam T.},
title = {Proceedings of the Twenty-fifth International Conference on Machine Learning (ICML'08)},
booktitle = {Proceedings of the Twenty-fifth International Conference on Machine Learning (ICML'08)},
year = {-1},
publisher = {ACM}
}

@INPROCEEDINGS{ICML07,
editor = {Ghahramani, Zoubin},
title = {Proceedings of the 24th International Conference on Machine Learning (ICML'07)},
booktitle = {Proceedings of the 24th International Conference on Machine Learning (ICML'07)},
year = {-1},
publisher = {ACM}
}

@TECHREPORT{DIRO,
title = {DIRO},
year = {-1},
institution = {D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
}

@INPROCEEDINGS{NIPS18,
editor = {Weiss, Yair and {Sch{\"{o}}lkopf}, Bernhard and Platt, John},
title = {Advances in Neural Information Processing Systems 18 (NIPS'05)},
booktitle = {Advances in Neural Information Processing Systems 18 (NIPS'05)},
year = {-1},
publisher = {MIT Press}
}

@INPROCEEDINGS{NIPS13,
editor = {Leen, T. K. and Dietterich, T.G.},
title = {Advances in Neural Information Processing Systems 13 (NIPS'00)},
booktitle = {Advances in Neural Information Processing Systems 13 (NIPS'00)},
year = {-1},
publisher = {MIT Press}
}
4602
% Crossref parent (year = {-1} sentinel, see note at top of this section).
% FIX(review): editor was entered as "Raedt, Luc De", which BibTeX parses as
% first name "Luc De" / surname "Raedt". The surname is "De Raedt".
@INPROCEEDINGS{ICML05,
editor = {De Raedt, Luc and Wrobel, Stefan},
title = {Proceedings of the Twenty-second International Conference on Machine Learning (ICML'05)},
booktitle = {Proceedings of the Twenty-second International Conference on Machine Learning (ICML'05)},
year = {-1},
publisher = {ACM}
}
4610
% Crossref parent (year = {-1} sentinel, see note at top of this section).
% FIX(review): "Twenty-three" -> "Twenty-third" (ordinal) in both title and
% booktitle, matching the wording of the other ICML parents in this file.
@INPROCEEDINGS{ICML06,
editor = {Cohen, William W. and Moore, Andrew},
title = {Proceedings of the Twenty-third International Conference on Machine Learning (ICML'06)},
booktitle = {Proceedings of the Twenty-third International Conference on Machine Learning (ICML'06)},
year = {-1},
publisher = {ACM}
}
4618
% Crossref parent (year = {-1} sentinel, see note at top of this section).
% FIX(review): added missing co-editor K. Obermayer; the NIPS 15 volume lists
% Becker, Thrun and Obermayer as editors - verify against the published volume.
@INPROCEEDINGS{NIPS15,
editor = {Becker, S. and Thrun, Sebastian and Obermayer, K.},
title = {Advances in Neural Information Processing Systems 15 (NIPS'02)},
booktitle = {Advances in Neural Information Processing Systems 15 (NIPS'02)},
year = {-1},
publisher = {MIT Press}
}
4626
% ---------------------------------------------------------------------------
% "-shorter" crossref parents: abbreviated venue names used by the compact
% bibliography (the "shorter" export this file implements). year = {-1} is the
% same no-year sentinel used by the long-form parents above.
% ---------------------------------------------------------------------------
@INPROCEEDINGS{ICML01-shorter,
title = {ICML'01},
booktitle = {ICML'01},
year = {-1},
publisher = {Morgan Kaufmann}
}
@INPROCEEDINGS{ICML02-shorter,
title = {ICML'02},
booktitle = {ICML'02},
year = {-1},
publisher = {Morgan Kaufmann}
}
@INPROCEEDINGS{ICML03-shorter,
title = {ICML'03},
booktitle = {ICML'03},
year = {-1},
publisher = {AAAI Press}
}
@INPROCEEDINGS{ICML04-shorter,
title = {ICML'04},
booktitle = {ICML'04},
year = {-1},
publisher = {ACM}
}
@INPROCEEDINGS{ICML05-shorter,
title = {ICML'05},
booktitle = {ICML'05},
year = {-1},
publisher = {ACM}
}
@INPROCEEDINGS{ICML06-shorter,
title = {ICML'06},
booktitle = {ICML'06},
year = {-1},
publisher = {ACM}
}
@INPROCEEDINGS{ICML07-shorter,
title = {ICML'07},
booktitle = {ICML'07},
year = {-1},
publisher = {ACM}
}
@INPROCEEDINGS{ICML08-shorter,
title = {ICML'08},
booktitle = {ICML'08},
year = {-1},
publisher = {ACM}
}
@INPROCEEDINGS{ICML09-shorter,
title = {ICML'09},
booktitle = {ICML'09},
year = {-1},
publisher = {ACM}
}
@INPROCEEDINGS{ICML96-shorter,
title = {ICML'96},
booktitle = {ICML'96},
year = {-1},
publisher = {Morgan Kaufmann}
}
@INPROCEEDINGS{ICML97-shorter,
title = {ICML'97},
booktitle = {ICML'97},
year = {-1},
publisher = {Morgan Kaufmann}
}
@INPROCEEDINGS{ICML98-shorter,
title = {ICML'98},
booktitle = {ICML'98},
year = {-1},
publisher = {Morgan Kaufmann}
}
@INPROCEEDINGS{ICML99-shorter,
title = {ICML'99},
booktitle = {ICML'99},
year = {-1},
publisher = {Morgan Kaufmann}
}
@ARTICLE{JMLR-shorter,
journal = {JMLR},
year = {-1}
}
@INPROCEEDINGS{NIPS1-shorter,
title = {NIPS'88},
booktitle = {NIPS 1},
year = {-1},
publisher = {Morgan Kaufmann}
}
@INPROCEEDINGS{NIPS10-shorter,
title = {NIPS'97},
booktitle = {NIPS 10},
year = {-1},
publisher = {MIT Press}
}
@INPROCEEDINGS{NIPS11-shorter,
title = {NIPS'98},
booktitle = {NIPS 11},
year = {-1},
publisher = {MIT Press}
}
@INPROCEEDINGS{NIPS12-shorter,
title = {NIPS'99},
booktitle = {NIPS 12},
year = {-1},
publisher = {MIT Press}
}
@INPROCEEDINGS{NIPS13-shorter,
title = {NIPS'00},
booktitle = {NIPS 13},
year = {-1},
publisher = {MIT Press}
}
@INPROCEEDINGS{NIPS14-shorter,
title = {NIPS'01},
booktitle = {NIPS 14},
year = {-1},
publisher = {MIT Press}
}
@INPROCEEDINGS{NIPS15-shorter,
title = {NIPS'02},
booktitle = {NIPS 15},
year = {-1},
publisher = {MIT Press}
}
@INPROCEEDINGS{NIPS16-shorter,
title = {NIPS'03},
booktitle = {NIPS 16},
year = {-1}
}
@INPROCEEDINGS{NIPS17-shorter,
title = {NIPS'04},
booktitle = {NIPS 17},
year = {-1}
}
@INPROCEEDINGS{NIPS18-shorter,
title = {NIPS'05},
booktitle = {NIPS 18},
year = {-1},
publisher = {MIT Press}
}
@INPROCEEDINGS{NIPS19-shorter,
title = {NIPS'06},
booktitle = {NIPS 19},
year = {-1},
publisher = {MIT Press}
}
@INPROCEEDINGS{NIPS2-shorter,
title = {NIPS'89},
booktitle = {NIPS 2},
year = {-1},
publisher = {Morgan Kaufmann}
}
@INPROCEEDINGS{NIPS20-shorter,
title = {NIPS'07},
booktitle = {NIPS 20},
year = {-1},
publisher = {MIT Press}
}
% "-shorter" crossref parent (year = {-1} sentinel).
% FIX(review): publisher "Nips Foundation" -> "NIPS Foundation" (the
% organization's name is an acronym).
@INPROCEEDINGS{NIPS21-shorter,
title = {NIPS'08},
booktitle = {NIPS 21},
year = {-1},
publisher = {NIPS Foundation (http://books.nips.cc)}
}
% Remaining "-shorter" crossref parents (year = {-1} sentinel, see the note at
% the start of the "-shorter" section). NIPS22-shorter and xAISTATS2009-shorter
% record no publisher - TODO add if needed.
@INPROCEEDINGS{NIPS22-shorter,
title = {NIPS'09},
booktitle = {NIPS 22},
year = {-1}
}
@INPROCEEDINGS{NIPS3-shorter,
title = {NIPS'90},
booktitle = {NIPS 3},
year = {-1},
publisher = {Morgan Kaufmann}
}
@INPROCEEDINGS{NIPS4-shorter,
title = {NIPS'91},
booktitle = {NIPS 4},
year = {-1},
publisher = {Morgan Kaufmann}
}
@INPROCEEDINGS{NIPS5-shorter,
title = {NIPS'92},
booktitle = {NIPS 5},
year = {-1},
publisher = {Morgan Kaufmann}
}
@INPROCEEDINGS{NIPS6-shorter,
title = {NIPS'93},
booktitle = {NIPS 6},
year = {-1},
publisher = {MIT Press}
}
@INPROCEEDINGS{NIPS7-shorter,
title = {NIPS'94},
booktitle = {NIPS 7},
year = {-1},
publisher = {MIT Press}
}
@INPROCEEDINGS{NIPS8-shorter,
title = {NIPS'95},
booktitle = {NIPS 8},
year = {-1},
publisher = {MIT Press}
}
@INPROCEEDINGS{NIPS9-shorter,
title = {NIPS'96},
booktitle = {NIPS 9},
year = {-1},
publisher = {MIT Press}
}
@INPROCEEDINGS{xAISTATS2009-shorter,
title = {AISTATS'2009},
booktitle = {AISTATS'2009},
year = {-1}
}