comparison writeup/mlj_submission/aigaion-shorter.bib @ 587:b1be957dd1be

Added mlj_submission to group every file needed for that.
author fsavard
date Thu, 30 Sep 2010 17:51:02 -0400
parents
children
comparison
equal deleted inserted replaced
585:4933077b8676 587:b1be957dd1be
1 %Aigaion2 BibTeX export from LISA - Publications
2 %Tuesday 01 June 2010 10:46:52 AM
3 @INPROCEEDINGS{Attardi+al-2009,
4 author = {Attardi, Giuseppe and Dell'Orletta, Felice and Simi, Maria and Turian, Joseph},
5 keywords = {classifier, dependency parsing, natural language, parser, perceptron},
6 title = {Accurate Dependency Parsing with a Stacked Multilayer Perceptron},
7 booktitle = {Proceedings of Evalita 2009},
8 series = {LNCS},
9 year = {2009},
10 publisher = {Springer},
11 abstract = {DeSR is a statistical transition-based dependency parser which learns from annotated corpora which actions to perform for building parse trees while scanning a sentence. We describe recent improvements to the parser, in particular stacked parsing, exploiting a beam search strategy and using a Multilayer Perceptron classifier. For the Evalita 2009 Dependency Parsing task DeSR was configured to use a combination of stacked parsers. The stacked combination achieved the best accuracy scores in both the main and pilot subtasks. The contribution to the result of various choices is analyzed, in particular for taking advantage of the peculiar features of the TUT Treebank.}
12 }
13
14 @INPROCEEDINGS{Bengio+al-2009,
15 author = {Bengio, Yoshua and Louradour, Jerome and Collobert, Ronan and Weston, Jason},
16 title = {Curriculum Learning},
17 year = {2009},
18 crossref = {ICML09-shorter},
19 abstract = {Humans and animals learn much better when the examples are not randomly presented but organized in a meaningful order which illustrates gradually more concepts, and more complex ones. Here, we formalize such training strategies in the context of machine learning, and call them 'curriculum learning'. In the context of recent research studying the difficulty of training in the presence of non-convex training criteria (for deep deterministic and stochastic neural networks), we explore curriculum learning in various set-ups. The experiments show that significant improvements in generalization can be achieved by using a particular curriculum, i.e., the selection and order of training examples. We hypothesize that curriculum learning has both an effect on the speed of convergence of the training process to a minimum and, in the case of non-convex criteria, on the quality of the local minima obtained: curriculum learning can be seen as a particular form of continuation method (a general strategy for global optimization of non-convex functions).}
20 }
21
22 @TECHREPORT{Bengio+al-2009-TR,
23 author = {Bengio, Yoshua and Louradour, Jerome and Collobert, Ronan and Weston, Jason},
24 title = {Curriculum Learning},
25 number = {1330},
26 year = {2009},
27 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
28 abstract = {Humans and animals learn much better when the examples are not randomly presented but organized in a meaningful order which illustrates gradually more concepts, and gradually more complex ones. Here, we formalize such training strategies in the context of machine learning, and call them 'curriculum learning'. In the context of recent research studying the difficulty of training in the presence of non-convex training criteria (for deep deterministic and stochastic neural networks), we explore curriculum learning in various set-ups. The experiments show that significant improvements in generalization can be achieved. We hypothesize that curriculum learning has both an effect on the speed of convergence of the training process to a minimum and, in the case of non-convex criteria, on the quality of the local minima obtained: curriculum learning can be seen as a particular form of continuation method (a general strategy for global optimization of non-convex functions).}
29 }
30
31 @MISC{Bengio+al-patent-2000,
32 author = {Bengio, Yoshua and Bottou, {L{\'{e}}on} and {LeCun}, Yann},
33 title = {Module for constructing trainable modular network in which each module outputs and inputs data structured as a graph},
34 year = {2000},
35 howpublished = {U.S. Patent 6,128,606, October 3}
36 }
37
38 @MISC{Bengio+al-patent-2001,
39 author = {Bengio, Yoshua and Bottou, {L{\'{e}}on} and Howard, Paul G.},
40 title = {Z-Coder : a fast adaptive binary arithmetic coder},
41 year = {2001},
42 howpublished = {U.S. Patent 6,188,334, February 13, 2001, along with patents 6,225,925, 6,281,817, and 6,476,740}
43 }
44
45 @MISC{Bengio+al-patent-94,
46 author = {Bengio, Yoshua and {LeCun}, Yann and Nohl, Craig and Burges, Chris},
47 title = {Visitor Registration System Using Automatic Handwriting Recognition},
48 year = {1994},
49 howpublished = {Patent submitted in the U.S.A. in October 1994, submission number 1-16-18-1}
50 }
51
52 @INCOLLECTION{Bengio+al-spectral-2006,
53 author = {Bengio, Yoshua and Delalleau, Olivier and Le Roux, Nicolas and Paiement, Jean-Fran{\c c}ois and Vincent, Pascal and Ouimet, Marie},
54 editor = {Guyon, Isabelle and Gunn, Steve and Nikravesh, Masoud and Zadeh, Lotfi},
55 title = {Spectral Dimensionality Reduction},
56 booktitle = {Feature Extraction, Foundations and Applications},
57 year = {2006},
58 publisher = {Springer},
59 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/eigenfn_chapter.pdf},
60 abstract = {In this chapter, we study and put under a common framework a number
61 of non-linear dimensionality reduction methods, such as Locally Linear Embedding,
62 Isomap, Laplacian eigenmaps and kernel {PCA}, which are based
63 on performing an eigen-decomposition (hence the name ``spectral''). That
64 framework also includes classical methods such as {PCA} and metric multidimensional
65 scaling ({MDS}). It also includes the data transformation step used
66 in spectral clustering. We show that in all of these cases the learning algorithm
67 estimates the principal eigenfunctions of an operator that depends on
68 the unknown data density and on a kernel that is not necessarily positive
69 semi-definite. This helps to generalize some of these algorithms so as to predict
70 an embedding for out-of-sample examples without having to retrain the
71 model. It also makes it more transparent what these algorithms are minimizing
72 on the empirical data and gives a corresponding notion of generalization
73 error.},
74 cat={B},topics={HighDimensional,Kernel,Unsupervised},
75 }
76
77 @INCOLLECTION{Bengio+al-ssl-2006,
78 author = {Bengio, Yoshua and Delalleau, Olivier and Le Roux, Nicolas},
79 editor = {Chapelle, Olivier and {Sch{\"{o}}lkopf}, Bernhard and Zien, Alexander},
80 title = {Label Propagation and Quadratic Criterion},
81 booktitle = {Semi-Supervised Learning},
82 year = {2006},
83 pages = {193--216},
84 publisher = {{MIT} Press},
85 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio_ssl.pdf},
86 abstract = {Various graph-based algorithms for semi-supervised learning have been proposed in
87 the recent literature. They rely on the idea of building a graph whose nodes are
88 data points (labeled and unlabeled) and edges represent similarities between points.
89 Known labels are used to propagate information through the graph in order to label
90 all nodes. In this chapter, we show how these different algorithms can be cast into
91 a common framework where one minimizes a quadratic cost criterion whose closed-form solution is found by solving a linear system of size n (total number of data
92 points). The cost criterion naturally leads to an extension of such algorithms to
93 the inductive setting, where one obtains test samples one at a time: the derived
94 induction formula can be evaluated in O(n) time, which is much more efficient
95 than solving again exactly the linear system (which in general costs $O(kn^2)$ time
96 for a sparse graph where each data point has k neighbors). We also use this inductive
97 formula to show that when the similarity between points satisfies a locality property,
98 then the algorithms are plagued by the curse of dimensionality, with respect to the
99 dimensionality of an underlying manifold.},
100 cat={B},topics={Unsupervised},
101 }
102
103 @TECHREPORT{Bengio+al-treecurse-2007,
104 author = {Bengio, Yoshua and Delalleau, Olivier and Simard, Clarence},
105 title = {Decision Trees do not Generalize to New Variations},
106 number = {1304},
107 year = {2007},
108 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
109 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio+al-tr1304.pdf}
110 }
111
112 @INPROCEEDINGS{Bengio+Bengio96,
113 author = {Bengio, Samy and Bengio, Yoshua},
114 editor = {Xu, L.},
115 title = {An {EM} Algorithm for Asynchronous Input/Output Hidden {M}arkov Models},
116 booktitle = {International Conference On Neural Information Processing},
117 year = {1996},
118 pages = {328--334},
119 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/iconip96.pdf},
120 abstract = {In learning tasks in which input sequences are mapped to output sequences, it is often the case that the input and output sequences are not synchronous. For example, in speech recognition, acoustic sequences are longer than phoneme sequences. Input/Output Hidden {Markov} Models have already been proposed to represent the distribution of an output sequence given an input sequence of the same length. We extend here this model to the case of asynchronous sequences and show an Expectation-Maximization algorithm for training such models.},
121 topics={Markov},cat={C},
122 }
123
124 @INCOLLECTION{Bengio+chapter2007,
125 author = {Bengio, Yoshua and {LeCun}, Yann},
126 editor = {Bottou, {L{\'{e}}on} and Chapelle, Olivier and DeCoste, D. and Weston, J.},
127 title = {Scaling Learning Algorithms towards {AI}},
128 booktitle = {Large Scale Kernel Machines},
129 year = {2007},
130 publisher = {MIT Press},
131 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio+lecun_chapter2007.pdf},
132 abstract = {One long-term goal of machine learning research is to produce methods that
133 are applicable to highly complex tasks, such as perception (vision, audition), reasoning,
134 intelligent control, and other artificially intelligent behaviors. We argue
135 that in order to progress toward this goal, the Machine Learning community must
136 endeavor to discover algorithms that can learn highly complex functions, with minimal
137 need for prior knowledge, and with minimal human intervention. We present
138 mathematical and empirical evidence suggesting that many popular approaches
139 to non-parametric learning, particularly kernel methods, are fundamentally limited
140 in their ability to learn complex high-dimensional functions. Our analysis
141 focuses on two problems. First, kernel machines are shallow architectures, in
142 which one large layer of simple template matchers is followed by a single layer
143 of trainable coefficients. We argue that shallow architectures can be very inefficient
144 in terms of required number of computational elements and examples. Second,
145 we analyze a limitation of kernel machines with a local kernel, linked to the
146 curse of dimensionality, that applies to supervised, unsupervised (manifold learning)
147 and semi-supervised kernel machines. Using empirical results on invariant
148 image recognition tasks, kernel methods are compared with deep architectures, in
149 which lower-level features or concepts are progressively combined into more abstract
150 and higher-level representations. We argue that deep architectures have the
151 potential to generalize in non-local ways, i.e., beyond immediate neighbors, and
152 that this is crucial in order to make progress on the kind of complex tasks required
153 for artificial intelligence.},
154 cat={B},topics={HighDimensional},
155 }
156
157 @ARTICLE{Bengio+Delalleau-2009,
158 author = {Bengio, Yoshua and Delalleau, Olivier},
159 title = {Justifying and Generalizing Contrastive Divergence},
160 journal = {Neural Computation},
161 volume = {21},
162 number = {6},
163 year = {2009},
164 pages = {1601--1621},
165 abstract = {We study an expansion of the log-likelihood in undirected graphical models such as the Restricted {Boltzmann} Machine (RBM), where each term in the expansion is associated with a sample in a Gibbs chain alternating between two random variables (the visible vector and the hidden vector, in RBMs). We are particularly interested in estimators of the gradient of the log-likelihood obtained through this expansion. We show that its residual term converges to zero, justifying the use of a truncation, i.e. running only a short Gibbs chain, which is the main idea behind the Contrastive Divergence (CD) estimator of the log-likelihood gradient. By truncating even more, we obtain a stochastic reconstruction error, related through a mean-field approximation to the reconstruction error often used to train autoassociators and stacked auto-associators. The derivation is not specific to the particular parametric forms used in RBMs, and only requires convergence of the Gibbs chain. We present theoretical and empirical evidence linking the number of Gibbs steps $k$ and the magnitude of the RBM parameters to the bias in the CD estimator. These experiments also suggest that the sign of the CD estimator is correct most of the time, even when the bias is large, so that CD-$k$ is a good descent direction even for small $k$.}
166 }
167
168 @TECHREPORT{Bengio+Delalleau-TR2007,
169 author = {Bengio, Yoshua and Delalleau, Olivier},
170 keywords = {Contrastive Divergence, Restricted {Boltzmann} Machine},
171 title = {Justifying and Generalizing Contrastive Divergence},
172 number = {1311},
173 year = {2007},
174 institution = {D{\'{e}}partement d'Informatique et Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
175 abstract = {We study an expansion of the log-likelihood in undirected graphical models such as the Restricted {Boltzmann} Machine (RBM), where each term in the expansion is associated with a sample in a Gibbs chain alternating between two random variables (the visible vector and the hidden vector, in RBMs). We are particularly interested in estimators of the gradient of the log-likelihood obtained through this expansion. We show that its terms converge to zero, justifying the use of a truncation, i.e. running only a short Gibbs chain, which is the main idea behind the Contrastive Divergence approximation of the log-likelihood gradient. By truncating even more, we obtain a stochastic reconstruction error, related through a mean-field approximation to the reconstruction error often used to train autoassociators and stacked auto-associators. The derivation is not specific to the particular parametric forms used in RBMs, and only requires convergence of the Gibbs chain.}
176 }
177
178 @INPROCEEDINGS{Bengio+DeMori88,
179 author = {Bengio, Yoshua and De Mori, Renato},
180 title = {Use of neural networks for the recognition of place of articulation},
181 booktitle = {International Conference on Acoustics, Speech and Signal Processing},
182 year = {1988},
183 pages = {103--106},
184 topics={Speech},cat={C},
185 }
186
187 @INPROCEEDINGS{Bengio+DeMori89,
188 author = {Bengio, Yoshua and Cardin, Regis and Cosi, Piero and De Mori, Renato},
189 title = {Speech coding with multi-layer networks},
190 booktitle = {International Conference on Acoustics, Speech and Signal Processing},
191 year = {1989},
192 pages = {164--167},
193 topics={Speech},cat={C},
194 }
195
196 @INCOLLECTION{Bengio+DeMori90a,
197 author = {Bengio, Yoshua and De Mori, Renato},
198 editor = {Sethi, I. K. and Jain, A. K.},
199 title = {Connectionist models and their application to automatic speech recognition},
200 booktitle = {Artificial Neural Networks and Statistical Pattern Recognition: Old and New Connections},
201 year = {1990},
202 pages = {175--192},
203 publisher = {Elsevier, Machine Intelligence and Pattern Recognition Series},
204 topics={Speech},cat={B},
205 }
206
207 @ARTICLE{Bengio+Frasconi-jair95,
208 author = {Bengio, Yoshua and Frasconi, Paolo},
209 title = {Diffusion of Context and Credit Information in {M}arkovian Models},
210 journal = {Journal of Artificial Intelligence Research},
211 volume = {3},
212 year = {1995},
213 pages = {249--270},
214 abstract = {This paper studies the problem of ergodicity of transition probability matrices in {Markovian} models, such as hidden {Markov} models ({HMM}s), and how it makes very difficult the task of learning to represent long-term context for sequential data. This phenomenon hurts the forward propagation of long-term context information, as well as learning a hidden state representation to represent long-term context, which depends on propagating credit information backwards in time. Using results from {Markov} chain theory, we show that this problem of diffusion of context and credit is reduced when the transition probabilities approach 0 or 1, i.e., the transition probability matrices are sparse and the model essentially deterministic. The results found in this paper apply to learning approaches based on continuous optimization, such as gradient descent and the Baum-Welch algorithm.},
215 topics={Markov,LongTerm},cat={J},
216 }
217
218 @INPROCEEDINGS{Bengio+Frasconi-nips7-diffuse,
219 author = {Bengio, Yoshua and Frasconi, Paolo},
220 title = {Diffusion of Credit in {M}arkovian Models},
221 year = {1995},
222 pages = {553--560},
223 crossref = {NIPS7-shorter},
224 abstract = {This paper studies the problem of diffusion in {Markovian} models, such as hidden {Markov} models ({HMM}s) and how it makes very difficult the task of learning of long-term dependencies in sequences. Using results from {Markov} chain theory, we show that the problem of diffusion is reduced if the transition probabilities approach 0 or 1. Under this condition, standard {HMM}s have very limited modeling capabilities, but input/output {HMM}s can still perform interesting computations.},
225 topics={Markov},cat={C},
226 }
227
228 @INPROCEEDINGS{Bengio+Frasconi-nips7-iohmms,
229 author = {Bengio, Yoshua and Frasconi, Paolo},
230 title = {An Input/Output {HMM} Architecture},
231 year = {1995},
232 pages = {427--434},
233 crossref = {NIPS7-shorter},
234 abstract = {We introduce a recurrent architecture having a modular structure and we formulate a training procedure based on the {EM} algorithm. The resulting model has similarities to hidden {Markov} models, but supports recurrent networks processing style and allows to exploit the supervised learning paradigm while using maximum likelihood estimation.},
235 topics={Markov},cat={C},
236 }
237
238 @INPROCEEDINGS{Bengio+Frasconi-nips94,
239 author = {Bengio, Yoshua and Frasconi, Paolo},
240 title = {Credit Assignment through Time: Alternatives to Backpropagation},
241 year = {1994},
242 pages = {75--82},
243 crossref = {NIPS6-shorter},
244 abstract = {Learning to recognize or predict sequences using long-term context has many applications. However, practical and theoretical problems are found in training recurrent neural networks to perform tasks in which input/output dependencies span long intervals. Starting from a mathematical analysis of the problem, we consider and compare alternative algorithms and architectures on tasks for which the span of the input/output dependencies can be controlled. Results on the new algorithms show performance qualitatively superior to that obtained with backpropagation.},
245 topics={LongTerm},cat={C},
246 }
247
248 @ARTICLE{Bengio+Pouliot90,
249 author = {Bengio, Yoshua and Pouliot, Yannick},
250 title = {Efficient recognition of immunoglobulin domains from amino-acid sequences using a neural network},
251 journal = {Computer Applications in the Biosciences},
252 volume = {6},
253 number = {2},
254 year = {1990},
255 pages = {319--324},
256 topics={Bioinformatic,PriorKnowledge},cat={J},
257 }
258
259 @INPROCEEDINGS{Bengio+Senecal-2003,
260 author = {Bengio, Yoshua and S{\'{e}}n{\'{e}}cal, Jean-S{\'{e}}bastien},
261 title = {Quick Training of Probabilistic Neural Nets by Importance Sampling},
262 booktitle = {Proceedings of the conference on Artificial Intelligence and Statistics (AISTATS)},
263 year = {2003},
264 abstract = {Our previous work on statistical language modeling introduced the use of probabilistic feedforward neural networks to help dealing with the curse of dimensionality. Training this model by maximum likelihood however requires for each example to perform as many network passes as there are words in the vocabulary. Inspired by the contrastive divergence model, we propose and evaluate sampling-based methods which require network passes only for the observed ``positive example'' and a few sampled negative example words. A very significant speed-up is obtained with an adaptive importance sampling.}
265 }
266
267 @ARTICLE{Bengio+Senecal-2008,
268 author = {Bengio, Yoshua and S{\'{e}}n{\'{e}}cal, Jean-S{\'{e}}bastien},
269 keywords = {Energy-based models, fast training, importance sampling, language modeling, Monte Carlo methods, probabilistic neural networks},
270 title = {Adaptive Importance Sampling to Accelerate Training of a Neural Probabilistic Language Model},
271 journal = {IEEE Transactions on Neural Networks},
272 volume = {19},
273 number = {4},
274 year = {2008},
275 pages = {713--722},
276 abstract = {Previous work on statistical language modeling has shown that it is possible to train a feedforward neural network to approximate probabilities over sequences of words, resulting in significant error reduction when compared to standard baseline models based on n-grams. However, training the neural network model with the maximum-likelihood criterion requires computations proportional to the number of words in the vocabulary. In this paper, we introduce adaptive importance sampling as a way to accelerate training of the model. The idea is to use an adaptive n-gram model to track the conditional distributions produced by the neural network. We show that a very significant speedup can be obtained on standard problems.}
277 }
278
279 @INCOLLECTION{Bengio-2007,
280 author = {Bengio, Yoshua},
281 editor = {Cisek, Paul and Kalaska, John and Drew, Trevor},
282 title = {On the Challenge of Learning Complex Functions},
283 booktitle = {Computational Neuroscience: Theoretical Insights into Brain Function},
284 series = {Progress in Brain Research},
285 year = {2007},
286 publisher = {Elsevier},
287 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/PBR_chapter.pdf},
288 abstract = {A common goal of computational neuroscience and of artificial intelligence
289 research based on statistical learning algorithms is the discovery and
290 understanding of computational principles that could explain what we
291 consider adaptive intelligence, in animals as well as in machines. This
292 chapter focuses on what is required for the learning of complex behaviors. We
293 believe it involves the learning of highly varying functions, in a
294 mathematical sense. We bring forward two types of arguments which convey
295 the message that many currently popular machine learning approaches to
296 learning flexible functions have fundamental limitations that render them
297 inappropriate for learning highly varying functions. The first issue
298 concerns the representation of such functions with what we call shallow model
299 architectures. We discuss limitations of shallow architectures, such as
300 so-called kernel machines, boosting algorithms, and one-hidden-layer artificial neural
301 networks. The second issue is more focused and concerns kernel machines
302 with a local kernel (the type used most often in practice),
303 that act like a collection of template matching units. We present
304 mathematical results on such computational architectures showing that they
305 have a limitation similar to those already proved for older non-parametric
306 methods, and connected to the so-called curse of dimensionality. Though it has long
307 been believed that efficient learning in deep architectures is difficult,
308 recently proposed computational principles for learning in deep architectures
309 may offer a breakthrough.}
310 }
311
312 @ARTICLE{Bengio-2009,
313 author = {Bengio, Yoshua},
314 title = {Learning deep architectures for {AI}},
315 journal = {Foundations and Trends in Machine Learning},
316 volume = {2},
317 number = {1},
318 year = {2009},
319 pages = {1--127},
320 note = {Also published as a book. Now Publishers, 2009.},
321 abstract = {Theoretical results suggest that in order to learn the kind of
322 complicated functions that can represent high-level abstractions (e.g. in
323 vision, language, and other AI-level tasks), one may need {\em deep
324 architectures}. Deep architectures are composed of multiple levels of non-linear
325 operations, such as in neural nets with many hidden layers or in complicated
326 propositional formulae re-using many sub-formulae. Searching the
327 parameter space of deep architectures is a difficult task, but
328 learning algorithms such as those for Deep Belief Networks have recently been proposed
329 to tackle this problem with notable success, beating the state-of-the-art
330 in certain areas. This paper discusses the motivations and principles regarding
331 learning algorithms for deep architectures, in particular those exploiting as
332 building blocks unsupervised learning of single-layer models such as Restricted {Boltzmann} Machines,
333 used to construct deeper models such as Deep Belief Networks.}
334 }
335
336 @TECHREPORT{Bengio-96-TR,
337 author = {Bengio, Yoshua},
338 title = {Using a Financial Training Criterion Rather than a Prediction Criterion},
339 number = {1019},
340 year = {1996},
341 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
342 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengioy_TR1019.pdf},
343 abstract = {The application of this work is to decision taking with financial time-series, using learning algorithms. The traditional approach is to train a model using a prediction criterion, such as minimizing the squared error between predictions and actual values of a dependent variable, or maximizing the likelihood of a conditional model of the dependent variable. We find here with noisy time-series that better results can be obtained when the model is directly trained in order to optimize the financial criterion of interest. Experiments were performed on portfolio selection with 35 Canadian stocks.},
344 topics={Finance,Discriminant},cat={T},
345 }
346
347 @BOOK{bengio-book96,
348 author = {Bengio, Yoshua},
349 title = {Neural Networks for Speech and Sequence Recognition},
350 year = {1996},
351 publisher = {International Thomson Computer Press},
352 topics={Speech},cat={B},
353 }
354
355 @TECHREPORT{Bengio-convex-05,
356 author = {Bengio, Yoshua and Le Roux, Nicolas and Vincent, Pascal and Delalleau, Olivier and Marcotte, Patrice},
357 title = {Convex neural networks},
358 number = {1263},
359 year = {2005},
360 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
361 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TR1263.pdf},
362 abstract = {Convexity has recently received a lot of attention in the machine learning community, and the lack of convexity has been seen as a major disadvantage of many learning algorithms, such as multi-layer artificial neural networks. We show that training multi-layer neural networks in which the number of hidden units is learned can be viewed as a convex optimization problem. This problem involves an infinite number of variables, but can be solved by incrementally inserting a hidden unit at a time, each time finding a linear classifier that minimizes a weighted sum of errors.},
363 topics={Boosting},cat={T},
364 }
365
366 @ARTICLE{Bengio-decision-trees10,
367 author = {Bengio, Yoshua and Delalleau, Olivier and Simard, Clarence},
368 title = {Decision Trees do not Generalize to New Variations},
369 journal = {Computational Intelligence},
370 year = {2010},
371 note = {To appear}
372 }
373
374 @ARTICLE{bengio-demori89,
375 author = {Bengio, Yoshua and De Mori, Renato},
376 title = {Use of multilayer networks for the recognition of phonetic features and phonemes},
377 journal = {Computational Intelligence},
378 volume = {5},
379 year = {1989},
380 pages = {134--141},
381 topics={Speech},cat={J},
382 }
383
384 @ARTICLE{Bengio-eigen-NC2004,
385 author = {Bengio, Yoshua and Delalleau, Olivier and Le Roux, Nicolas and Paiement, Jean-Fran{\c c}ois and Vincent, Pascal and Ouimet, Marie},
386 title = {Learning eigenfunctions links spectral embedding and kernel {PCA}},
387 journal = {Neural Computation},
388 volume = {16},
389 number = {10},
390 year = {2004},
391 pages = {2197--2219},
392 abstract = {In this paper, we show a direct relation between spectral embedding methods and kernel {PCA}, and how both are special cases of a more general learning problem, that of learning the principal eigenfunctions of an operator defined from a kernel and the unknown data generating density. Whereas spectral embedding methods only provided coordinates for the training points, the analysis justifies a simple extension to out-of-sample examples (the Nystr{\"{o}}m formula) for Multi-Dimensional Scaling, spectral clustering, Laplacian eigenmaps, Locally Linear Embedding ({LLE}) and Isomap. The analysis provides, for all such spectral embedding methods, the definition of a loss function, whose empirical average is minimized by the traditional algorithms. The asymptotic expected value of that loss defines a generalization performance and clarifies what these algorithms are trying to learn. Experiments with {LLE}, Isomap, spectral clustering and {MDS} show that this out-of-sample embedding formula generalizes well, with a level of error comparable to the effect of small perturbations of the training set on the embedding.},
393 topics={HighDimensional,Kernel,Unsupervised},cat={J},
394 }
395
396 @INPROCEEDINGS{Bengio-Gingras-nips8,
397 author = {Bengio, Yoshua and Gingras, Fran{\c c}ois},
398 title = {Recurrent Neural Networks for Missing or Asynchronous Data},
399 year = {1996},
400 pages = {395--401},
401 crossref = {NIPS8-shorter},
402 abstract = {In this paper we propose recurrent neural networks with feedback into the input units for handling two types of data analysis problems. On the one hand, this scheme can be used for static data when some of the input variables are missing. On the other hand, it can also be used for sequential data, when some of the input variables are missing or are available at different frequencies. Unlike in the case of probabilistic models (e.g. Gaussian) of the missing variables, the network does not attempt to model the distribution of the missing variables given the observed variables. Instead it is a more discriminant approach that fills in the missing variables for the sole purpose of minimizing a learning criterion (e.g., to minimize an output error).},
403 topics={Finance,Missing},cat={C},
404 }
405
406 @ARTICLE{Bengio-Grandvalet-JMLR-04,
407 author = {Bengio, Yoshua and Grandvalet, Yves},
408 title = {No Unbiased Estimator of the Variance of K-Fold Cross-Validation},
409 volume = {5},
410 year = {2004},
411 pages = {1089--1105},
412 journal = {Journal of Machine Learning Research},
413 abstract = {Most machine learning researchers perform quantitative experiments to estimate generalization error and compare the performance of different algorithms (in particular, their proposed algorithm). In order to be able to draw statistically convincing conclusions, it is important to estimate the uncertainty of such estimates. This paper studies the very commonly used K-fold cross-validation estimator of generalization performance. The main theorem shows that there exists no universal (valid under all distributions) unbiased estimator of the variance of K-fold cross-validation. The analysis that accompanies this result is based on the eigen-decomposition of the covariance matrix of errors, which has only three different eigenvalues corresponding to three degrees of freedom of the matrix and three components of the total variance. This analysis helps to better understand the nature of the problem and how it can make naive estimators (that don't take into account the error correlations due to the overlap between training and test sets) grossly underestimate variance. This is confirmed by numerical experiments in which the three components of the variance are compared when the difficulty of the learning problem and the number of folds are varied.},
414 topics={Comparative},cat={J},
415 }
416
417 @TECHREPORT{bengio-hyper-TR99,
418 author = {Bengio, Yoshua},
419 title = {Continuous Optimization of Hyper-Parameters},
420 number = {1144},
421 year = {1999},
422 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
423 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/hyperTR.pdf},
424 abstract = {Many machine learning algorithms can be formulated as the minimization of a training criterion which involves (1) ``training errors'' on each training example and (2) some hyper-parameters, which are kept fixed during this minimization. When there is only a single hyper-parameter one can easily explore how its value affects a model selection criterion (that is not the same as the training criterion, and is used to select hyper-parameters). In this paper we present a methodology to select many hyper-parameters that is based on the computation of the gradient of a model selection criterion with respect to the hyper-parameters. We first consider the case of a training criterion that is quadratic in the parameters. In that case, the gradient of the selection criterion with respect to the hyper-parameters is efficiently computed by back-propagating through a Cholesky decomposition. In the more general case, we show that the implicit function theorem can be used to derive a formula for the hyper-parameter gradient, but this formula requires the computation of second derivatives of the training criterion.},
425 topics={ModelSelection},cat={T},
426 }
427
428 @INPROCEEDINGS{Bengio-icnn93,
429 author = {Bengio, Yoshua and Frasconi, Paolo and Simard, Patrice},
430 title = {The problem of learning long-term dependencies in recurrent networks},
431 booktitle = {IEEE International Conference on Neural Networks},
432 year = {1993},
433 pages = {1183--1195},
434 publisher = {IEEE Press},
435 note = {(invited paper)},
436 topics={LongTerm},cat={C},
437 }
438
439 @ARTICLE{Bengio-ijprai93,
440 author = {Bengio, Yoshua},
441 title = {A Connectionist Approach to Speech Recognition},
442 journal = {International Journal on Pattern Recognition and Artificial Intelligence},
443 volume = {7},
444 number = {4},
445 year = {1993},
446 pages = {647--668},
447 abstract = {The task discussed in this paper is that of learning to map input sequences to output sequences. In particular, problems of phoneme recognition in continuous speech are considered, but most of the discussed techniques could be applied to other tasks, such as the recognition of sequences of handwritten characters. The systems considered in this paper are based on connectionist models, or artificial neural networks, sometimes combined with statistical techniques for recognition of sequences of patterns, stressing the integration of prior knowledge and learning. Different architectures for sequence and speech recognition are reviewed, including recurrent networks as well as hybrid systems involving hidden {Markov} models.},
448 topics={PriorKnowledge,Speech},cat={J},
449 }
450
451 @TECHREPORT{Bengio-iohmms-TR99,
452 author = {Bengio, Yoshua and Lauzon, Vincent-Philippe and Ducharme, R{\'{e}}jean},
453 title = {Experiments on the Application of {IOHMM}s to Model Financial Returns Series},
454 number = {1146},
455 year = {1999},
456 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
457 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/iohmms-returnsTR.pdf},
458 abstract = {Input/Output Hidden {Markov} Models ({IOHMM}s) are conditional hidden {Markov} models in which the emission (and possibly the transition) probabilities can be conditioned on an input sequence. For example, these conditional distributions can be linear, logistic, or non-linear (using for example multi-layer neural networks). We compare the generalization performance of several models which are special cases of Input/Output Hidden {Markov} Models on financial time-series prediction tasks: an unconditional Gaussian, a conditional linear Gaussian, a mixture of Gaussians, a mixture of conditional linear Gaussians, a hidden {Markov} model, and various {IOHMM}s. The experiments are performed on modeling the returns of market and sector indices. Note that the unconditional Gaussian estimates the first moment with the historical average. The results show that, although for the first moment the historical average gives the best results, for the higher moments, the {IOHMM}s yielded significantly better performance, as measured by the out-of-sample likelihood.},
459 topics={Markov},cat={T},
460 }
461
462 @ARTICLE{bengio-lauzon-ducharme:2000,
463 author = {Bengio, Yoshua and Lauzon, Vincent-Philippe and Ducharme, R{\'{e}}jean},
464 title = {Experiments on the Application of {IOHMM}s to Model Financial Returns Series},
465 journal = {IEEE Transactions on Neural Networks},
466 volume = {12},
467 number = {1},
468 year = {2001},
469 pages = {113--123},
470 abstract = {Input/Output Hidden {Markov} Models ({IOHMM}s) are conditional hidden {Markov} models in which the emission (and possibly the transition) probabilities can be conditioned on an input sequence. For example, these conditional distributions can be linear, logistic, or non-linear (using for example multi-layer neural networks). We compare generalization performance of several models which are special cases of Input/Output Hidden {Markov} Models on financial time-series prediction tasks: an unconditional Gaussian, a conditional linear Gaussian, a mixture of Gaussians, a mixture of conditional linear Gaussians, a hidden {Markov} model, and various {IOHMM}s. The experiments compare these models on predicting the conditional density of returns of market sector indices. Note that the unconditional Gaussian estimates the first moment with the historical average. The results show that, although for the first moment the historical average gives the best results, for the higher moments, the {IOHMM}s yielded significantly better performance, as estimated by the out-of-sample likelihood.},
471 topics={Markov,Finance},cat={J},
472 }
473
474 @INPROCEEDINGS{bengio-lecun-94,
475 author = {Bengio, Yoshua and {LeCun}, Yann},
476 title = {Word normalization for on-line handwritten word recognition},
477 booktitle = {Proc. of the International Conference on Pattern Recognition},
478 volume = {II},
479 year = {1994},
480 pages = {409--413},
481 publisher = {IEEE},
482 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/icpr-norm.ps},
483 abstract = {We introduce a new approach to normalizing words written with an electronic stylus that applies to all styles of handwriting (upper case, lower case, printed, cursive, or mixed). A geometrical model of the word spatial structure is fitted to the pen trajectory using the {EM} algorithm. The fitting process maximizes the likelihood of the trajectory given the model and a set of priors on its parameters. The method was evaluated and integrated to a recognition system that combines neural networks and hidden {Markov} models.},
484 topics={PriorKnowledge,Speech},cat={C},
485 }
486
487 @TECHREPORT{Bengio-localfailure-TR-2005,
488 author = {Bengio, Yoshua and Delalleau, Olivier and Le Roux, Nicolas},
489 title = {The Curse of Dimensionality for Local Kernel Machines},
490 number = {1258},
491 year = {2005},
492 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
493 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/tr1258.pdf},
494 abstract = {We present a series of theoretical arguments supporting the claim that a large class of modern learning algorithms based on local kernels are sensitive to the curse of dimensionality. These include local manifold learning algorithms such as Isomap and {LLE}, support vector classifiers with Gaussian or other local kernels, and graph-based semisupervised learning algorithms using a local similarity function. These algorithms are shown to be local in the sense that crucial properties of the learned function at x depend mostly on the neighbors of x in the training set. This makes them sensitive to the curse of dimensionality, well studied for classical non-parametric statistical learning. There
495 is a large class of data distributions for which non-local solutions could be expressed compactly and potentially be learned with few examples, but which will require a large number of local bases and therefore a large number of training examples when using a local learning algorithm.},
496 topics={HighDimensional,Kernel,Unsupervised},cat={T},
497 }
498
499 @INPROCEEDINGS{Bengio-nips-2006,
500 author = {Bengio, Yoshua and Lamblin, Pascal and Popovici, Dan and Larochelle, Hugo},
501 title = {Greedy Layer-Wise Training of Deep Networks},
502 year = {2007},
503 pages = {153--160},
504 crossref = {NIPS19-shorter},
505 abstract = {Complexity theory of circuits strongly suggests that deep architectures can be
506 much more efficient (sometimes exponentially) than shallow architectures,
507 in terms of computational elements required to represent some functions.
508 Deep multi-layer neural networks have many levels of non-linearities
509 allowing them to compactly represent highly non-linear and
510 highly-varying functions. However, until recently it was not clear how
511 to train such deep networks, since gradient-based
512 optimization starting from random initialization appears to often get stuck
513 in poor solutions. Hinton et al. recently introduced
514 a greedy layer-wise unsupervised learning algorithm for Deep Belief
515 Networks (DBN), a generative model with many layers of hidden causal
516 variables. In the context of the above optimization problem,
517 we study this algorithm empirically and explore variants to
518 better understand its success and extend it to cases where the inputs are
519 continuous or where the structure of the input distribution is not
520 revealing enough about the variable to be predicted in a supervised task.
521 Our experiments also confirm the hypothesis that the greedy
522 layer-wise unsupervised training strategy mostly helps the
523 optimization, by initializing weights in a region near a
524 good local minimum, giving rise to internal distributed representations
525 that are high-level abstractions of the input, bringing better generalization.}
526 }
527
528 @INPROCEEDINGS{Bengio-nips10,
529 author = {Bengio, Yoshua and Bengio, Samy and Isabelle, Jean-Fran{\c c}ois and Singer, Yoram},
530 title = {Shared Context Probabilistic Transducers},
531 year = {1998},
532 crossref = {NIPS10-shorter},
533 abstract = {Recently, a model for supervised learning of probabilistic transducers represented by suffix trees was introduced. However, this algorithm tends to build very large trees, requiring very large amounts of computer memory. In this paper, we propose a new, more compact, transducer model in which one shares the parameters of distributions associated to contexts yielding similar conditional output distributions. We illustrate the advantages of the proposed algorithm with comparative experiments on inducing a noun phrase recognizer.},
534 topics={HighDimensional},cat={C},
535 }
536
537 @TECHREPORT{Bengio-NLMP-TR-2005,
538 author = {Bengio, Yoshua and Larochelle, Hugo},
539 title = {Non-Local Manifold Parzen Windows},
540 number = {1264},
541 year = {2005},
542 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
543 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/NLMP-techreport.pdf},
544 abstract = {In order to escape from the curse of dimensionality, we claim that one can learn non-local functions, in the sense that the value and shape of the learned function at x must be inferred using examples that may be far from x. With this objective, we present a non-local non-parametric density estimator. It builds upon previously proposed Gaussian mixture models with regularized covariance matrices to take into account the local shape of the manifold. It also builds upon recent work on non-local estimators of the tangent plane of a manifold, which are able to generalize in places with little training data, unlike traditional, local, non-parametric models.},
545 topics={HighDimensional,Kernel,Unsupervised},cat={T},
546 }
547
548 @INPROCEEDINGS{Bengio-nncm96,
549 author = {Bengio, Yoshua},
550 editor = {Weigend, A. S. and Abu-Mostafa, Y. S. and Refenes, A.-P. N.},
551 title = {Training A Neural Network with a Financial Criterion Rather than a Prediction Criterion},
552 booktitle = {Proceedings of the Fourth International Conference on Neural Networks in the Capital Markets ({NNCM}-96)},
553 year = {1997},
554 pages = {433--443},
555 publisher = {World Scientific},
556 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/nncm.pdf},
557 abstract = {A common approach to quantitative decision taking with financial time-series is to train a model using a prediction criterion (e.g., squared error). We find on a portfolio selection problem that better results can be obtained when the model is directly trained in order to optimize the financial criterion of interest, with a differentiable decision module.},
558 topics={Finance,PriorKnowledge,Discriminant},cat={C},
559 }
560
561 @TECHREPORT{Bengio-NonStat-Hyper-TR,
562 author = {Bengio, Yoshua and Dugas, Charles},
563 title = {Learning Simple Non-Stationarities with Hyper-Parameters},
564 number = {1145},
565 year = {1999},
566 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
567 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/nonstatTR.pdf},
568 abstract = {We consider sequential data that is sampled from an unknown process, so that the data are not necessarily i.i.d. Most approaches to machine learning assume that data points are i.i.d. Instead we consider a measure of generalization that does not make this assumption, and we consider in this context a recently proposed approach to optimizing hyper-parameters, based on the computation of the gradient of a model selection criterion with respect to hyper-parameters. Here we use hyper-parameters that control a function that gives different weights to different time steps in the historical data sequence. The approach is successfully applied to modeling the volatility of stock returns one month ahead. Comparative experiments with more traditional methods are presented.},
569 topics={ModelSelection,Finance},cat={T},
570 }
571
572 @ARTICLE{Bengio-scholarpedia-2007,
573 author = {Bengio, Yoshua},
574 title = {Neural net language models},
575 journal = {Scholarpedia},
576 volume = {3},
577 number = {1},
578 year = {2008},
579 pages = {3881},
580 abstract = {A language model is a function, or an algorithm for learning such a function, that captures the salient statistical characteristics of the distribution of sequences of words in a natural language, typically allowing one to make probabilistic predictions of the next word given preceding ones.
581
582 A neural network language model is a language model based on Neural Networks, exploiting their ability to learn distributed representations to reduce the impact of the curse of dimensionality.
583
584 In the context of learning algorithms, the curse of dimensionality refers to the need for huge numbers of training examples when learning highly complex functions. When the number of input variables increases, the number of required examples can grow exponentially. The curse of dimensionality arises when a huge number of different combinations of values of the input variables must be discriminated from each other, and the learning algorithm needs at least one example per relevant combination of values. In the context of language models, the problem comes from the huge number of possible sequences of words, e.g., with a sequence of 10 words taken from a vocabulary of 100,000 there are $10^{50}$ possible sequences...
585
586 A distributed representation of a symbol is a tuple (or vector) of features which characterize the meaning of the symbol, and are not mutually exclusive. If a human were to choose the features of a word, he might pick grammatical features like gender or plurality, as well as semantic features like ``animate'' or ``invisible''. With a neural network language model, one relies on the learning algorithm to discover these features, and the features are continuous-valued (making the optimization problem involved in learning much simpler).
587
588 The basic idea is to learn to associate each word in the dictionary with a continuous-valued vector representation. Each word corresponds to a point in a feature space. One can imagine that each dimension of that space corresponds to a semantic or grammatical characteristic of words. The hope is that functionally similar words get to be closer to each other in that space, at least along some directions. A sequence of words can thus be transformed into a sequence of these learned feature vectors. The neural network learns to map that sequence of feature vectors to a prediction of interest, such as the probability distribution over the next word in the sequence. What pushes the learned word features to correspond to a form of semantic and grammatical similarity is that when two words are functionally similar, they can be replaced by one another in the same context, helping the neural network to compactly represent a function that makes good predictions on the training set, the set of word sequences used to train the model.
589
590 The advantage of this distributed representation approach is that it allows the model to generalize well to sequences that are not in the set of training word sequences, but that are similar in terms of their features, i.e., their distributed representation. Because neural networks tend to map nearby inputs to nearby outputs, the predictions corresponding to word sequences with similar features are mapped to similar predictions. Because many different combinations of feature values are possible, a very large set of possible meanings can be represented compactly, allowing a model with a comparatively small number of parameters to fit a large training set.}
591 }
592
593 @TECHREPORT{Bengio-TR1312,
594 author = {Bengio, Yoshua},
595 title = {Learning deep architectures for {AI}},
596 number = {1312},
597 year = {2007},
598 institution = {Dept. IRO, Universit{\'{e}} de Montr{\'{e}}al},
599 note = {Preliminary version of journal article with the same title appearing in Foundations and Trends in Machine Learning (2009)},
600 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TR1312.pdf},
601 abstract = {Theoretical results strongly suggest that in order to learn the kind of
602 complicated functions that can represent high-level abstractions (e.g. in
603 vision, language, and other AI-level tasks), one may need deep
604 architectures. Deep architectures are composed of multiple levels of non-linear
605 operations, such as in neural nets with many hidden layers. Searching the
606 parameter space of deep architectures is a difficult optimization task, but
607 learning algorithms such as those for Deep Belief Networks have recently been proposed
608 to tackle this problem with notable success, beating the state-of-the-art
609 in certain areas. This paper discusses the motivations and principles regarding
610 learning algorithms for deep architectures and in particular for those based
611 on unsupervised learning such as Deep Belief Networks, using as building
612 blocks single-layer models such as Restricted {Boltzmann} Machines.}
613 }
614
615 @ARTICLE{Bengio-trnn94,
616 author = {Bengio, Yoshua and Simard, Patrice and Frasconi, Paolo},
617 title = {Learning Long-Term Dependencies with Gradient Descent is Difficult},
618 journal = {IEEE Transactions on Neural Networks},
619 volume = {5},
620 number = {2},
621 year = {1994},
622 pages = {157--166},
623 abstract = {Recurrent neural networks can be used to map input sequences to output sequences, such as for recognition, production or prediction problems. However, practical difficulties have been reported in training recurrent neural networks to perform tasks in which the temporal contingencies present in the input/output sequences span long intervals. We show why gradient based learning algorithms face an increasingly difficult problem as the duration of the dependencies to be captured increases. These results expose a trade-off between efficient learning by gradient descent and latching on information for long periods. Based on an understanding of this problem, alternatives to standard gradient descent are considered.},
624 optnote={(Special Issue on Recurrent Neural Networks)},topics={LongTerm},cat={J},
625 }
626
627 @INPROCEEDINGS{Bengio-wirn93,
628 author = {Bengio, Yoshua and Frasconi, Paolo and Gori, Marco and Soda, G.},
629 editor = {Caianello, E.},
630 title = {Recurrent Neural Networks for Adaptive Temporal Processing},
631 booktitle = {Proc. of the 6th Italian Workshop on Neural Networks, WIRN-93},
632 year = {1993},
633 pages = {1183--1195},
634 publisher = {World Scientific Publ.},
635 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/rnn_review93.ps},
636 topics={LongTerm},cat={C},
637 }
638
639 @ARTICLE{Bengio2000c,
640 author = {Bengio, Yoshua},
641 title = {Gradient-Based Optimization of Hyperparameters},
642 journal = {Neural Computation},
643 volume = {12},
644 number = {8},
645 year = {2000},
646 pages = {1889--1900},
647 abstract = {Many machine learning algorithms can be formulated as the minimization of a training criterion which involves a hyper-parameter. This hyper-parameter is usually chosen by trial and error with a model selection criterion. In this paper we present a methodology to optimize several hyper-parameters, based on the computation of the gradient of a model selection criterion with respect to the hyper-parameters. In the case of a quadratic training criterion, the gradient of the selection criterion with respect to the hyper-parameters is efficiently computed by back-propagating through a Cholesky decomposition. In the more general case, we show that the implicit function theorem can be used to derive a formula for the hyper-parameter gradient involving second derivatives of the training criterion.},
648 topics={ModelSelection},cat={J},
649 }
650
651 @ARTICLE{Bengio89a,
652 author = {Bengio, Yoshua and Cardin, Regis and De Mori, Renato and Merlo, Ettore},
653 title = {Programmable execution of multi-layered networks for automatic speech recognition},
654 journal = {Communications of the Association for Computing Machinery},
655 volume = {32},
656 number = {2},
657 year = {1989},
658 pages = {195--199},
659 topics={Speech},cat={J},
660 }
661
662 @INPROCEEDINGS{Bengio89c,
663 author = {Bengio, Yoshua and Cosi, Piero and Cardin, Regis and De Mori, Renato},
664 title = {Use of multi-layered networks for coding speech with phonetic features},
665 year = {1989},
666 pages = {224--231},
667 crossref = {NIPS1-shorter},
668 abstract = {Preliminary results on speaker-independent speech recognition are reported. A method that combines expertise on neural networks with expertise on speech recognition is used to build the recognition systems. For transient sounds, event-driven property extractors with variable resolution in the time and frequency domains are used. For sonorant speech, a model of the human auditory system is preferred to FFT as a front-end module.},
669 topics={Speech},cat={C},
670 }
671
672 @INPROCEEDINGS{Bengio89d,
673 author = {De Mori, Renato and Bengio, Yoshua and Cosi, Piero},
674 title = {On the generalization capability of multilayered networks in the extraction of speech properties},
675 booktitle = {Proceedings of the International Joint Conference on Artificial Intelligence},
676 year = {1989},
677 pages = {1531--1536},
678 publisher = {IEEE},
679 topics={Speech},cat={C},
680 }
681
682 @INPROCEEDINGS{Bengio90,
683 author = {Bengio, Yoshua and Cardin, Regis and De Mori, Renato},
684 title = {Speaker Independent Speech Recognition with Neural Networks and Speech Knowledge},
685 year = {1990},
686 pages = {218--225},
687 crossref = {NIPS2-shorter},
688 abstract = {We attempt to combine neural networks with knowledge from speech science to build a speaker independent speech recognition system. This knowledge is utilized in designing the preprocessing, input coding, output coding, output supervision and architectural constraints. To handle the temporal aspect of speech we combine delays, copies of activations of hidden and output units at the input level, and Back-Propagation for Sequences (BPS), a learning algorithm for networks with local self-loops. This strategy is demonstrated in several experiments, in particular a nasal discrimination task for which the application of a speech theory hypothesis dramatically improved generalization.},
689 topics={PriorKnowledge,Speech},cat={C},
690 }
691
692 @INCOLLECTION{Bengio90b,
693 author = {Bengio, Yoshua},
694 title = {Radial Basis Functions for speech recognition},
695 booktitle = {Speech Recognition and Understanding: Recent Advances, Trends and Applications},
696 year = {1990},
697 pages = {293--298},
698 publisher = {NATO Advanced Study Institute Series F: Computer and Systems Sciences},
699 topics={Kernel,Speech},cat={B},
700 }
701
702 @INCOLLECTION{Bengio90c,
703 author = {Bengio, Yoshua and De Mori, Renato},
704 editor = {{Fogelman Soulie}, F. and Herault, J.},
705 title = {Speech coding with multilayer networks},
706 booktitle = {Neurocomputing: Algorithms, Architectures and Applications},
707 year = {1990},
708 pages = {207--216},
709 publisher = {NATO Advanced Study Institute Series F: Computer and Systems Sciences},
710 topics={Speech},cat={B},
711 }
712
713 @INPROCEEDINGS{Bengio90e,
714 author = {Bengio, Yoshua and Pouliot, Yannick and Bengio, Samy and Agin, Patrick},
715 title = {A neural network to detect homologies in proteins},
716 year = {1990},
717 pages = {423--430},
718 crossref = {NIPS2-shorter},
719 abstract = {In order to detect the presence and location of immunoglobulin (Ig) domains from amino acid sequences we built a system based on a neural network with one hidden layer trained with back propagation. The program was designed to efficiently identify proteins exhibiting such domains, characterized by a few localized conserved regions and a low overall homology. When the National Biomedical Research Foundation (NBRF) NEW protein sequence database was scanned to evaluate the program's performance, we obtained very low rates of false negatives coupled with a moderate rate of false positives.},
720 topics={Bioinformatic,PriorKnowledge},cat={C},
721 }
722
723 @INPROCEEDINGS{Bengio90z,
724 author = {Bengio, Yoshua and De Mori, Renato and Gori, Marco},
725 editor = {Caianello, E.},
726 title = {Experiments on automatic speech recognition using {BPS}},
727 booktitle = {Parallel Architectures and Neural Networks},
728 year = {1990},
729 pages = {223--232},
730 publisher = {World Scientific Publ.},
731 topics={Speech},cat={C},
732 }
733
734 @INPROCEEDINGS{Bengio91a,
735 author = {Bengio, Yoshua and De Mori, Renato and Flammia, Giovanni and Kompe, Ralf},
736 title = {A comparative study of hybrid acoustic phonetic decoders based on artificial neural networks},
737 booktitle = {Proceedings of EuroSpeech'91},
738 year = {1991},
739 topics={PriorKnowledge,Speech},cat={C},
740 }
741
742 @INPROCEEDINGS{Bengio91b,
743 author = {Bengio, Yoshua and De Mori, Renato and Flammia, Giovanni and Kompe, Ralf},
744 title = {Global Optimization of a Neural Network - Hidden {M}arkov Model Hybrid},
745 booktitle = {Proceedings of EuroSpeech'91},
746 year = {1991},
747 topics={Markov},cat={C},
748 }
749
750 @INPROCEEDINGS{Bengio91z,
751 author = {Bengio, Yoshua and De Mori, Renato and Flammia, Giovanni and Kompe, Ralf},
752 title = {Phonetically motivated acoustic parameters for continuous speech recognition using artificial neural networks},
753 booktitle = {Proceedings of EuroSpeech'91},
754 year = {1991},
755 cat={C},
756 }
757
758 @ARTICLE{Bengio92b,
759 author = {Bengio, Yoshua and De Mori, Renato and Flammia, Giovanni and Kompe, Ralf},
760 title = {Phonetically motivated acoustic parameters for continuous speech recognition using artificial neural networks},
761 journal = {Speech Communication},
762 volume = {11},
763 number = {2--3},
764 year = {1992},
765 pages = {261--271},
766 note = {Special issue on neurospeech},
767 topics={PriorKnowledge,Speech},cat={J},
768 }
769
770 @INPROCEEDINGS{Bengio92c,
771 author = {Bengio, Yoshua and De Mori, Renato and Flammia, Giovanni and Kompe, Ralf},
772 title = {Neural Network - Gaussian Mixture Hybrid for Speech Recognition or Density Estimation},
773 year = {1992},
774 pages = {175--182},
775 crossref = {NIPS4-shorter},
776 abstract = {The subject of this paper is the integration of multi-layered Artificial Neural Networks ({ANN}) with probability density functions such as Gaussian mixtures found in continuous density hidden {Markov} Models ({HMM}). In the first part of this paper we present an {ANN}/{HMM} hybrid in which all the parameters of the system are simultaneously optimized with respect to a single criterion. In the second part of this paper, we study the relationship between the density of the inputs of the network and the density of the outputs of the networks. A few experiments are presented to explore how to perform density estimation with {ANN}s.},
777 topics={Speech},cat={C},
778 }
779
780 @INPROCEEDINGS{Bengio94d,
781 author = {Frasconi, Paolo and Bengio, Yoshua},
782 title = {An {EM} Approach to Grammatical Inference: Input/Output {HMMs}},
783 booktitle = {International Conference on Pattern Recognition (ICPR'94)},
784 year = {1994},
785 pages = {289--294},
786 topics={Markov,LongTerm},cat={C},
787 }
788
789 @ARTICLE{Bengio96,
790 author = {Bengio, Yoshua and Frasconi, Paolo},
791 title = {Input/{O}utput {HMM}s for Sequence Processing},
792 journal = {IEEE Transactions on Neural Networks},
793 volume = {7},
794 number = {5},
795 year = {1996},
796 pages = {1231--1249},
797 abstract = {We consider problems of sequence processing and propose a solution based on a discrete state model in order to represent past context. We introduce a recurrent connectionist architecture having a modular structure that associates a subnetwork to each state. The model has a statistical interpretation we call Input/Output Hidden {Markov} Model ({IOHMM}). It can be trained by the {EM} or {GEM} algorithms, considering state trajectories as missing data, which decouples temporal credit assignment and actual parameter estimation.
798 The model presents similarities to hidden {Markov} models ({HMM}s), but allows us to map input sequences to output sequences, using the same processing style as recurrent neural networks. {IOHMM}s are trained using a more discriminant learning paradigm than {HMM}s, while potentially taking advantage of the {EM} algorithm.
799 We demonstrate that {IOHMM}s are well suited for solving grammatical inference problems on a benchmark problem. Experimental results are presented for the seven Tomita grammars, showing that these adaptive models can attain excellent generalization.},
800 topics={Markov},cat={J},
801 }
802
803 @TECHREPORT{Bengio96-hmmsTR,
804 author = {Bengio, Yoshua},
805 title = {Markovian Models for Sequential Data},
806 number = {1049},
807 year = {1996},
808 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
809 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/hmmsTR.pdf},
810 abstract = {Hidden {Markov} Models ({HMM}s) are statistical models of sequential data that have been used successfully in many applications, especially for speech recognition. We first summarize the basics of {HMM}s, and then review several recent related learning algorithms and extensions of {HMM}s, including hybrids of {HMM}s with artificial neural networks, Input-Output {HMM}s, weighted transducers, variable-length {Markov} models and {Markov} switching state-space models. Finally, we discuss some of the challenges of future research in this area.},
811 topics={Markov},cat={T},
812 }
813
814 @ARTICLE{Bengio97,
815 author = {Bengio, Yoshua},
816 title = {Using a Financial Training Criterion Rather than a Prediction Criterion},
817 journal = {International Journal of Neural Systems},
818 volume = {8},
819 number = {4},
820 year = {1997},
821 pages = {433--443},
822 note = {Special issue on noisy time-series},
823 abstract = {The application of this work is to decision taking with financial time-series, using learning algorithms. The traditional approach is to train a model using a prediction criterion, such as minimizing the squared error between predictions and actual values of a dependent variable, or maximizing the likelihood of a conditional model of the dependent variable. We find here with noisy time-series that better results can be obtained when the model is directly trained in order to maximize the financial criterion of interest, here gains and losses (including those due to transactions) incurred during trading. Experiments were performed on portfolio selection with 35 Canadian stocks.},
824 topics={Finance,PriorKnowledge,Discriminant},cat={J},
825 }
826
827 @ARTICLE{Bengio99a,
828 author = {Bengio, Yoshua},
829 title = {Markovian Models for Sequential Data},
830 journal = {Neural Computing Surveys},
831 volume = {2},
832 year = {1999},
833 pages = {129--162},
834 abstract = {Hidden {Markov} Models ({HMM}s) are statistical models of sequential data that have been used successfully in many machine learning applications, especially for speech recognition. Furthermore, in the last few years, many new and promising probabilistic models related to {HMM}s have been proposed. We first summarize the basics of {HMM}s, and then review several recent related learning algorithms and extensions of {HMM}s, including in particular hybrids of {HMM}s with artificial neural networks, Input-Output {HMM}s (which are conditional {HMM}s using neural networks to compute probabilities), weighted transducers, variable-length {Markov} models and {Markov} switching state-space models. Finally, we discuss some of the challenges of future research in this very active area.},
835 topics={Markov},cat={J},
836 }
837
838 @ARTICLE{Bengio99b,
839 author = {Bengio, Samy and Bengio, Yoshua and Robert, Jacques and B{\'{e}}langer, Gilles},
840 title = {Stochastic Learning of Strategic Equilibria for Auctions},
841 journal = {Neural Computation},
842 volume = {11},
843 number = {5},
844 year = {1999},
845 pages = {1199--1209},
846 abstract = {This paper presents a new application of stochastic adaptive learning algorithms to the computation of strategic equilibria in auctions. The proposed approach addresses the problems of tracking a moving target and balancing exploration (of action space) versus exploitation (of better modeled regions of action space). Neural networks are used to represent a stochastic decision model for each bidder. Experiments confirm the correctness and usefulness of the approach.},
847 topics={Auction},cat={J},
848 }
849
850 @TECHREPORT{bengio:1990,
851 author = {Bengio, Yoshua},
852 title = {Learning a Synaptic Learning Rule},
853 number = {751},
854 year = {1990},
855 institution = {D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
856 topics={BioRules},cat={T},
857 }
858
859 @INPROCEEDINGS{bengio:1990:snowbird,
860 author = {Bengio, Yoshua and De Mori, Renato},
861 title = {Recurrent networks with Radial Basis Functions for speech recognition},
862 booktitle = {1990 Neural Networks for Computing Conference},
863 year = {1990},
864 topics={Speech},cat={C},
865 }
866
867 @INPROCEEDINGS{bengio:1991:ijcnn,
868 author = {Bengio, Yoshua and Bengio, Samy and Cloutier, Jocelyn},
869 title = {Learning a Synaptic Learning Rule},
870 booktitle = {Proceedings of the International Joint Conference on Neural Networks},
871 year = {1991},
872 pages = {II--A969},
873 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio_1991_ijcnn.ps},
874 abstract = {This paper presents an original approach to neural modeling based on the idea of searching, with learning methods, for a synaptic learning rule which is biologically plausible, and yields networks that are able to learn to perform difficult tasks. The proposed method of automatically finding the learning rule relies on the idea of considering the synaptic modification rule as a parametric function. This function has local inputs and is the same in many neurons. The parameters that define this function can be estimated with known learning methods. For this optimization, we give particular attention to gradient descent and genetic algorithms. In both cases, estimation of this function consists of a joint global optimization of (a) the synaptic modification function, and (b) the networks that are learning to perform some tasks. The proposed methodology can be used as a tool to explore the missing pieces of the puzzle of neural networks learning. Both network architecture, and the learning function can be designed within constraints derived from biological knowledge.},
875 addressfr={Seattle, USA},topics={BioRules},cat={C},
876 }
877
878 @INPROCEEDINGS{bengio:1991:nnc,
879 author = {Bengio, Yoshua and Bengio, Samy and Cloutier, Jocelyn},
880 title = {Learning Synaptic Learning Rules},
881 booktitle = {Neural Networks for Computing},
882 year = {1991},
883 addressfr={Snowbird, Utah, USA},topics={BioRules},cat={C},
884 }
885
886 @INPROCEEDINGS{bengio:1991:snowbird,
887 author = {Bengio, Yoshua and Bengio, Samy and Cloutier, Jocelyn},
888 title = {Learning a Synaptic Learning Rule},
889 booktitle = {1991 Neural Networks for Computing Conference},
890 year = {1991},
891 topics={BioRules},cat={C},
892 }
893
894 @INPROCEEDINGS{bengio:1992:nn,
895 author = {Bengio, Samy and Bengio, Yoshua and Cloutier, Jocelyn and Gecsei, Jan},
896 title = {Aspects th{\'{e}}oriques de l'optimisation d'une r{\`{e}}gle d'apprentissage},
897 booktitle = {Actes de la conf{\'{e}}rence Neuro-N{\^{\i}}mes 1992},
898 year = {1992},
899 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio_1992_nn.ps},
900 abstract = {Ayant expos{\'{e}} dans de pr{\'{e}}c{\'{e}}dentes publications (voir [Beng90, Beng92] notamment) l'id{\'{e}}e que l'on pouvait optimiser des r{\`{e}}gles d'apprentissage param{\'{e}}triques pour r{\'{e}}seaux de neurones, nous montrons dans cet article comment d{\'{e}}velopper, par la m{\'{e}}thode du Lagrangien, le gradient n{\'{e}}cessaire {\`{a}} l'optimisation d'une r{\`{e}}gle d'apprentissage par descente du gradient. Nous pr{\'{e}}sentons aussi les bases th{\'{e}}oriques qui permettent d'{\'{e}}tudier la g{\'{e}}n{\'{e}}ralisation {\`{a}} de nouvelles t{\^{a}}ches d'une r{\`{e}}gle d'apprentissage dont les param{\`{e}}tres ont {\'{e}}t{\'{e}} estim{\'{e}}s {\`{a}} partir d'un certain ensemble de t{\^{a}}ches. Enfin, nous exposons bri{\`{e}}vement les r{\'{e}}sultats d'une exp{\'{e}}rience consistant {\`{a}} trouver, par descente du gradient, une r{\`{e}}gle d'apprentissage pouvant r{\'{e}}soudre plusieurs t{\^{a}}ches bool{\'{e}}ennes lin{\'{e}}airement et non lin{\'{e}}airement s{\'{e}}parables.},
901 addressfr={N{\^{\i}}mes, France},topics={BioRules},cat={C},
902 }
903
904 @INPROCEEDINGS{bengio:1992:oban,
905 author = {Bengio, Samy and Bengio, Yoshua and Cloutier, Jocelyn and Gecsei, Jan},
906 title = {On the Optimization of a Synaptic Learning rule},
907 booktitle = {Conference on Optimality in Biological and Artificial Networks},
908 year = {1992},
909 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio_1992_oban.ps},
910 abstract = {This paper presents a new approach to neural modeling based on the idea of using an automated method to optimize the parameters of a synaptic learning rule. The synaptic modification rule is considered as a parametric function. This function has local inputs and is the same in many neurons. We can use standard optimization methods to select appropriate parameters for a given type of task. We also present a theoretical analysis permitting to study the generalization property of such parametric learning rules. By generalization, we mean the possibility for the learning rule to learn to solve new tasks. Experiments were performed on three types of problems: a biologically inspired circuit (for conditioning in Aplysia), Boolean functions (linearly separable as well as non linearly separable) and classification tasks. The neural network architecture as well as the form and initial parameter values of the synaptic learning function can be designed using a priori knowledge.},
911 addressfr={Dallas, USA},topics={BioRules},cat={C},
912 }
913
914 @INPROCEEDINGS{bengio:1992:snowbird,
915 author = {Bengio, Yoshua},
916 title = {Representations Based on Articulatory Dynamics for Speech Recognition},
917 booktitle = {1992 Neural Networks for Computing Conference},
918 year = {1992},
919 topics={PriorKnowledge,Speech},cat={C},
920 }
921
922 @INPROCEEDINGS{bengio:1993:icann,
923 author = {Bengio, Samy and Bengio, Yoshua and Cloutier, Jocelyn and Gecsei, Jan},
924 editor = {Gielen, S. and Kappen, B.},
925 title = {Generalization of a Parametric Learning Rule},
926 booktitle = {{ICANN} '93: Proceedings of the International Conference on Artificial Neural Networks},
927 year = {1993},
928 pages = {502},
929 publisher = {Springer-Verlag},
930 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio_1993_icann.ps},
931 abstract = {In previous work ([4,2,1]) we discussed the subject of parametric learning rules for neural networks. In this article, we present a theoretical basis permitting to study the generalization property of a learning rule whose parameters are estimated from a set of learning tasks. By generalization, we mean the possibility of using the learning rule to learn to solve new tasks. Finally, we describe simple experiments on two-dimensional categorization tasks and show how they corroborate the theoretical results.},
932 addressfr={Amsterdam, Pays-Bas},topics={BioRules},cat={C},
933 }
934
935 @INPROCEEDINGS{bengio:1993:snowbird,
936 author = {Bengio, Yoshua and Simard, Patrice and Frasconi, Paolo},
937 title = {The Problem of Learning Long-Term Dependencies in Recurrent Networks},
938 booktitle = {1993 Neural Networks for Computing Conference},
939 year = {1993},
940 topics={LongTerm},cat={C},
941 }
942
943 @TECHREPORT{bengio:1994,
944 author = {Bengio, Yoshua and Frasconi, Paolo},
945 title = {An {EM} Approach to Learning Sequential Behavior},
946 number = {DSI 11-94},
947 year = {1994},
948 institution = {Universita di Firenze, Dipartimento di Sistemi e Informatica},
949 topics={LongTerm},cat={T},
950 }
951
952 @INPROCEEDINGS{bengio:1994:acfas,
953 author = {Bengio, Samy and Bengio, Yoshua and Cloutier, Jocelyn and Gecsei, Jan},
954 title = {Optimisation d'une r{\`{e}}gle d'apprentissage pour r{\'{e}}seaux de neurones artificiels},
955 booktitle = {Actes du soixante-deuxi{\`{e}}me congr{\`{e}}s de l'Association Canadienne Fran{\c c}aise pour l'Avancement des Sciences, colloque sur l'apprentissage et les r{\'{e}}seaux de neurones artificiels},
956 year = {1994},
957 topics={BioRules},cat={C},
958 }
959
960 @INPROCEEDINGS{bengio:1994:snowbird,
961 author = {Bengio, Yoshua and Frasconi, Paolo},
962 title = {An {EM} Algorithm for Target Propagation},
963 booktitle = {1994 Neural Networks for Computing Conference},
964 year = {1994},
965 topics={LongTerm},cat={C},
966 }
967
968 @INPROCEEDINGS{bengio:1994:wcci,
969 author = {Bengio, Samy and Bengio, Yoshua and Cloutier, Jocelyn},
970 title = {Use of Genetic Programming for the Search of a New Learning Rule for Neural Networks},
971 booktitle = {Proceedings of the First Conference on Evolutionary Computation, {IEEE} World Congress on Computational Intelligence},
972 year = {1994},
973 pages = {324--327},
974 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio_1994_wcci.ps},
975 abstract = {In previous work ([1,2,3]), we explained how to use standard optimization methods such as simulated annealing, gradient descent and genetic algorithms to optimize a parametric function which could be used as a learning rule for neural networks. To use these methods, we had to choose a fixed number of parameters and a rigid form for the learning rule. In this article, we propose to use genetic programming to find not only the values of rule parameters but also the optimal number of parameters and the form of the rule. Experiments on classification tasks suggest genetic programming finds better learning rules than other optimization methods. Furthermore, the best rule found with genetic programming outperformed the well-known backpropagation algorithm for a given set of tasks.},
976 topics={BioRules},cat={C},
977 }
978
979 @INPROCEEDINGS{bengio:1994b:acfas,
980 author = {Bengio, Yoshua and Frasconi, Paolo},
981 title = {R{\'{e}}seaux de neurones {M}arkoviens pour l'inf{\'{e}}rence grammaticale},
982 booktitle = {Actes du soixante-deuxi{\`{e}}me congr{\`{e}}s de l'Association Canadienne Fran{\c c}aise pour l'Avancement des Sciences, colloque sur l'apprentissage et les r{\'{e}}seaux de neurones artificiels},
983 year = {1994},
984 topics={Markov,Language},cat={C},
985 }
986
987 @ARTICLE{bengio:1995:npl,
988 author = {Bengio, Samy and Bengio, Yoshua and Cloutier, Jocelyn},
989 title = {On the Search for New Learning Rules for {ANN}s},
990 journal = {Neural Processing Letters},
991 volume = {2},
992 number = {4},
993 year = {1995},
994 pages = {26--30},
995 abstract = {In this paper, we present a framework where a learning rule can be optimized within a parametric learning rule space. We define what we call parametric learning rules and present a theoretical study of their generalization properties when estimated from a set of learning tasks and tested over another set of tasks. We corroborate the results of this study with practical experiments.},
996 topics={BioRules},cat={J},
997 }
998
999 @INCOLLECTION{bengio:1995:oban,
1000 author = {Bengio, Samy and Bengio, Yoshua and Cloutier, Jocelyn and Gecsei, Jan},
1001 editor = {Levine, D. S. and Elsberry, W. R.},
1002 title = {{O}n the Optimization of a Synaptic Learning Rule},
1003 booktitle = {Optimality in Biological and Artificial Networks},
1004 year = {1995},
1005 publisher = {Lawrence Erlbaum},
1006 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio_1995_oban.pdf},
1007 abstract = {This paper presents a new approach to neural modeling based on the idea of using an automated method to optimize the parameters of a synaptic learning rule. The synaptic modification rule is considered as a parametric function. This function has local inputs and is the same in many neurons. We can use standard optimization methods to select appropriate parameters for a given type of task. We also present a theoretical analysis permitting to study the generalization property of such parametric learning rules. By generalization, we mean the possibility for the learning rule to learn to solve new tasks. Experiments were performed on three types of problems: a biologically inspired circuit (for conditioning in Aplysia), Boolean functions (linearly separable as well as non linearly separable) and classification tasks. The neural network architecture as well as the form and initial parameter values of the synaptic learning function can be designed using a priori knowledge.},
1008 topics={BioRules},cat={B},
1009 }
1010
1011 @TECHREPORT{bengio:1996:udem,
1012 author = {Bengio, Yoshua and Bengio, Samy},
1013 title = {Training Asynchronous Input/Output Hidden {M}arkov Models},
1014 number = {1013},
1015 year = {1996},
1016 institution = {D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
1017 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio_1996_udem.ps},
1018 topics={Markov},cat={T},
1019 }
1020
1021 @INPROCEEDINGS{bengio:1997:snowbird,
1022 author = {Bengio, Yoshua and Bengio, Samy and Singer, Yoram and Isabelle, Jean-Fran{\c c}ois},
1023 title = {On the Clusterization of Probabilistic Transducers},
1024 booktitle = {1997 Neural Networks for Computing Conference},
1025 year = {1997},
1026 topics={HighDimensional},cat={C},
1027 }
1028
1029 @INPROCEEDINGS{bengio:1998:snowbird,
1030 author = {Bengio, Samy and Bengio, Yoshua and Robert, Jacques and B{\'{e}}langer, Gilles},
1031 title = {Stochastic Learning of Strategic Equilibria for Auctions},
1032 booktitle = {Learning Conference},
1033 year = {1998},
1034 topics={Auction},cat={C},
1035 }
1036
1037 @TECHREPORT{bengio:1998:udem,
1038 author = {Bengio, Samy and Bengio, Yoshua and Robert, Jacques and B{\'{e}}langer, Gilles},
1039 title = {Stochastic Learning of Strategic Equilibria for Auctions},
1040 number = {1119},
1041 year = {1998},
1042 institution = {D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
1043 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio_1998_udem.pdf},
1044 abstract = {This paper presents a new application of stochastic adaptive learning algorithms to the computation of strategic equilibria in auctions. The proposed approach addresses the problems of tracking a moving target and balancing exploration (of action space) versus exploitation (of better modeled regions of action space). Neural networks are used to represent a stochastic decision model for each bidder. Experiments confirm the correctness and usefulness of the approach.},
1045 topics={Auction},cat={T},
1046 }
1047
1048 @INPROCEEDINGS{bengio:1999:snowbird,
1049 author = {Bengio, Yoshua and Latendresse, Simon and Dugas, Charles},
1050 title = {Gradient-Based Learning of Hyper-Parameters},
1051 booktitle = {Learning Conference},
1052 year = {1999},
1053 topics={ModelSelection},cat={C},
1054 }
1055
1056 @INPROCEEDINGS{bengio:1999:titration,
1057 author = {Bengio, Yoshua and Brault, J-J. and Major, Fran{\c c}ois and Neal, R. and Pigeon, Steven},
1058 title = {Learning Algorithms for Sorting Compounds from Titration Curves},
1059 booktitle = {Symposium on New Perspectives for Computer-Aided Drug Design},
1060 year = {1999},
1061 topics={Speech},cat={C},
1062 }
1063
1064 @ARTICLE{bengio:2000:ieeetrnn,
1065 author = {Bengio, Samy and Bengio, Yoshua},
1066 title = {Taking on the Curse of Dimensionality in Joint Distributions Using Neural Networks},
1067 journal = {IEEE Transactions on Neural Networks, special issue on data mining and knowledge discovery},
1068 volume = {11},
1069 number = {3},
1070 year = {2000},
1071 pages = {550--557},
1072 abstract = {The curse of dimensionality is severe when modeling high-dimensional discrete data: the number of possible combinations of the variables explodes exponentially. In this paper we propose a new architecture for modeling high-dimensional data that requires resources (parameters and computations) that grow at most as the square of the number of variables, using a multi-layer neural network to represent the joint distribution of the variables as the product of conditional distributions. The neural network can be interpreted as a graphical model without hidden random variables, but in which the conditional distributions are tied through the hidden units. The connectivity of the neural network can be pruned by using dependency tests between the variables (thus reducing significantly the number of parameters). Experiments on modeling the distribution of several discrete data sets show statistically significant improvements over other methods such as naive Bayes and comparable Bayesian networks, and show that significant improvements can be obtained by pruning the network.},
1073 topics={HighDimensional,Unsupervised,Mining},cat={J},
1074 }
1075
1076 @INPROCEEDINGS{bengio:2000:nips,
1077 author = {Bengio, Yoshua and Bengio, Samy},
1078 title = {Modeling High-Dimensional Discrete Data with Multi-Layer Neural Networks},
1079 year = {2000},
1080 pages = {400--406},
1081 crossref = {NIPS12-shorter},
1082 abstract = {The curse of dimensionality is severe when modeling high-dimensional discrete data: the number of possible combinations of the variables explodes exponentially. In this paper we propose a new architecture for modeling high-dimensional data that requires resources (parameters and computations) that grow only at most as the square of the number of variables, using a multi-layer neural network to represent the joint distribution of the variables as the product of conditional distributions. The neural network can be interpreted as a graphical model without hidden random variables, but in which the conditional distributions are tied through the hidden units. The connectivity of the neural network can be pruned by using dependency tests between the variables. Experiments on modeling the distribution of several discrete data sets show statistically significant improvements over other methods such as naive Bayes and comparable Bayesian networks, and show that significant improvements can be obtained by pruning the network.},
1083 topics={HighDimensional,Unsupervised},cat={C},
1084 }
1085
1086 @ARTICLE{bengio:2003,
1087 author = {Bengio, Yoshua and Ducharme, R{\'{e}}jean and Vincent, Pascal and Jauvin, Christian},
1088 title = {A Neural Probabilistic Language Model},
1089 volume = {3},
1090 year = {2003},
1091 pages = {1137--1155},
1092 journal = {Journal of Machine Learning Research},
1093 abstract = {A goal of statistical language modeling is to learn the joint probability function of sequences of words in a language. This is intrinsically difficult because of the curse of dimensionality: a word sequence on which the model will be tested is likely to be different from all the word sequences seen during training. Traditional but very successful approaches based on n-grams obtain generalization by concatenating very short overlapping sequences seen in the training set. We propose to fight the curse of dimensionality by learning a distributed representation for words which allows each training sentence to inform the model about an exponential number of semantically neighboring sentences. The model learns simultaneously (1) a distributed representation for each word along with (2) the probability function for word sequences, expressed in terms of these representations. Generalization is obtained because a sequence of words that has never been seen before gets high probability if it is made of words that are similar (in the sense of having a nearby representation) to words forming an already seen sentence. Training such large models (with millions of parameters) within a reasonable time is itself a significant challenge. We report on experiments using neural networks for the probability function, showing on two text corpora that the proposed approach significantly improves on state-of-the-art n-gram models, and that the proposed approach allows to take advantage of longer contexts.},
1094 topics={Markov,Unsupervised,Language},cat={J},
1095 }
1096
1097 @TECHREPORT{bengio:socs-1990,
1098 author = {Bengio, Yoshua and De Mori, Renato and Flammia, Giovanni and Kompe, Ralf},
1099 title = {Global Optimization of a Neural Network - Hidden {M}arkov Model Hybrid},
1100 number = {TR-SOCS-90.22},
1101 year = {1990},
1102 institution = {School of Computer Science, McGill University},
1103 topics={Markov},cat={T},
1104 }
1105
1106 @INPROCEEDINGS{bengioc:1994:acfas,
1107 author = {Bengio, Yoshua and {LeCun}, Yann},
1108 title = {Reconnaissance de mots manuscrits avec r{\'{e}}seaux de neurones et mod{\`{e}}les de {M}arkov},
1109 booktitle = {Actes du soixante-deuxi{\`{e}}me congr{\`{e}}s de l'Association Canadienne Fran{\c c}aise pour l'Avancement des Sciences, colloque sur l'apprentissage et les r{\'{e}}seaux de neurones artificiels},
1110 year = {1994},
1111 topics={Markov,Speech},cat={C},
1112 }
1113
1114 @TECHREPORT{Bengio_Bottou92,
1115 author = {Bengio, Yoshua and Bottou, {L{\'{e}}on}},
1116 title = {A New Approach to Estimating Probability Density Functions with Artificial Neural Networks},
1117 number = {TR-92.02},
1118 year = {1992},
1119 institution = {Massachusetts Institute of Technology, Dept. Brain and Cognitive Sciences},
1120 topics={HighDimensional},cat={T},
1121 }
1122
1123 @INCOLLECTION{bengio_extension_nips_2003,
1124 author = {Bengio, Yoshua and Paiement, Jean-Fran{\c c}ois and Vincent, Pascal and Delalleau, Olivier and Le Roux, Nicolas and Ouimet, Marie},
1125 keywords = {dimensionality reduction, eigenfunctions learning, Isomap, kernel {PCA}, locally linear embedding, Nystrom formula, spectral methods},
1126 title = {Out-of-Sample Extensions for {LLE}, Isomap, {MDS}, Eigenmaps, and Spectral Clustering},
1127 year = {2004},
1128 crossref = {NIPS16-shorter},
1129 abstract = {Several unsupervised learning algorithms based on an eigendecomposition provide either an embedding or a clustering only for given training points, with no straightforward extension for out-of-sample examples short of recomputing eigenvectors. This paper provides a unified framework for extending Local Linear Embedding ({LLE}), Isomap, Laplacian Eigenmaps, Multi-Dimensional Scaling (for dimensionality reduction) as well as for Spectral Clustering. This framework is based on seeing these algorithms as learning eigenfunctions of a data-dependent kernel. Numerical experiments show that the generalizations performed have a level of error comparable to the variability of the embedding algorithms due to the choice of training data.},
1130 topics={HighDimensional,Kernel,Unsupervised},cat={C},
1131 }
1132
1133 @ARTICLE{Bengio_Gingras98a,
1134 author = {Bengio, Yoshua and Gingras, Fran{\c c}ois and Goulard, Bernard and Lina, Jean-Marc},
1135 title = {Gaussian Mixture Densities for Classification of Nuclear Power Plant Data},
1136 journal = {Computers and Artificial Intelligence},
1137 volume = {17},
1138 number = {2-3},
1139 year = {1998},
1140 pages = {189--209},
1141 abstract = {In this paper we are concerned with the application of learning algorithms to the classification of reactor states in nuclear plants. Two aspects must be considered, (1) some types of events (e.g., abnormal or rare) will not appear in the data set, but the system should be able to detect them, (2) not only classification of signals but also their interpretation are important for nuclear plant monitoring. We address both issues with a mixture of mixtures of Gaussians in which some parameters are shared to reflect the similar signals observed in different states of the reactor. An {EM} algorithm for these shared Gaussian mixtures is presented. Experimental results on nuclear plant data demonstrate the advantages of the proposed approach with respect to the above two points.},
1142 topics={Mining},cat={J},
1143 }
1144
1145 @ARTICLE{Bengio_Gingras98b,
1146 author = {Gingras, Fran{\c c}ois and Bengio, Yoshua},
1147 title = {Handling Asynchronous or Missing Financial Data with Recurrent Networks},
1148 journal = {International Journal of Computational Intelligence and Organizations},
1149 volume = {1},
1150 number = {3},
1151 year = {1998},
1152 pages = {154--163},
1153 abstract = {An important issue with many sequential data analysis problems, such as those encountered in financial data sets, is that different variables are known at different frequencies, at different times (asynchronicity), or are sometimes missing. To address this issue we propose to use recurrent networks with feedback into the input units, based on two fundamental ideas. The first motivation is that the ``filled-in'' value of the missing variable may not only depend in complicated ways on the value of this variable in the past of the sequence but also on the current and past values of other variables. The second motivation is that, for the purpose of making predictions or taking decisions, it is not always necessary to fill in the best possible value of the missing variables. In fact, it is sufficient to fill in a value which helps the system make better predictions or decisions. The advantages of this approach are demonstrated through experiments on several tasks.},
1154 topics={Finance,Missing},cat={J},
1155 }
1156
1157 @INPROCEEDINGS{Bengio_icassp90,
1158 author = {Bengio, Yoshua and Cardin, Regis and De Mori, Renato and Normandin, Yves},
1159 title = {A Hybrid Coder for Hidden {M}arkov Models Using a Recurrent Neural Network},
1160 booktitle = {International Conference on Acoustics, Speech and Signal Processing},
1161 year = {1990},
1162 pages = {537--540},
1163 topics={Markov,Speech},cat={C},
1164 }
1165
1166 @INPROCEEDINGS{Bengio_LeCun94,
1167 author = {Bengio, Yoshua and {LeCun}, Yann and Henderson, Donnie},
1168 title = {Globally Trained Handwritten Word Recognizer using Spatial Representation, Space Displacement Neural Networks and Hidden {M}arkov Models},
1169 year = {1994},
1170 pages = {937--944},
1171 crossref = {NIPS6-shorter},
1172 abstract = {We introduce a new approach for on-line recognition of handwritten words written in unconstrained mixed style. The preprocessor performs a word-level normalization by fitting a model of the word structure using the {EM} algorithm. Words are then coded into low resolution ``annotated images'' where each pixel contains information about trajectory direction and curvature. The recognizer is a convolution network which can be spatially replicated. From the network output, a hidden {Markov} model produces word scores. The entire system is globally trained to minimize word-level errors.},
1173 topics={Speech},cat={C},
1174 }
1175
1176 @ARTICLE{Bengio_LeCun95,
1177 author = {Bengio, Yoshua and {LeCun}, Yann and Nohl, Craig and Burges, Chris},
1178 title = {{LeRec}: A {NN}/{HMM} Hybrid for On-Line Handwriting Recognition},
1179 journal = {Neural Computation},
1180 volume = {7},
1181 number = {6},
1182 year = {1995},
1183 pages = {1289--1303},
1184 abstract = {We introduce a new approach for on-line recognition of handwritten words written in unconstrained mixed style. The preprocessor performs a word-level normalization by fitting a model of the word structure using the {EM} algorithm. Words are then coded into low resolution ``annotated images'' where each pixel contains information about trajectory direction and curvature. The recognizer is a convolution network which can be spatially replicated. From the network output, a hidden {Markov} model produces word scores. The entire system is globally trained to minimize word-level errors.},
1185 topics={PriorKnowledge,Speech},cat={J},
1186 }
1187
1188 @ARTICLE{Bengio_prel92,
1189 author = {Bengio, Yoshua and Gori, Marco and De Mori, Renato},
1190 title = {Learning the Dynamic Nature of Speech with Back-propagation for Sequences},
1191 journal = {Pattern Recognition Letters},
1192 volume = {13},
1193 number = {5},
1194 year = {1992},
1195 pages = {375--385},
1196 note = {(Special issue on Artificial Neural Networks)},
1197 topics={Speech},cat={J},
1198 }
1199
1200 @ARTICLE{Bengio_trnn92,
1201 author = {Bengio, Yoshua and De Mori, Renato and Flammia, Giovanni and Kompe, Ralf},
1202 title = {Global Optimization of a Neural Network-Hidden {M}arkov Model Hybrid},
1203 journal = {IEEE Transactions on Neural Networks},
1204 volume = {3},
1205 number = {2},
1206 year = {1992},
1207 pages = {252--259},
1208 topics={Markov},cat={J},
1209 }
1210
1211 @TECHREPORT{Bergstra+2009,
1212 author = {Bergstra, James and Desjardins, Guillaume and Lamblin, Pascal and Bengio, Yoshua},
1213 title = {Quadratic Polynomials Learn Better Image Features},
1214 number = {1337},
1215 year = {2009},
1216 institution = {D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
1217 abstract = {The affine-sigmoidal hidden unit (of the form $\sigma(ax+b)$)
1218 is a crude predictor of neuron response in visual area V1.
1219 More descriptive models of V1 have been advanced that are no more computationally expensive,
1220 yet artificial neural network research continues to focus on networks of affine-sigmoidal models.
1221 This paper identifies two qualitative differences between the affine-sigmoidal hidden unit
1222 and a particular recent model of V1 response:
1223 a) the presence of a low-rank quadratic term in the argument to $\sigma$,
1224 and b) the use of a gentler non-linearity than the $\tanh$ or logistic sigmoid.
1225 We evaluate these model ingredients by training single-layer
1226 neural networks to solve three image classification tasks.
1227 We experimented with fully-connected hidden units,
1228 as well as locally-connected units and convolutional units
1229 that more closely mimic the function and connectivity of the visual system.
1230 On all three tasks, both the quadratic interactions and the gentler non-linearity
1231 lead to significantly better generalization.
1232 The advantage of quadratic units was strongest in conjunction with sparse and convolutional hidden units.}
1233 }
1234
1235 @MISC{bergstra+al:2010-scipy,
1236 author = {Bergstra, James},
1237 title = {Optimized Symbolic Expressions and {GPU} Metaprogramming with Theano},
1238 year = {2010},
1239 howpublished = {{SciPy}},
1240 note = {Oral}
1241 }
1242
1243 @MISC{bergstra+al:2010-sharcnet,
1244 author = {Bergstra, James and Bengio, Yoshua},
1245 title = {{GPU} Programming with Theano},
1246 year = {2010},
1247 howpublished = {{SHARCNET} Research Day},
1248 note = {Oral}
1249 }
1250
1251 @MISC{bergstra+al:2010snowbird,
1252 author = {Bergstra, James and Breuleux, Olivier and Bastien, Fr{\'{e}}d{\'{e}}ric and Lamblin, Pascal and Turian, Joseph and Desjardins, Guillaume and Pascanu, Razvan and Erhan, Dumitru and Delalleau, Olivier and Bengio, Yoshua},
1253 title = {Deep Learning on {GPU}s with Theano},
1254 howpublished = {The Learning Workshop},
1255 year = {2010},
1256 note = {Oral}
1257 }
1258
1259 @INPROCEEDINGS{Bergstra+Bengio-2009,
1260 author = {Bergstra, James and Bengio, Yoshua},
1261 title = {Slow, Decorrelated Features for Pretraining Complex Cell-like Networks},
1262 year = {2009},
1263 crossref = {NIPS22}
1264 }
1265
1266 @ARTICLE{bergstra+casagrande+erhan+eck+kegl:2006,
1267 author = {Bergstra, James and Casagrande, Norman and Erhan, Dumitru and Eck, Douglas and K{\'{e}}gl, Bal{\'{a}}zs},
1268 title = {Aggregate Features and {AdaBoost} for Music Classification},
1269 journal = {Machine Learning},
1270 volume = {65},
1271 year = {2006},
1272 pages = {473--484},
1273 issn = {0885-6125},
1274 abstract = {We present an algorithm that predicts musical genre and artist from an audio waveform. Our method uses the ensemble learner ADABOOST to select from a set of audio features that have been extracted from segmented audio and then aggregated. Our classifier proved to be the most effective method for genre classification at the recent MIREX 2005 international contests in music information extraction, and the second-best method for recognizing artists. This paper describes our method in detail, from feature extraction to song classification, and presents an evaluation of our method on three genre databases and two artist-recognition databases. Furthermore, we present evidence collected from a variety of popular features and classifiers that the technique of classifying features aggregated over segments of audio is better than classifying either entire songs or individual short-timescale features.},
1275 PDF = {papers/2006_ml_draft.pdf},
1276 SOURCE = {OwnPublication},
1277 }
1278
1279 @INPROCEEDINGS{bergstra+lacoste+eck:2006,
1280 author = {Bergstra, James and Lacoste, Alexandre and Eck, Douglas},
1281 title = {Predicting Genre Labels for Artists using {FreeDB}},
1282 booktitle = {Proc. 7th International Conference on Music Information Retrieval (ISMIR)},
1283 year = {2006},
1284 SOURCE = {OwnPublication},
1285 PDF = {papers/2006_ismir_freedb.pdf},
1286 }
1287
1288 @INPROCEEDINGS{bergstra+mandel+eck:2010,
1289 author = {Bergstra, James and Mandel, Michael and Eck, Douglas},
1290 title = {Scalable Genre and Tag Prediction with Spectral Covariance},
1291 booktitle = {{ISMIR}},
1292 year = {2010},
1293 note = {accepted}
1294 }
1295
1296 @MASTERSTHESIS{Bergstra-Msc-2006,
1297 author = {Bergstra, James},
1298 keywords = {apprentissage statistique, classification de musique par genre, extraction de caract{\'{e}}ristiques sonores, recherche d'information musicale},
1299 title = {Algorithms for Classifying Recorded Music by Genre},
1300 year = {2006},
1301 school = {Universit{\'{e}} de Montr{\'{e}}al},
1302 abstract = {Ce m{\'{e}}moire traite le probl{\`{e}}me de la classification automatique de signaux musicaux par genre. Dans un premier temps, je pr{\'{e}}sente une technique utilisant l'apprentissage machine pour classifier des statistiques extraites sur des segments du signal sonore. Malgr{\'{e}} le fait que cette technique a d{\'{e}}j{\`{a}} {\'{e}}t{\'{e}} explor{\'{e}}e, mon m{\'{e}}moire est le premier {\`{a}} investiguer l'influence de la longueur et de la quantit{\'{e}} de ces segments sur le taux de classification. J'explore {\'{e}}galement l'importance d'avoir des segments contigus dans le temps. Les segments d'une {\`{a}} trois secondes apportent une meilleure performance, mais pour ce faire, ils doivent {\^{e}}tre suffisamment nombreux. Il peut m{\^{e}}me {\^{e}}tre utile d'augmenter la quantit{\'{e}} de segments jusqu'{\`{a}} ce qu'ils se chevauchent. Dans les m{\^{e}}mes exp{\'{e}}riences, je pr{\'{e}}sente une formulation alternative des descripteurs d'audio nomm{\'{e}}e Melfrequency Cepstral Coefficient (MFCC) qui am{\`{e}}ne un taux de classification de 81 \% sur un jeux de donn{\'{e}}es pour lequel la meilleure performance publi{\'{e}}e est de 71 \%. Cette m{\'{e}}thode de segmentation des chansons, ainsi que cette formulation alternative, ont pour but d'am{\'{e}}liorer l'algorithme gagnant du concours de classification de genre de MIREX 2005, d{\'{e}}velopp{\'{e}} par Norman Casagrande et moi. Ces exp{\'{e}}riences sont un approfondissement du travail entam{\'{e}} par Bergstra et al. [2006a], qui d{\'{e}}crit l'algorithme gagnant de ce concours.
1303 Dans un deuxi{\`{e}}me temps, je pr{\'{e}}sent une m{\'{e}}thode qui utilise FreeDB, une base de donn{\'{e}}es d'information sur les albums, pour attribuer {\`{a}} un artiste une distribution de probabilit{\'{e}} sur son genre. Avec une petite base de donn{\'{e}}es, faite {\`{a}} la main, je montre qu'il y a une haute corr{\'{e}}lation entre cette distribution et l'{\'{e}}tiquette de genre traditionnel. Bien qu'il reste {\`{a}} d{\'{e}}montrer que cette m{\'{e}}thode est utile pour organiser une collection de musique, ce r{\'{e}}sultat sugg{\`{e}}re qu'on peut maintenant {\'{e}}tiqueter de grandes bases de musique automatiquement {\`{a}} un faible co{\^{u}}t, et par cons{\'{e}}quent de poursuivre plus facilement la recherche en classification {\`{a}} grande {\'{e}}chelle. Ce travail sera publi{\'{e}} comme Bergstra et al. [2006b] {\`{a}} ISMIR 2006.}
1304 }
1305
1306 @INPROCEEDINGS{bergstra:2010cosyne,
1307 author = {Bergstra, James and Bengio, Yoshua and Lamblin, Pascal and Desjardins, Guillaume and Louradour, Jerome},
1308 title = {Image classification with complex cell neural networks},
1309 booktitle = {Computational and systems neuroscience (COSYNE)},
1310 year = {2010},
1311 note = {Poster},
1312 url = {http://www.frontiersin.org/conferences/individual_abstract_listing.php?conferid=770&pap=3626&ind_abs=1&pg=335},
1313 doi = {10.3389/conf.fnins.2010.03.00334}
1314 }
1315
1316 @INPROCEEDINGS{biaslearn:2000:ijcnn,
1317 author = {Ghosn, Joumana and Bengio, Yoshua},
1318 title = {Bias Learning, Knowledge Sharing},
1319 booktitle = {International Joint Conference on Neural Networks 2000},
1320 volume = {I},
1321 year = {2000},
1322 pages = {9--14},
1323 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/ijcnn_manifold.pdf},
1324 abstract = {Biasing the hypothesis space of a learner has been shown to improve generalisation performances. Methods for achieving this goal have been proposed, that range from deriving and introducing a bias into a learner to automatically learning the bias. In the latter case, most methods learn the bias by simultaneously training several related tasks derived from the same domain and imposing constraints on their parameters. We extend some of the ideas presented in this field and describe a new model that parameterizes the parameters of each task as a function of an affine manifold defined in parameter space and a point lying on the manifold. An analysis of variance on a class of learning tasks is performed that shows some significantly improved performances when using the model.},
1325 topics={MultiTask},cat={C},
1326 }
1327
1328 @ARTICLE{biaslearn:2003:tnn,
1329 author = {Ghosn, Joumana and Bengio, Yoshua},
1330 title = {Bias Learning, Knowledge Sharing},
1331 journal = {IEEE Transactions on Neural Networks},
1332 volume = {14},
1333 number = {4},
1334 year = {2003},
1335 pages = {748--765},
1336 abstract = {Biasing properly the hypothesis space of a learner has been shown to improve generalization performance. Methods for achieving this goal have been proposed, that range from designing and introducing a bias into a learner to automatically learning the bias. Multitask learning methods fall into the latter category. When several related tasks derived from the same domain are available, these methods use the domain-related knowledge coded in the training examples of all the tasks as a source of bias. We extend some of the ideas presented in this field and describe a new approach that identifies a family of hypotheses, represented by a manifold in hypothesis space, that embodies domain-related knowledge. This family is learned using training examples sampled from a group of related tasks. Learning models trained on these tasks are only allowed to select hypotheses that belong to the family. We show that the new approach encompasses a large variety of families which can be learned. A statistical analysis on a class of related tasks is performed that shows significantly improved performances when using this approach.},
1337 topics={MultiTask},cat={J},
1338 }
1339
1340 @MASTERSTHESIS{Boisvert-Mcs-2005,
1341 author = {Boisvert, Maryse},
1342 keywords = {Algorithme {EM} , D{\'{e}}composition en valeurs singuli{\`{e}}res , D{\'{e}}sambigu{\"{\i}}sation s{\'{e}}mantique , Mod{\`{e}}les graphiques, WordNet },
1343 title = {R{\'{e}}duction de dimension pour mod{\`{e}}les graphiques probabilistes appliqu{\'{e}}s {\`{a}} la d{\'{e}}sambiguisation s{\'{e}}mantique},
1344 year = {2005},
1345 school = {Universit{\'{e}} de Montr{\'{e}}al}
1346 }
1347
1348 @INPROCEEDINGS{bonneville98,
1349 author = {Bonneville, Martin and Meunier, Jean and Bengio, Yoshua and Soucy, Jean-Paul},
1350 title = {Support Vector Machines for Improving the Classification of Brain {PET} Images},
1351 booktitle = {SPIE Medical Imaging},
1352 year = {1998},
1353 topics={Kernel},cat={C},
1354 }
1355
1356 @INPROCEEDINGS{Bottou+Bengio95,
1357 author = {Bottou, {L{\'{e}}on} and Bengio, Yoshua},
1358 title = {Convergence Properties of the {K}-Means Algorithm},
1359 year = {1995},
1360 pages = {585--592},
1361 crossref = {NIPS7-shorter},
1362 abstract = {This paper studies the convergence properties of the well known K-Means clustering algorithm. The K-Means algorithm can be described either as a gradient descent algorithm or by slightly extending the mathematics of the {EM} algorithm to this hard threshold case. We show that the K-Means algorithm actually minimizes the quantization error using the very fast Newton algorithm.},
1363 topics={Unsupervised},cat={C},
1364 }
1365
1366 @ARTICLE{bottou-98,
1367 author = {Bottou, {L{\'{e}}on} and Haffner, Patrick and Howard, Paul G. and Simard, Patrice and Bengio, Yoshua and {LeCun}, Yann},
1368 title = {High Quality Document Image Compression with {DjVu}},
1369 journal = {Journal of Electronic Imaging},
1370 volume = {7},
1371 number = {3},
1372 year = {1998},
1373 pages = {410--425},
1374 topics={Compression},cat={J},
1375 }
1376
1377 @INPROCEEDINGS{Bottou-dcc98,
1378 author = {Bottou, {L{\'{e}}on} and Howard, Paul G. and Bengio, Yoshua},
1379 editor = {{IEEE Computer Society}},
1380 title = {The Z-Coder Adaptive Binary Coder},
1381 booktitle = {Data Compression Conference},
1382 year = {1998},
1383 url = {http://leon.bottou.org/papers/bottou-howard-bengio-98},
1384 topics={Compression},cat={C},
1385 }
1386
1387 @INPROCEEDINGS{bottou-lecun-bengio-97,
1388 author = {Bottou, {L{\'{e}}on} and {LeCun}, Yann and Bengio, Yoshua},
1389 title = {Global Training of Document Processing Systems using Graph Transformer Networks},
1390 booktitle = {Proc. of Computer Vision and Pattern Recognition},
1391 year = {1997},
1392 pages = {490--494},
1393 publisher = {IEEE},
1394 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bottou-lecun-bengio-97.ps.gz},
1395 topics={PriorKnowledge,Speech},cat={C},
1396 }
1397
1398 @TECHREPORT{bottou96TR,
1399 author = {Bottou, {L{\'{e}}on} and Bengio, Yoshua and {LeCun}, Yann},
1400 title = {Document analysis with transducers},
1401 number = {Technical Memorandum HA615600-960701-01TM},
1402 year = {1996},
1403 institution = {AT\&T Labs},
1404 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/transducer-tm.ps.gz},
1405 topics={HighDimensional},cat={T},
1406 }
1407
1408 @TECHREPORT{bottou97TR,
1409 author = {Bottou, {L{\'{e}}on} and Bengio, Yoshua and Howard, Paul G.},
1410 title = {Z-Coder: A Fast Adaptive Binary Arithmetic Coder},
1411 number = {Technical Memorandum HA615600-970721-02TM},
1412 year = {1997},
1413 institution = {AT\&T Labs},
1414 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/zcoder-tm.ps.gz},
1415 topics={Compression},cat={T},
1416 }
1417
1418 @MASTERSTHESIS{Bouchard-Msc-2007,
1419 author = {Bouchard, Lysiane},
1420 keywords = {auditory cortex, fMRI, linear classifier, logistic regression, na{\"{\i}}ve bayesian gaussian model, neuroimaging, spectro-temporal modulation, support vectors machine},
1421 title = {Analyse par apprentissage automatique des r{\'{e}}ponses {fMRI} du cortex auditif {\`{a}} des modulations spectro-temporelles},
1422 year = {2009},
1423 school = {Universit{\'{e}} de Montr{\'{e}}al},
1424 abstract = {The application of linear machine learning classifiers to the analysis of brain imaging data (fMRI) has led to several interesting breakthroughs in recent years. These classifiers combine the responses of the voxels to detect and categorize different brain states. They allow a more agnostic analysis than conventional fMRI analysis that systematically treats weak and distributed patterns as unwanted noise. In this project, we use such classifiers to validate an hypothesis concerning the encoding of sounds in the human brain. More precisely, we attempt to locate neurons tuned to spectral and temporal modulations in sound. We use fMRI recordings of brain responses of subjects listening to 49 different spectro-temporal modulations. The analysis of fMRI data through linear classifiers is not yet a standard procedure in this field. Thus, an important objective of this project, in the long term, is the development of new machine learning algorithms specialized for neuroimaging data. For these reasons, an important part of the experiments is dedicated to studying the behaviour of the classifiers. We are mainly interested in 3 standard linear classifiers, namely the support vectors machine algorithm (linear), the logistic regression algorithm (regularized) and the na{\"{\i}}ve bayesian gaussian model (shared variances).}
1425 }
1426
1427 @PHDTHESIS{Boufaden-Phd-2005,
1428 author = {Boufaden, Narj{\`{e}}s},
1429 title = {Extraction d'information {\`{a}} partir de transcriptions de conversations t{\'{e}}l{\'{e}}phoniques sp{\'{e}}cialis{\'{e}}es},
1430 year = {2005},
1431 school = {Universit{\'{e}} de Montr{\'{e}}al, D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle}
1432 }
1433
1434 @INPROCEEDINGS{Carreau+Bengio-2007,
1435 author = {Carreau, Julie and Bengio, Yoshua},
1436 title = {A Hybrid {Pareto} Model for Conditional Density Estimation of Asymmetric Fat-Tail Data},
1437 booktitle = {Proceedings of the Eleventh International Conference on Artificial Intelligence and Statistics (AISTATS'07)},
1438 year = {2007},
1439 publisher = {Omnipress},
1440 abstract = {We propose an estimator for the conditional density p(Y|X) that can adapt for asymmetric heavy tails which might depend on X. Such estimators have important applications in finance and insurance. We draw from Extreme Value Theory the tools to build a hybrid unimodal density having a parameter controlling the heaviness of the upper tail. This hybrid is a Gaussian whose upper tail has been replaced by a generalized {Pareto} tail. We use this hybrid in a multi-modal mixture in order to obtain a nonparametric density estimator that can easily adapt for heavy tailed data. To obtain a conditional density estimator, the parameters of the mixture estimator can be seen as
1441 functions of X and these functions learned. We show experimentally that this approach better models the conditional density in terms of likelihood than compared competing algorithms : conditional mixture models with other types of components and multivariate nonparametric models.},
1442 date={21-24}
1443 }
1444
1445 @ARTICLE{Carreau+Bengio-2009,
1446 author = {Carreau, Julie and Bengio, Yoshua},
1447 title = {A Hybrid {Pareto} Mixture for Conditional Asymmetric Fat-Tailed Distribution},
1448 journal = {IEEE Transactions on Neural Networks},
1449 volume = {20},
1450 number = {7},
1451 year = {2009},
1452 pages = {1087--1101},
1453 issn = {1045-9227},
1454 abstract = {In many cases, we observe some variables X that contain predictive information over a scalar variable of interest Y, with (X,Y) pairs observed in a training set. We can take advantage of this information to estimate the conditional density P(Y|X = x). In this paper, we propose a conditional mixture model with hybrid {Pareto} components to estimate P(Y|X = x). The hybrid {Pareto} is a Gaussian whose upper tail has been replaced by a generalized {Pareto} tail. A third parameter, in addition to the location and spread parameters of the Gaussian, controls the heaviness of the upper tail. Using the hybrid {Pareto} in a mixture model results in a nonparametric estimator that can adapt to multimodality, asymmetry, and heavy tails. A conditional density estimator is built by modeling the parameters of the mixture estimator as functions of X. We use a neural network to implement these functions. Such conditional density estimators have important applications in many domains such as finance and insurance. We show experimentally that this novel approach better models the conditional density in terms of likelihood, compared to competing algorithms: conditional mixture models with other types of components and a classical kernel-based nonparametric model.}
1455 }
1456
1457 @ARTICLE{Carreau+Bengio-extreme-2009,
1458 author = {Carreau, Julie and Bengio, Yoshua},
1459 title = {A Hybrid {Pareto} Model for Asymmetric Fat-Tailed Data: the univariate case},
1460 journal = {Extremes},
1461 volume = {12},
1462 number = {1},
1463 year = {2009},
1464 pages = {53--76},
1465 abstract = {Density estimators that can adapt to asymmetric heavy tails are required in many applications such as finance and insurance. Extreme Value Theory (EVT) has developed principled methods based on asymptotic results to estimate the tails of most distributions. However, the finite sample approximation might introduce a severe bias in many cases. Moreover, the full range of the distribution is often needed, not only the tail area. On the other hand, non-parametric methods, while being powerful where data are abundant, fail to extrapolate properly in the tail area. We put forward a non-parametric density estimator that brings together the strengths of non-parametric density estimation and of EVT. A hybrid {Pareto} distribution that can be used in a mixture model is proposed to extend the generalized {Pareto} (GP) to the whole real axis. Experiments on simulated data show the following. On one hand, the mixture of hybrid {Pareto}s converges faster in terms of log-likelihood and provides good estimates of the tail of the distributions when compared with other density estimators including the GP distribution. On the other hand, the mixture of hybrid {Pareto}s offers an alternate way to estimate the tail index which is comparable to the one estimated with the standard GP methodology. The mixture of hybrids is also evaluated on the Danish fire insurance data set.}
1466 }
1467
1468 @PHDTHESIS{Carreau-PhD-2007,
1469 author = {Carreau, Julie},
1470 keywords = {density estimation, extreme values, generalized {Pareto} distribution, heavy-tailed distribution, mixture of distributions, neural networks},
1471 title = {Mod{\`{e}}les {Pareto} hybrides pour distributions asym{\'{e}}triques et {\`{a}} queues lourdes},
1472 year = {2007},
1473 school = {Universit{\'{e}} de Montr{\'{e}}al},
1474 abstract = {We put forward a class of density estimators that can adapt to asymmetric, multi-modal and heavy-tailed distributions. Such distributions occur in many application domains such as finance and insurance. Mixture of gaussians are flexible non-parametric density estimators that have good approximation properties when the number of components is well chosen with respect to the training set size. However, those models are performing poorly on heavy-tailed data because few observations occur in the tail area. To solve this problem, we resort to extreme value theory where methods based on sound parametric assumptions have been developed to enable extrapolation beyond the range of the observations. More precisely, we build on the PoT method that was developed in hydrology where PoT stands for "Peaks-over-Threshold". The observations exceeding a given threshold are modeled by the generalized {Pareto} distribution. This distribution can approximate arbitrarily well the tail of most distributions. We build a new distribution, the hybrid {Pareto}, by stitching together a truncated Normal and a generalized {Pareto} distribution. We impose continuity constraints at the junction point. The hybrid {Pareto} is thus a smooth distribution that can be used in a mixture model. The behavior of the upper tail of the hybrid is similar to the behavior of the generalized {Pareto} tail. Moreover, the threshold inherent in the PoT methodology can now be defined implicitly as the junction point of the component with the heaviest tail. This component also determines the tail index of the mixture. Hence, the hybrid {Pareto} mixture offers an alternate way to estimate the tail index associated with heavy-tailed data. In several applications, information that has predictive power on the variable of interest is available. In that case, we want to model the conditional density of Y given X, the vector containing predictive information. 
When the distribution of Y given X is asymmetric, multi-modal and heavy-tailed, we propose to use a mixture of hybrid {Pareto}s whose parameters are functions of X. Those functions are implemented by means of a neural network with one hidden layer. Neural networks are non-parametric models that can, in principle, approximate any continuous function. Experiments on artificial and real data sets show that the hybrid {Pareto} mixture, unconditional and conditional, outperforms other density estimators in terms of log-likelihood.}
1475 }
1476
1477 @INPROCEEDINGS{casagrande+eck+kegl:icmc2005,
1478 author = {Casagrande, Norman and Eck, Douglas and K{\'{e}}gl, Bal{\'{a}}zs},
1479 title = {Geometry in Sound: A Speech/Music Audio Classifier Inspired by an Image Classifier},
1480 booktitle = {{Proceedings of the International Computer Music Conference (ICMC)}},
1481 year = {2005},
1482 pages = {207--210},
1483 url = {http://www.iro.umontreal.ca/~eckdoug/papers/2005_icmc_casagrande.pdf},
1484 source={OwnPublication},
1485 sourcetype={Conference},
1486 }
1487
1488 @INPROCEEDINGS{casagrande+eck+kegl:ismir2005,
1489 author = {Casagrande, Norman and Eck, Douglas and K{\'{e}}gl, Bal{\'{a}}zs},
1490 title = {Frame-Level Audio Feature Extraction using {A}da{B}oost},
1491 booktitle = {{Proceedings of the 6th International Conference on Music Information Retrieval ({ISMIR} 2005)}},
1492 year = {2005},
1493 pages = {345--350},
1494 url = {http://www.iro.umontreal.ca/~eckdoug/papers/2005_ismir_casagrande.pdf},
1495 source={OwnPublication},
1496 sourcetype={Conference},
1497 }
1498
1499 @PROCEEDINGS{ccai2006,
1500 editor = {Lamontagne, Luc and Marchand, Mario},
1501 title = {Advances in Artificial Intelligence, 19th Conference of the Canadian Society for Computational Studies of Intelligence, Canadian AI 2006, Qu{\'{e}}bec City, Qu{\'{e}}bec, Canada, June 7-9, 2006, Proceedings},
1502 booktitle = {Canadian Conference on AI},
1503 series = {Lecture Notes in Computer Science},
1504 volume = {4013},
1505 year = {2006},
1506 publisher = {Springer}
1507 }
1508
1509 @INPROCEEDINGS{Chapados+Bengio-2006,
1510 author = {Chapados, Nicolas and Bengio, Yoshua},
1511 title = {The K Best-Paths Approach to Approximate Dynamic Programming with Application to Portfolio Optimization},
1512 booktitle = {AI06},
1513 year = {2006},
1514 pages = {491--502}
1515 }
1516
1517 @INPROCEEDINGS{Chapados+Bengio-2007,
1518 author = {Chapados, Nicolas and Bengio, Yoshua},
1519 title = {Forecasting Commodity Contract Spreads with Gaussian Process},
1520 booktitle = {13th International Conference on Computing in Economics and Finance},
1521 year = {2007},
1522 abstract = {We introduce a functional representation of time series which allows forecasts to be performed over an unspecified horizon with progressively-revealed information sets. By virtue of using Gaussian processes, a complete covariance matrix between forecasts at several time-steps is available. This information is put to use in an application to actively trade price spreads between commodity futures contracts. The approach delivers impressive out-of-sample risk-adjusted returns after transaction costs on a portfolio of 30 spreads.}
1523 }
1524
1525 @ARTICLE{Chapados+Bengio-2008-JOC,
1526 author = {Chapados, Nicolas and Bengio, Yoshua},
1527 title = {Noisy K Best-Paths for Approximate Dynamic Programming with Application to Portfolio Optimization},
1528 journal = {Journal of Computers},
1529 volume = {2},
1530 number = {1},
1531 year = {2007},
1532 pages = {12--19},
1533 abstract = {We describe a general method to transform a non-Markovian sequential decision problem into a supervised learning problem using a K-bestpaths algorithm. We consider an application in financial portfolio management where we can train a controller to directly optimize a Sharpe Ratio (or other risk-averse non-additive) utility function. We illustrate the approach by demonstrating experimental results using a kernel-based controller architecture that would not normally be considered in traditional
1534 reinforcement learning or approximate dynamic programming. We further show that using a non-additive criterion (incremental Sharpe Ratio) yields a noisy K-best-paths extraction problem, that can give substantially improved performance.}
1535 }
1536
1537 @MASTERSTHESIS{Chapados-Msc-2000,
1538 author = {Chapados, Nicolas},
1539 title = {Crit{\`{e}}res d'optimisation d'algorithmes d'apprentissage en gestion de portefeuille},
1540 year = {2000},
1541 school = {Universit{\'{e}} de Montr{\'{e}}al}
1542 }
1543
1544 @INPROCEEDINGS{chapados2000,
1545 author = {Chapados, Nicolas and Bengio, Yoshua},
1546 title = {Cost Functions and Model Combination for {VaR}-Based Asset Allocation Using Neural Networks},
1547 booktitle = {Computational Finance 2000},
1548 year = {2000},
1549 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/compfin2000_final.pdf},
1550 abstract = {We introduce an asset-allocation framework based on the active control of the value-at-risk of the portfolio. Within this framework, we compare two paradigms for making the allocation using neural networks. The first one uses the network to make a forecast of asset behavior, in conjunction with a traditional mean-variance allocator for constructing the portfolio. The second paradigm uses the network to directly make the portfolio allocation decisions. We consider a method for performing soft input variable selection, and show its considerable utility. We use model combination (committee) methods to systematize the choice of hyperparameters during training. We show that committees using both paradigms are significantly outperforming the benchmark market performance.},
1551 topics={Finance},cat={C},
1552 }
1553
1554 @ARTICLE{chapados:2001,
1555 author = {Chapados, Nicolas and Bengio, Yoshua},
1556 title = {Cost Functions and Model Combination for {VaR}-based Asset Allocation using Neural Networks},
1557 journal = {IEEE Transactions on Neural Networks},
1558 volume = {12},
1559 number = {4},
1560 year = {2001},
1561 pages = {890--906},
1562 abstract = {We introduce an asset-allocation framework based on the active control of the value-at-risk of the portfolio. Within this framework, we
1563 compare two paradigms for making the allocation using neural networks. The first one uses the network to make a forecast of asset behavior, in conjunction with a traditional mean-variance allocator for constructing the portfolio. The second paradigm uses the network to directly make the portfolio allocation decisions. We consider a method for performing soft input variable selection, and show its considerable utility. We use model combination (committee) methods to systematize the choice of hyperparameters during training. We show that committees
1564 using both paradigms are significantly outperforming the benchmark market performance.},
1565 topics={Finance},cat={J},
1566 }
1567
1568 @ARTICLE{chapados:2003,
1569 author = {Bengio, Yoshua and Chapados, Nicolas},
1570 title = {Extensions to Metric-Based Model Selection},
1571 year = {2003},
1572 journal = {Journal of Machine Learning Research},
1573 abstract = {Metric-based methods have recently been introduced for model selection and regularization, often yielding very significant improvements over the alternatives tried (including cross-validation). All these methods require unlabeled data over which to compare functions and detect gross differences in behavior away from the training points. We introduce three new extensions of the metric model selection methods and apply them to feature selection. The first extension takes advantage of the particular case of time-series data in which the task involves prediction with a horizon h. The idea is to use at t the h unlabeled examples that precede t for model selection. The second extension takes advantage of the different error distributions of cross-validation and the metric methods: cross-validation tends to have a larger variance and is unbiased. A hybrid combining the two model selection methods is rarely beaten by any of the two methods. The third extension deals with the case when unlabeled data is not available at all, using an estimated input density. Experiments are described to study these extensions in the context of capacity control and feature subset selection.},
1574 topics={ModelSelection,Finance},cat={J},
1575 }
1576
1577 @ARTICLE{chapelle:2001,
1578 author = {Chapelle, Olivier and Vapnik, Vladimir and Bengio, Yoshua},
1579 title = {Model Selection for Small Sample Regression},
1580 journal = {Machine Learning},
1581 year = {2001},
1582 abstract = {Model selection is an important ingredient of many machine learning algorithms, in particular when the sample size is small, in order to strike the right trade-off between overfitting and underfitting. Previous classical results for linear regression are based on an asymptotic analysis. We present a new penalization method for performing model selection for regression that is appropriate even for small samples. Our penalization is based on an accurate estimator of the ratio of the expected training error and the expected generalization error, in terms of the expected eigenvalues of the input covariance matrix.},
1583 topics={ModelSelection},cat={J},
1584 }
1585
1586 @INCOLLECTION{chapter-eval-longterm-2001,
1587 author = {Schmidhuber, Juergen and Hochreiter, Sepp and Bengio, Yoshua},
1588 editor = {Kolen, J. and Kremer, S.},
1589 title = {Evaluating Benchmark Problems by Random Guessing},
1590 booktitle = {Field Guide to Dynamical Recurrent Networks},
1591 year = {2001},
1592 publisher = {IEEE Press},
1593 topics={LongTerm},cat={B},
1594 }
1595
1596 @INCOLLECTION{chapter-gradient-document-2001,
1597 author = {{LeCun}, Yann and Bottou, {L{\'{e}}on} and Bengio, Yoshua and Haffner, Patrick},
1598 editor = {Haykin, S. and Kosko, B.},
1599 title = {Gradient-Based Learning Applied to Document Recognition},
1600 booktitle = {Intelligent Signal Processing},
1601 year = {2001},
1602 pages = {306--351},
1603 publisher = {IEEE Press},
1604 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/lecun-01a.pdf},
1605 abstract = {Multilayer Neural Networks trained with a backpropagation algorithm constitute the best example of a successful Gradient-Based Learning technique. Given an appropriate network architecture, Gradient-Based Learning algorithms can be used to synthesize a complex decision surface that can classify high-dimensional patterns such as handwritten characters, with minimal preprocessing. This paper reviews various methods applied to handwritten character recognition and compares them on a standard handwritten digit recognition task. Convolutional Neural Networks, that are specifically designed to deal with the variability of 2D shapes, are shown to outperform all other techniques.
1606 Real-life document recognition systems are composed of multiple modules including field extraction, segmentation, recognition, and language modeling. A new learning paradigm, called Graph Transformer Networks (GTN), allows such multi-module systems to be trained globally using Gradient-Based methods so as to minimize an overall performance measure.
1607 Two systems for on-line handwriting recognition are described. Experiments demonstrate the advantage of global training, and the flexibility of Graph Transformer Networks.
1608 A Graph Transformer Network for reading bank checks is also described. It uses Convolutional Neural Network character recognizers combined with a global training technique to provide record accuracy on business and personal checks. It is deployed commercially and reads several million checks per day.},
1609 topics={PriorKnowledge,Speech},cat={B},
1610 }
1611
1612 @INCOLLECTION{chapter-gradient-flow-2001,
1613 author = {Hochreiter, Sepp and Bengio, Yoshua and Frasconi, Paolo},
1614 editor = {Kolen, J. and Kremer, S.},
1615 title = {Gradient Flow in Recurrent Nets: the Difficulty of Learning Long-Term Dependencies},
1616 booktitle = {Field Guide to Dynamical Recurrent Networks},
1617 year = {2001},
1618 publisher = {IEEE Press},
1619 topics={LongTerm},cat={B},
1620 }
1621
1622 @INPROCEEDINGS{chemero+eck:1999,
1623 author = {Chemero, T. and Eck, Douglas},
1624 title = {An Exploration of Representational Complexity via Coupled Oscillators},
1625 booktitle = {{Proceedings of the Tenth Midwest Artificial Intelligence and Cognitive Science Society}},
1626 year = {1999},
1627 publisher = {MIT Press},
1628 url = {http://www.iro.umontreal.ca/~eckdoug/papers/1999_chemero.pdf},
1629 abstract = {We note some inconsistencies in a view of representation which takes {\it decoupling} to be of key importance. We explore these inconsistencies using examples of representational vehicles taken from coupled oscillator theory and suggest a new way to reconcile {\it coupling} with {\it absence}. Finally, we tie these views to a teleological definition of representation.},
1630 source={OwnPublication},
1631 sourcetype={Conference},
1632 }
1633
1634 @ARTICLE{ChemInfModel2006,
1635 author = {Erhan, Dumitru and {L'Heureux}, Pierre-Jean and Yue, Shi Yi and Bengio, Yoshua},
1636 title = {Collaborative Filtering on a Family of Biological Targets},
1637 journal = {J. Chem. Inf. Model.},
1638 volume = {46},
1639 number = {2},
1640 year = {2006},
1641 pages = {626--635},
1642 abstract = {Building a QSAR model of a new biological target for which few screening data are available is a statistical
1643 challenge. However, the new target may be part of a bigger family, for which we have more screening data.
1644 Collaborative filtering or, more generally, multi-task learning, is a machine learning approach that improves
1645 the generalization performance of an algorithm by using information from related tasks as an inductive
1646 bias. We use collaborative filtering techniques for building predictive models that link multiple targets to
1647 multiple examples. The more commonalities between the targets, the better the multi-target model that can
1648 be built. We show an example of a multi-target neural network that can use family information to produce
1649 a predictive model of an undersampled target. We evaluate JRank, a kernel-based method designed for
1650 collaborative filtering. We show their performance on compound prioritization for an HTS campaign and
1651 the underlying shared representation between targets. JRank outperformed the neural network both in the
1652 single- and multi-target models.},
1653 topics={Bioinformatic,MultiTask},cat={J},
1654 }
1655
1656 @TECHREPORT{collobert:2001:rr01-12,
1657 author = {Collobert, Ronan and Bengio, Samy and Bengio, Yoshua},
1658 title = {A Parallel Mixture of {SVM}s for Very Large Scale Problems},
1659 number = {12},
1660 year = {2001},
1661 institution = {IDIAP},
1662 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/IDIAP-RR-01-12.ps},
1663 abstract = {Support Vector Machines ({SVM}s) are currently the state-of-the-art models for many classification problems but they suffer from the complexity of their training algorithm which is at least quadratic with respect to the number of examples. Hence, it is hopeless to try to solve real-life problems having more than a few hundreds of thousands examples with {SVM}s. The present paper proposes a new mixture of {SVM}s that can be easily implemented in parallel and where each {SVM} is trained on a small subset of the whole dataset. Experiments on a large benchmark dataset (Forest) yielded significant time improvement (time complexity appears empirically to locally grow linearly with the number of examples). In addition, and that is a surprise, a significant improvement in generalization was observed.},
1664 topics={Kernel},cat={T},
1665 }
1666
1667 @ARTICLE{collobert:2002,
1668 author = {Collobert, Ronan and Bengio, Samy and Bengio, Yoshua},
1669 title = {A Parallel Mixture of {SVM}s for Very Large Scale Problems},
1670 journal = {Neural Computation},
1671 year = {2002},
1672 abstract = {Support Vector Machines ({SVM}s) are currently the state-of-the-art models for many classification problems but they suffer from the complexity of their training algorithm which is at least quadratic with respect to the number of examples. Hence, it is hopeless to try to solve real-life problems having more than a few hundreds of thousands examples with {SVM}s. The present paper proposes a new mixture of {SVM}s that can be easily implemented in parallel and where each {SVM} is trained on a small subset of the whole dataset. Experiments on a large benchmark dataset (Forest) yielded significant time improvement (time complexity appears empirically to locally grow linearly with the number of examples). In addition, and that is a surprise, a significant improvement in generalization was observed.},
1673 topics={HighDimensional,Kernel},cat={J},
1674 }
1675
1676 @INCOLLECTION{collobert:2002:book,
1677 author = {Collobert, Ronan and Bengio, Yoshua and Bengio, Samy},
1678 editor = {Lee, S. W. and Verri, A.},
1679 title = {Scaling Large Learning Problems with Hard Parallel Mixtures},
1680 booktitle = {Pattern Recognition with Support Vector Machines},
1681 series = {Lecture Notes in Computer Science},
1682 volume = {2388},
1683 year = {2002},
1684 publisher = {Springer-Verlag},
1685 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/2002_mixtures_svm.pdf},
1686 abstract = {A challenge for statistical learning is to deal with large data sets, e.g. in data mining. Popular learning algorithms such as Support Vector Machines have training time at least quadratic in the number of examples: they are hopeless to solve problems with a million examples. We propose a "hard parallelizable mixture" methodology which yields significantly reduced training time through modularization and parallelization: the training data is iteratively partitioned by a "gater" model in such a way that it becomes easy to learn an "expert" model separately in each region of the partition. A probabilistic extension and the use of a set of generative models allows representing a gater so that all pieces of the model are locally trained. For {SVM}s, time complexity appears empirically to locally grow linearly with the number of examples, while generalization performance can be enhanced. For the probabilistic version of the algorithm, the iterative algorithm provably goes down in a cost function that is an upper bound on the negative log-likelihood.},
1687 topics={Kernel},cat={B},
1688 }
1689
1690 @MISC{copyright-CTAI,
1691 author = {Bengio, Yoshua and Ducharme, R{\'{e}}jean and Dorion, Christian},
1692 title = {Commodity Trading Advisor Index},
1693 year = {2004-2009},
1694 howpublished = {copyright, and commercialized software license.}
1695 }
1696
1697 @MISC{copyright-PLearn,
1698 author = {Vincent, Pascal and Bengio, Yoshua},
1699 title = {{PLearn}, a {C++} Machine Learning Library},
1700 year = {1998-2009},
1701 howpublished = {copyright, public domain license.},
1702 url = {http://www.plearn.org}
1703 }
1704
1705 @ARTICLE{Cosi90,
1706 author = {Cosi, Piero and Bengio, Yoshua and De Mori, Renato},
1707 title = {Phonetically-based multi-layered networks for acoustic property extraction and automatic speech recognition},
1708 journal = {Speech Communication},
1709 volume = {9},
1710 number = {1},
1711 year = {1990},
1712 pages = {15--30},
1713 topics={PriorKnowledge,Speech},cat={J},
1714 }
1715
1716 @INCOLLECTION{courville+eck+bengio:nips2009,
1717 author = {Courville, Aaron and Eck, Douglas and Bengio, Yoshua},
1718 editor = {},
1719 title = {An Infinite Factor Model Hierarchy Via a Noisy-Or Mechanism},
1720 booktitle = {Neural Information Processing Systems Conference (NIPS) 22},
1721 year = {2009},
1722 pages = {405--413},
1723 publisher = {},
1724 url = {http://books.nips.cc/papers/files/nips22/NIPS2009_1100.pdf},
1725 source={OwnPublication},
1726 sourcetype={Conference},
1727 pdf={""},
1728 }
1729
1730 @INPROCEEDINGS{davies+plumbley+eck:waspaa2009,
1731 author = {Davies, M. and Plumbley, M. and Eck, Douglas},
1732 title = {Towards a musical beat emphasis function},
1733 booktitle = {Proceedings of IEEE WASPAA},
1734 year = {2009},
1735 organization = {IEEE Workshop on Applications of Signal Processing to Audio and Acoustics},
1736 source={OwnPublication},
1737 sourcetype={Conference},
1738 }
1739
1740 @INPROCEEDINGS{Delalleau+al-2005,
1741 author = {Delalleau, Olivier and Bengio, Yoshua and Le Roux, Nicolas},
1742 editor = {Cowell, Robert G. and Ghahramani, Zoubin},
1743 title = {Efficient Non-Parametric Function Induction in Semi-Supervised Learning},
1744 booktitle = {Proceedings of the Tenth International Workshop on Artificial Intelligence and Statistics (AISTATS'05)},
1745 year = {2005},
1746 pages = {96--103},
1747 publisher = {Society for Artificial Intelligence and Statistics},
1748 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/semisup_aistats2005.pdf},
1749 abstract = {There has been an increase of interest for semi-supervised learning recently, because of the many datasets with large amounts of unlabeled examples and only a few labeled ones. This paper follows up on proposed nonparametric algorithms which provide an estimated continuous label for the given unlabeled examples. First, it extends them to function induction algorithms that minimize a regularization criterion applied to an out-of-sample example, and happen to have the form of Parzen windows regressors. This allows to predict test labels without solving again a linear system of dimension n (the number of unlabeled and labeled training examples), which can cost O(n^3). Second, this function induction procedure gives rise to an efficient approximation of the training process, reducing the linear system to be solved to m << n unknowns, using only a subset of m examples. An improvement of O(n^2/m^2) in time can thus be obtained. Comparative experiments are presented, showing the good performance of the induction formula and approximation algorithm.},
1750 topics={Unsupervised},cat={C},
1751 }
1752
1753 @INCOLLECTION{Delalleau+al-ssl-2006,
1754 author = {Delalleau, Olivier and Bengio, Yoshua and Le Roux, Nicolas},
1755 editor = {Chapelle, Olivier and {Sch{\"{o}}lkopf}, Bernhard and Zien, Alexander},
1756 title = {Large-Scale Algorithms},
1757 booktitle = {Semi-Supervised Learning},
1758 year = {2006},
1759 pages = {333--341},
1760 publisher = {{MIT} Press},
1761 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/delalleau_ssl.pdf},
1762 abstract = {In Chapter 11, it is shown how a number of graph-based semi-supervised learning
1763 algorithms can be seen as the minimization of a specific cost function, leading to a
1764 linear system with n equations and unknowns (with n the total number of labeled
1765 and unlabeled examples). Solving such a linear system will in general require on the
1766 order of O(kn^2) time and O(kn) memory (for a sparse graph where each data point
1767 has k neighbors), which can be prohibitive on large datasets (especially if k = n,
1768 i.e. the graph is dense). We present in this chapter a subset selection method that
1769 can be used to reduce the original system to one of size m << n. The idea is to solve
1770 for the labels of a subset S of X of only m points, while still retaining information
1771 from the rest of the data by approximating their label with a linear combination of
1772 the labels in S (using the induction formula presented in Chapter 11). This leads
1773 to an algorithm whose computational requirements scale as O(m^2 n) and memory
1774 requirements as O(m^2), thus allowing one to take advantage of significantly bigger
1775 unlabeled datasets than with the original algorithms.},
1776 cat={B},topics={Unsupervised},
1777 }
1778
1779 @INCOLLECTION{DeMori90a,
1780 author = {De Mori, Renato and Bengio, Yoshua and Cosi, Piero},
1781 editor = {Mohr, R. and Pavlidis, T. and Sanfeliu, A.},
1782 title = {On the use of an ear model and multi-layer networks for automatic speech recognition},
1783 booktitle = {Structural Pattern Analysis},
1784 year = {1990},
1785 publisher = {World Scientific},
1786 topics={PriorKnowledge,Speech},cat={B},
1787 }
1788
1789 @INPROCEEDINGS{Desjardins+al-2010,
1790 author = {Desjardins, Guillaume and Courville, Aaron and Bengio, Yoshua},
1791 title = {Tempered {Markov} Chain Monte Carlo for training of Restricted {Boltzmann} Machine},
1792 booktitle = {Proceedings of AISTATS 2010},
1793 volume = {9},
1794 year = {2010},
1795 pages = {145--152},
1796 abstract = {Alternating Gibbs sampling is the most common scheme used for sampling from Restricted {Boltzmann} Machines (RBM), a crucial component in deep architectures such as Deep Belief Networks. However, we find that it often does a very poor job of rendering the diversity of modes captured by the trained model. We suspect that this hinders the advantage that could in principle be brought by training algorithms relying on Gibbs sampling for uncovering spurious modes, such as the Persistent Contrastive Divergence algorithm. To alleviate this problem, we explore the use of tempered {Markov} Chain Monte-Carlo for sampling in RBMs. We find both through visualization of samples and measures of likelihood on a toy dataset that it helps both sampling and learning.}
1797 }
1798
1799 @TECHREPORT{Desjardins-2008,
1800 author = {Desjardins, Guillaume and Bengio, Yoshua},
1801 keywords = {Convolutional Architectures, Deep Networks, RBM, Vision},
1802 title = {Empirical Evaluation of Convolutional RBMs for Vision},
1803 number = {1327},
1804 year = {2008},
1805 institution = {D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
1806 abstract = {Convolutional Neural Networks ({CNN}) have had great success in machine learning tasks involving vision and represent one of the early successes of deep networks. Local receptive fields and weight
1807 sharing make their architecture ideally suited for vision tasks by helping to enforce a prior based on our knowledge of natural images. This same prior could also be applied to recent developments in the field of deep networks, in order to tailor these new architectures for artificial vision. In this context, we show how the Restricted {Boltzmann} Machine (RBM), the building block of Deep Belief Networks (DBN), can be adapted to operate in a convolutional manner. We compare their performance to standard fully-connected RBMs on a simple visual learning task and show that the convolutional RBMs (CRBMs) converge to smaller values of the negative likelihood function. Our experiments also indicate that CRBMs are more efficient than standard RBMs trained on small image patches, with the CRBMs having faster convergence.}
1808 }
1809
1810 @TECHREPORT{Desjardins-tech-2009,
1811 author = {Desjardins, Guillaume and Courville, Aaron and Bengio, Yoshua and Vincent, Pascal and Delalleau, Olivier},
1812 keywords = {CD, PCD, RBM, simulated tempering, tempered MCMC, unsupervised learning},
1813 title = {Tempered {Markov} Chain Monte Carlo for training of Restricted {Boltzmann} Machines},
1814 number = {1345},
1815 year = {2009},
1816 institution = {D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
1817 abstract = {Alternating Gibbs sampling is the most common scheme used for sampling from Restricted {Boltzmann} Machines (RBM), a crucial component in deep architectures such as Deep Belief Networks. However, we find that it often does a very poor job of rendering the diversity of modes captured by the trained model. We suspect that this hinders the advantage that could in principle be brought by training algorithms relying on Gibbs sampling for uncovering spurious modes, such as the Persistent Contrastive Divergence algorithm. To alleviate this problem, we
1818 explore the use of tempered {Markov} Chain Monte-Carlo for sampling in RBMs. We find both through visualization of samples and measures of likelihood that it helps both sampling and learning.}
1819 }
1820
1821 @ARTICLE{Dugas+Bengio-2009,
1822 author = {Dugas, Charles and Bengio, Yoshua and Belisle, Francois and Nadeau, Claude and Garcia, Rene},
1823 title = {Incorporating Functional Knowledge in Neural Networks},
1824 journal = {The Journal of Machine Learning Research},
1825 volume = {10},
1826 year = {2009},
1827 pages = {1239--1262},
1828 abstract = {Incorporating prior knowledge of a particular task into the architecture of a learning algorithm can greatly improve generalization performance. We study here a case where we know that the function to be learned is non-decreasing in its two arguments and convex in one of them. For this purpose we propose a class of functions similar to multi-layer neural networks but (1) that has those properties, (2) is a universal approximator of Lipschitz functions with these and other properties. We apply this new class of functions to the task of modelling the price of call options. Experiments show improvements on regressing the price of call options using the new types of function classes that incorporate the a priori constraints.}
1829 }
1830
1831 @PHDTHESIS{Dugas-Phd-2003,
1832 author = {Dugas, Charles},
1833 title = {Les algorithmes d'apprentissage appliqu{\'{e}}s aux risques financiers},
1834 year = {2003},
1835 school = {Universit{\'{e}} de Montr{\'{e}}al}
1836 }
1837
1838 @ARTICLE{dugas:2003,
1839 author = {Dugas, Charles and Bengio, Yoshua and Chapados, Nicolas and Vincent, Pascal and Denoncourt, Germain and Fournier, Christian},
1840 title = {Statistical Learning Algorithms Applied to Automobile Insurance Ratemaking},
1841 journal = {CAS Forum},
1842 volume = {1},
1843 number = {1},
1844 year = {2003},
1845 pages = {179--214},
1846 abstract = {We recently conducted a research project for a large North American automobile insurer. This study was the most exhaustive ever undertaken by this particular insurer and lasted over an entire year. We analyzed the discriminating power of each variable used for ratemaking. We analyzed the performance of several models within five broad categories: linear regressions, generalized linear models, decision trees, neural networks and support vector machines. In this paper, we present the main results of this study. We qualitatively compare models and show how neural networks can represent high-order nonlinear dependencies with a small number of parameters, each of which is estimated on a large proportion of the data, thus yielding low variance. We thoroughly explain the purpose of the nonlinear sigmoidal transforms which are at the very heart of neural networks' performances. The main numerical result is a statistically significant reduction in the out-of-sample mean-squared error using the neural network model and our ability to substantially reduce the median premium by charging more to the highest risks. This in turn can translate into substantial savings and financial benefits for an insurer. We hope this paper goes a long way towards convincing actuaries to include neural networks within their set of modeling tools for ratemaking.},
1847 topics={Finance,Mining},cat={J},
1848 }
1849
1850 @INPROCEEDINGS{eck+bertinmahieux+lamere+green:nips2007,
1851 author = {Eck, Douglas and Lamere, Paul and Bertin-Mahieux, Thierry and Green, Stephen},
1852 editor = {Platt, John and Koller, D. and Singer, Yoram and Roweis, S.},
1853 title = {Automatic Generation of Social Tags for Music Recommendation},
1854 year = {2008},
1855 crossref = {NIPS20-shorter},
1856 source = "OwnPublication"
1857 }
1858
1859 @INPROCEEDINGS{eck+bertinmahieux+lamere:ismir2007,
1860 author = {Eck, Douglas and Bertin-Mahieux, Thierry and Lamere, Paul},
1861 title = {Autotagging music using supervised machine learning},
1862 booktitle = {{Proceedings of the 8th International Conference on Music Information Retrieval ({ISMIR} 2007)}},
1863 year = {2007},
1864 source={OwnPublication},
1865 }
1866
1867 @INPROCEEDINGS{eck+casagrande:ismir2005,
1868 author = {Eck, Douglas and Casagrande, Norman},
1869 title = {Finding Meter in Music Using an Autocorrelation Phase Matrix and Shannon Entropy},
1870 booktitle = {{Proceedings of the 6th International Conference on Music Information Retrieval ({ISMIR} 2005)}},
1871 year = {2005},
1872 pages = {504--509},
1873 url = {http://www.iro.umontreal.ca/~eckdoug/papers/2005_ismir.pdf},
1874 source={OwnPublication},
1875 sourcetype={Conference},
1876 }
1877
1878 @INCOLLECTION{eck+gasser+port:2000,
1879 author = {Eck, Douglas and Gasser, M. and Port, Robert},
1880 editor = {Desain, P. and Windsor, L.},
1881 title = {Dynamics and Embodiment in Beat Induction},
1882 booktitle = {{Rhythm Perception and Production}},
1883 year = {2000},
1884 pages = {157--170},
1885 publisher = {Swets and Zeitlinger},
1886 url = {http://www.iro.umontreal.ca/~eckdoug/papers/2000_rppw.pdf},
1887 abstract = {We provide an argument for using dynamical systems theory in the domain of beat induction. We motivate the study of beat induction and relate beat induction to the more general study of human rhythm cognition. In doing so we compare a dynamical, embodied approach to a symbolic (traditional AI) one, paying particular attention to how the modeling approach brings with it tacit assumptions about what is being modeled. Please note that this is a philosophy paper about research that was, at the time of writing, very much in progress.},
1888 source={OwnPublication},
1889 sourcetype={Chapter},
1890 }
1891
1892 @INPROCEEDINGS{eck+gasser:1996,
1893 author = {Eck, Douglas and Gasser, M.},
1894 editor = {},
1895 title = {Perception of Simple Rhythmic Patterns in a Network of Oscillators},
1896 booktitle = {{The Proceedings of the Eighteenth Annual Conference of the Cognitive Science Society}},
1897 year = {1996},
1898 publisher = {Lawrence Erlbaum Associates},
1899 abstract = {This paper is concerned with the complex capacity to recognize and reproduce rhythmic patterns. While this capacity has not been well investigated, in broad qualitative terms it is clear that people can learn to identify and produce recurring patterns defined in terms of sequences of beats of varying intensity and rests: the rhythms behind waltzes, reels, sambas, etc. Our short term goal is a model which is "hard-wired" with knowledge of a set of such patterns. Presented with a portion of one of the patterns or a label for a pattern, the model should reproduce the pattern and continue to do so when the input is turned off. Our long-term goal is a model which can learn to adjust the connection strengths which implement particular patterns as it is exposed to input patterns.},
1900 source={OwnPublication},
1901 sourcetype={Conference},
1902 }
1903
1904 @TECHREPORT{eck+graves+schmidhuber:tr-speech2003,
1905 author = {Eck, Douglas and Graves, A. and Schmidhuber, Juergen},
1906 title = {A New Approach to Continuous Speech Recognition Using {LSTM} Recurrent Neural Networks},
1907 number = {IDSIA-14-03},
1908 year = {2003},
1909 institution = {IDSIA},
1910 abstract = {This paper presents an algorithm for continuous speech recognition built from two Long Short-Term Memory ({LSTM}) recurrent neural networks. A first {LSTM} network performs frame-level phone probability estimation. A second network maps these phone predictions onto words. In contrast to {HMM}s, this allows greater exploitation of long-timescale correlations. Simulation results are presented for a hand-segmented subset of the "Numbers-95" database. These results include isolated phone prediction, continuous frame-level phone prediction and continuous word prediction. We conclude that despite its early stage of development, our new model is already competitive with existing approaches on certain aspects of speech recognition and promising on others, warranting further research.},
1911 source={OwnPublication},
1912 sourcetype={TechReport},
1913 }
1914
1915 @TECHREPORT{eck+lapalme:2008,
1916 author = {Eck, Douglas and Lapalme, J.},
1917 title = {Learning Musical Structure Directly from Sequences of Music},
1918 number = {1300},
1919 year = {2008},
1920 institution = {Universit{\'{e}} de Montr{\'{e}}al DIRO},
1921 url = {http://www.iro.umontreal.ca/~eckdoug/papers/tr1300.pdf},
1922 source={OwnPublication},
1923 sourcetype={TechReport},
1924 }
1925
1926 @INPROCEEDINGS{eck+schmidhuber:icann2002,
1927 author = {Eck, Douglas and Schmidhuber, Juergen},
1928 editor = {Dorronsoro, J.},
1929 title = {Learning The Long-Term Structure of the Blues},
1930 booktitle = {{Artificial Neural Networks -- ICANN 2002 (Proceedings)}},
1931 volume = {},
1932 year = {2002},
1933 pages = {284--289},
1934 publisher = {Springer},
1935 url = {http://www.iro.umontreal.ca/~eckdoug/papers/2002_icannMusic.pdf},
1936 abstract = {In general music composed by recurrent neural networks ({RNN}s) suffers from a lack of global structure. Though networks can learn note-by-note transition probabilities and even reproduce phrases, they have been unable to learn an entire musical form and use that knowledge to guide composition. In this study, we describe model details and present experimental results showing that {LSTM} successfully learns a form of blues music and is able to compose novel (and some listeners believe pleasing) melodies in that style. Remarkably, once the network has found the relevant structure it does not drift from it: {LSTM} is able to play the blues with good timing and proper structure as long as one is willing to listen.},
1937 source={OwnPublication},
1938 sourcetype={Conference},
1939 }
1940
1941 @INPROCEEDINGS{eck+schmidhuber:ieee2002,
1942 author = {Eck, Douglas and Schmidhuber, Juergen},
1943 editor = {Bourlard, H.},
1944 title = {Finding Temporal Structure in Music: Blues Improvisation with {LSTM} Recurrent Networks},
1945 booktitle = {Neural Networks for Signal Processing XII, Proceedings of the 2002 IEEE Workshop},
1946 year = {2002},
1947 pages = {747--756},
1948 publisher = {IEEE},
1949 url = {http://www.iro.umontreal.ca/~eckdoug/papers/2002_ieee.pdf},
1950 abstract = {Few types of signal streams are as ubiquitous as music. Here we consider the problem of extracting essential ingredients of music signals, such as well-defined global temporal structure in the form of nested periodicities (or {\em meter}). Can we construct an adaptive signal processing device that learns by example how to generate new instances of a given musical style? Because recurrent neural networks can in principle learn the temporal structure of a signal, they are good candidates for such a task. Unfortunately, music composed by standard recurrent neural networks ({RNN}s) often lacks global coherence. The reason for this failure seems to be that {RNN}s cannot keep track of temporally distant events that indicate global music structure. Long Short-Term Memory ({LSTM}) has succeeded in similar domains where other {RNN}s have failed, such as timing \& counting and learning of context sensitive languages. In the current study we show that {LSTM} is also a good mechanism for learning to compose music. We present experimental results showing that {LSTM} successfully learns a form of blues music and is able to compose novel (and we believe pleasing) melodies in that style. Remarkably, once the network has found the relevant structure it does not drift from it: {LSTM} is able to play the blues with good timing and proper structure as long as one is willing to listen.},
1951 source={OwnPublication},
1952 sourcetype={Conference},
1953 }
1954
1955 @ARTICLE{eck+scott:2005,
1956 author = {Eck, Douglas and Scott, S. K.},
1957 title = {Editorial: New Research in Rhythm Perception and Production},
1958 journal = {Music Perception},
1959 volume = {22},
1960 number = {3},
1961 year = {2005},
1962 pages = {371--388},
1963 source={OwnPublication},
1964 sourcetype={Other},
1965 }
1966
1967 @MISC{eck+scott:editor2005,
1968 author = {Eck, Douglas and Scott, S. K.},
1969 title = {Music Perception},
1970 year = {2005},
1971 note = {Guest Editor, Special Issue on Rhythm Perception and Production, 22(3)},
1972 source={OwnPublication},
1973 sourcetype={Other},
1974 }
1975
1976 @INPROCEEDINGS{eck:1999,
1977 author = {Eck, Douglas},
1978 editor = {},
1979 title = {Learning Simple Metrical Preferences in a Network of {F}itzhugh-{N}agumo Oscillators},
1980 booktitle = {{The Proceedings of the Twenty-First Annual Conference of the Cognitive Science Society}},
1981 year = {1999},
1982 publisher = {Lawrence Erlbaum Associates},
1983 abstract = {Hebbian learning is used to train a network of oscillators to prefer periodic signals of pulses over aperiodic signals. Target signals consisted of metronome-like voltage pulses with varying amounts of inter-onset noise injected (with 0\% noise yielding a periodic signal and more noise yielding more and more aperiodic signals). The oscillators---piecewise-linear approximations (Abbott, 1990) to Fitzhugh-Nagumo oscillators---are trained using mean phase coherence as an objective function. Before training a network is shown to readily synchronize with signals having a wide range of noise. After training on a series of noise-free signals, a network is shown to only synchronize with signals having little or no noise. This represents a bias towards periodicity and is explained by strong positive coupling connections between oscillators having harmonically-related periods.},
1984 source={OwnPublication},
1985 sourcetype={Conference},
1986 }
1987
1988 @UNPUBLISHED{eck:bramsworkshop2004,
1989 author = {Eck, Douglas},
1990 title = {Challenges for Machine Learning in the Domain of Music},
1991 year = {2004},
1992 note = {BRAMS Workshop on Brain and Music, Montreal Neurological Institute},
1993 abstract = {Slides and musical examples available on request.},
1994 source={OwnPublication},
1995 sourcetype={Workshop},
1999 }
2000
2001 @PHDTHESIS{eck:diss,
2002 author = {Eck, Douglas},
2003 title = {Meter Through Synchrony: Processing Rhythmical Patterns with Relaxation Oscillators},
2004 year = {2000},
2005 school = {Indiana University}, address = {Bloomington, IN}, url = {http://www.idsia.ch/~doug/publications.html},
2006 abstract = {This dissertation uses a network of relaxation oscillators to beat along with temporal signals. Relaxation oscillators exhibit interspersed slow-fast movement and model a wide array of biological oscillations. The model is built up gradually: first a single relaxation oscillator is exposed to rhythms and shown to be good at finding downbeats in them. Then large networks of oscillators are mutually coupled in an exploration of their internal synchronization behavior. It is demonstrated that appropriate weights on coupling connections cause a network to form multiple pools of oscillators having stable phase relationships. This is a promising first step towards networks that can recreate a rhythmical pattern from memory. In the full model, a coupled network of relaxation oscillators is exposed to rhythmical patterns. It is shown that the network finds downbeats in patterns while continuing to exhibit good internal stability. A novel non-dynamical model of downbeat induction called the Normalized Positive (NP) clock model is proposed, analyzed, and used to generate comparison predictions for the oscillator model. The oscillator model compares favorably to other dynamical approaches to beat induction such as adaptive oscillators. However, the relaxation oscillator model takes advantage of intrinsic synchronization stability to allow the creation of large coupled networks. This research lays the groundwork for a long-term research goal, a robotic arm that responds to rhythmical signals by tapping along. It also opens the door to future work in connectionist learning of long rhythmical patterns.},
2007 source={OwnPublication},
2008 sourcetype={Thesis},
2009 }
2010
2011 @INPROCEEDINGS{eck:icann2001,
2012 author = {Eck, Douglas},
2013 editor = {Dorffner, Georg},
2014 title = {A Network of Relaxation Oscillators that Finds Downbeats in Rhythms},
2015 booktitle = {{Artificial Neural Networks -- ICANN 2001 (Proceedings)}},
2017 year = {2001},
2018 pages = {1239--1247},
2019 publisher = {Springer},
2020 url = {http://www.iro.umontreal.ca/~eckdoug/papers/2001_icann.pdf},
2021 abstract = {A network of relaxation oscillators is used to find downbeats in rhythmical patterns. In this study, a novel model is described in detail. Its behavior is tested by exposing it to patterns having various levels of rhythmic complexity. We analyze the performance of the model and relate its success to previous work dealing with fast synchrony in coupled oscillators.},
2022 source={OwnPublication},
2023 sourcetype={Conference},
2024 }
2025
2026 @INPROCEEDINGS{eck:icassp2007,
2027 author = {Eck, Douglas},
2029 title = {Beat Tracking Using an Autocorrelation Phase Matrix},
2030 booktitle = {{Proceedings of the 2007 International Conference on Acoustics, Speech and Signal Processing (ICASSP)}},
2031 year = {2007},
2032 pages = {1313--1316},
2033 publisher = {IEEE Signal Processing Society},
2034 url = {http://www.iro.umontreal.ca/~eckdoug/papers/2007_icassp.pdf},
2035 source={OwnPublication},
2036 sourcetype={Conference},
2037 }
2038
2039 @INPROCEEDINGS{eck:icmpc2004,
2040 author = {Eck, Douglas},
2041 editor = {Lipscomb, S. D. and Ashley, R. and Gjerdingen, R. O. and Webster, P.},
2042 title = {A Machine-Learning Approach to Musical Sequence Induction That Uses Autocorrelation to Bridge Long Timelags},
2043 booktitle = {{The Proceedings of the Eighth International Conference on Music Perception and Cognition ({ICMPC}8)}},
2044 year = {2004},
2045 pages = {542--543},
2046 publisher = {Causal Productions},
2047 abstract = {One major challenge in using statistical sequence learning methods in the domain of music lies in bridging the long timelags that separate important musical events. Consider, for example, the chord changes that convey the basic structure of a pop song. A sequence learner that cannot predict chord changes will almost certainly not be able to generate new examples in a musical style or to categorize songs by style. Yet, it is surprisingly difficult for a sequence learner to bridge the long timelags necessary to identify when a chord change will occur and what its new value will be. This is the case because chord changes can be separated by dozens or hundreds of intervening notes. One could solve this problem by treating chords as being special (as did Mozer, NIPS 1991). But this is impractical---it requires chords to be labeled specially in the dataset, limiting the applicability of the model to non-labeled examples---and furthermore does not address the general issue of nested temporal structure in music. I will briefly describe this temporal structure (known commonly as ``meter'') and present a model that uses to its advantage an assumption that sequences are metrical. The model consists of an autocorrelation-based filtration that estimates online the most likely metrical tree (i.e. the frequency and phase of beat, measure, phrase, etc.) and uses that to generate a series of sequences varying at different rates. These sequences correspond to each level in the hierarchy. Multiple learners can be used to treat each series separately and their predictions can be combined to perform composition and categorization. I will present preliminary results that demonstrate the usefulness of this approach. Time permitting I will also compare the model to alternate approaches.},
2048 source={OwnPublication},
2049 sourcetype={Conference},
2050 }
2051
2052 @INPROCEEDINGS{eck:icmpc2006,
2053 author = {Eck, Douglas},
2054 editor = {Baroni, M. and Addessi, A. R. and Caterina, R. and Costa, M.},
2055 title = {Beat Induction Using an Autocorrelation Phase Matrix},
2056 booktitle = {The Proceedings of the 9th International Conference on Music Perception and Cognition ({ICMPC9})},
2057 year = {2006},
2058 pages = {931--932},
2059 publisher = {Causal Productions},
2060 source={OwnPublication},
2061 sourcetype={Conference},
2062 }
2063
2064 @UNPUBLISHED{eck:irisworkshop2004,
2065 author = {Eck, Douglas},
2066 title = {Using Autocorrelation to Bridge Long Timelags when Learning Sequences of Music},
2067 year = {2004},
2068 note = {IRIS 2004 Machine Learning Workshop, Ottawa, Canada},
2069 abstract = {Slides and musical examples available on request.},
2070 source={OwnPublication},
2071 sourcetype={Workshop},
2075 }
2076
2077 @article{eck:jnmr2001,
2078   author     = {Eck, Douglas},
2079   title      = {A Positive-Evidence Model for Rhythmical Beat Induction},
2080   journal    = {Journal of New Music Research},
2081   year       = {2001},
2082   volume     = {30},
2083   number     = {2},
2084   pages      = {187--200},
2085   abstract   = {The Normalized Positive (NPOS) model is a rule-based model that predicts downbeat location and pattern complexity in rhythmical patterns. Though derived from several existing models, the NPOS model is particularly effective at making correct predictions while at the same time having low complexity. In this paper, the details of the model are explored and a comparison is made to existing models. Several datasets are used to examine the complexity predictions of the model. Special attention is paid to the model's ability to account for the effects of musical experience on beat induction.},
2086   source     = {OwnPublication},
2087   sourcetype = {Journal},
2088 }
2089
2090 @UNPUBLISHED{eck:mipsworkshop2004,
2091 author = {Eck, Douglas},
2092 title = {Bridging Long Timelags in Music},
2093 year = {2004},
2094 note = {NIPS 2004 Workshop on Music and Machine Learning (MIPS), Whistler, British Columbia},
2095 abstract = {Slides and musical examples available on request.},
2096 source={OwnPublication},
2097 sourcetype={Workshop},
2101 }
2102
2103 @article{eck:mp2006,
2104   author     = {Eck, Douglas},
2105   title      = {Finding Long-Timescale Musical Structure with an Autocorrelation Phase Matrix},
2106   journal    = {Music Perception},
2107   year       = {2006},
2108   volume     = {24},
2109   number     = {2},
2110   pages      = {167--176},
2111   source     = {OwnPublication},
2112   sourcetype = {Journal},
2113 }
2114
2115 @UNPUBLISHED{eck:nipsworkshop2003,
2116 author = {Eck, Douglas},
2117 title = {Time-warped hierarchical structure in music and speech: A sequence prediction challenge},
2118 year = {2003},
2119 note = {NIPS 2003 Workshop on Recurrent Neural Networks, Whistler, British Columbia},
2120 abstract = {Slides and musical examples available on request.},
2121 source={OwnPublication},
2122 sourcetype={Workshop},
2126 }
2127
2128 @UNPUBLISHED{eck:nipsworkshop2006,
2129 author = {Eck, Douglas},
2130 title = {Generating music sequences with an echo state network},
2131 year = {2006},
2132 note = {NIPS 2006 Workshop on Echo State Networks and Liquid State Machines},
2133 abstract = {Slides and musical examples available on request.},
2134 source={OwnPublication},
2135 sourcetype={Workshop},
2139 }
2140
2141 @UNPUBLISHED{eck:nipsworkshop2007,
2142 author = {Eck, Douglas},
2143 title = {Measuring and modeling musical expression},
2144 year = {2007},
2145 note = {NIPS 2007 Workshop on Music, Brain and Cognition},
2146 source={OwnPublication},
2147 sourcetype={Workshop},
2151 }
2152
2153 @ARTICLE{eck:psyres2002,
2154 author = {Eck, Douglas},
2155 title = {Finding Downbeats with a Relaxation Oscillator},
2156 journal = {Psychological Research},
2157 volume = {66},
2158 number = {1},
2159 year = {2002},
2160 pages = {18--25},
2161 abstract = {A relaxation oscillator model of neural spiking dynamics is applied to the task of finding downbeats in rhythmical patterns. The importance of downbeat discovery or {\em beat induction} is discussed, and the relaxation oscillator model is compared to other oscillator models. In a set of computer simulations the model is tested on 35 rhythmical patterns from Povel \& Essens (1985). The model performs well, making good predictions in 34 of 35 cases. In an analysis we identify some shortcomings of the model and relate model behavior to dynamical properties of relaxation oscillators.},
2162 source={OwnPublication},
2163 sourcetype={Journal},
2164 }
2165
2166 @unpublished{eck:rppw2005,
2167   author     = {Eck, Douglas},
2168   title      = {Meter and Autocorrelation},
2169   note       = {{10th Rhythm Perception and Production Workshop (RPPW), Alden Biesen, Belgium}},
2170   year       = {2005},
2171   source     = {OwnPublication},
2172   sourcetype = {Workshop},
2173 }
2174
2175 @techreport{eck:tr-music2002,
2176   author      = {Eck, Douglas and Schmidhuber, Juergen},
2177   title       = {A First Look at Music Composition using {LSTM} Recurrent Neural Networks},
2178   institution = {IDSIA},
2179   number      = {IDSIA-07-02},
2180   year        = {2002},
2181   abstract    = {In general music composed by recurrent neural networks ({RNN}s) suffers from a lack of global structure. Though networks can learn note-by-note transition probabilities and even reproduce phrases, attempts at learning an entire musical form and using that knowledge to guide composition have been unsuccessful. The reason for this failure seems to be that {RNN}s cannot keep track of temporally distant events that indicate global music structure. Long Short-Term Memory ({LSTM}) has succeeded in similar domains where other {RNN}s have failed, such as timing \& counting and CSL learning. In the current study I show that {LSTM} is also a good mechanism for learning to compose music. I compare this approach to previous attempts, with particular focus on issues of data representation. I present experimental results showing that {LSTM} successfully learns a form of blues music and is able to compose novel (and I believe pleasing) melodies in that style. Remarkably, once the network has found the relevant structure it does not drift from it: {LSTM} is able to play the blues with good timing and proper structure as long as one is willing to listen. {\em Note: This is a more complete version of the 2002 ICANN submission Learning the Long-Term Structure of the Blues.}},
2182   source      = {OwnPublication},
2183   sourcetype  = {TechReport},
2184 }
2185
2186 @TECHREPORT{eck:tr-npos2000,
2187 author = {Eck, Douglas},
2188 title = {A Positive-Evidence Model for Classifying Rhythmical Patterns},
2189 number = {IDSIA-09-00},
2190 year = {2000},
2191 institution = {IDSIA},
2192 abstract = {The Normalized Positive (NPOS) model is a novel matching model that predicts downbeat location and pattern complexity in rhythmical patterns. Though similar models report success, the NPOS model is particularly effective at making these predictions while at the same time being theoretically and mathematically simple. In this paper, the details of the model are explored and a comparison is made to existing models. Several datasets are used to examine the complexity predictions of the model. Special attention is paid to the model's ability to account for the effects of musical experience on rhythm perception.\\ {\em Note: See the 2001 Journal of New Music Research paper ``A Positive-Evidence Model for Rhythmical Beat Induction'' for a newer version of this paper.}},
2193 ps={ftp://ftp.idsia.ch/pub/techrep/IDSIA-09-00.ps.gz},
2194 source={OwnPublication},
2195 sourcetype={TechReport},
2196 }
2197
2198 @techreport{eck:tr-oscnet2001,
2199   author      = {Eck, Douglas},
2200   title       = {A Network of Relaxation Oscillators that Finds Downbeats in Rhythms},
2201   institution = {IDSIA},
2202   number      = {IDSIA-06-01},
2203   year        = {2001},
2204   abstract    = {A network of relaxation oscillators is used to find downbeats in rhythmical patterns. In this study, a novel model is described in detail. Its behavior is tested by exposing it to patterns having various levels of rhythmic complexity. We analyze the performance of the model and relate its success to previous work dealing with fast synchrony in coupled oscillators. \\ {\em Note: See the 2001 ICANN conference proceeding by the same title for a newer version of this paper.}},
2205   ps          = {ftp://ftp.idsia.ch/pub/techrep/IDSIA-06-01.ps.gz},
2206   source      = {OwnPublication},
2207   sourcetype  = {TechReport},
2208 }
2209
2210 @TECHREPORT{eck:tr-tracking2000,
2211 author = {Eck, Douglas},
2212 title = {Tracking Rhythms with a Relaxation Oscillator},
2213 number = {IDSIA-10-00},
2214 year = {2000},
2215 institution = {IDSIA},
2216 abstract = {A number of biological and mechanical processes are typified by a continued slow accrual and fast release of energy. A nonlinear oscillator exhibiting this slow-fast behavior is called a relaxation oscillator and is used to model, for example, human heartbeat pacemaking and neural action potential. Similar limit cycle oscillators are used to model a wider range of behaviors including predator-prey relationships and synchrony in animal populations such as fireflies. Though nonlinear limit-cycle oscillators have been successfully applied to beat induction, relaxation oscillators have received less attention. In this work we offer a novel and effective relaxation oscillator model of beat induction. We outline the model in detail and provide a perturbation analysis of its response to external stimuli. In a series of simulations we expose the model to patterns from Experiment 1 of Povel \& Essens (1985). We then examine the beat assignments of the model. Although the overall performance of the model is very good, there are shortcomings. We believe that a network of mutually-coupled oscillators will address many of these shortcomings, and we suggest an appropriate course for future research.\\ {\em Note: See the 2002 {\em Psychological Research} article ``Finding Downbeats with a Relaxation Oscillator'' for a revised but less detailed version of this paper.}},
2217 ps={ftp://ftp.idsia.ch/pub/techrep/IDSIA-10-00.ps.gz},
2218 source={OwnPublication},
2219 sourcetype={TechReport},
2220 }
2221
2222 @techreport{eck:tr-tracking2002,
2223   author      = {Eck, Douglas},
2224   title       = {Real-Time Musical Beat Induction with Spiking Neural Networks},
2225   institution = {IDSIA},
2226   number      = {IDSIA-22-02},
2227   year        = {2002},
2228   abstract    = {Beat induction is best described by analogy to the activities of hand clapping or foot tapping, and involves finding important metrical components in an auditory signal, usually music. Though beat induction is intuitively easy to understand it is difficult to define and still more difficult to perform automatically. We will present a model of beat induction that uses a spiking neural network as the underlying synchronization mechanism. This approach has some advantages over existing methods; it runs online, responds at many levels in the metrical hierarchy, and produces good results on performed music (Beatles piano performances encoded as MIDI). In this paper the model is described in some detail and simulation results are discussed.},
2229   source      = {OwnPublication},
2230   sourcetype  = {TechReport},
2231 }
2232
2233 @UNPUBLISHED{eck:verita2002,
2234 author = {Eck, Douglas},
2235 title = {Real Time Beat Induction with Spiking Neurons},
2236 year = {2002},
2237 note = {{Music, Motor Control and the Mind: Symposium at Monte Verita, May}},
2238 abstract = {Beat induction is best described by analogy to the activities of hand clapping or foot tapping, and involves finding important metrical components in an auditory signal, usually music. Though beat induction is intuitively easy to understand it is difficult to define and still more difficult to model. I will discuss an approach to beat induction that uses a network of spiking neurons to synchronize with periodic components in a signal at many timescales. Through a competitive process, groups of oscillators embodying a particular metrical interpretation (e.g. ``4/4'') are selected from the network and used to track the pattern. I will compare this model to other approaches including a traditional symbolic AI system (Dixon 2001), and one based on Bayesian statistics (Cemgil et al, 2001). Finally I will present performance results of the network on a set of MIDI-recorded piano performances of Beatles songs collected by the Music, Mind, Machine Group, NICI, University of Nijmegen (see Cemgil et al, 2001 for more details or http://www.nici.kun.nl/mmm).},
2239 source={OwnPublication},
2240 sourcetype={Workshop},
2241 }
2242
2243 @INPROCEEDINGS{ElHihi+Bengio-nips8,
2244 author = {El Hihi, Salah and Bengio, Yoshua},
2245 title = {Hierarchical Recurrent Neural Networks for Long-Term Dependencies},
2246 year = {1996},
2247 crossref = {NIPS8-shorter},
2248 abstract = {We have already shown that extracting long-term dependencies from sequential data is difficult, both for deterministic dynamical systems such as recurrent networks, and probabilistic models such as hidden {Markov} models ({HMM}s) or input/output hidden {Markov} models ({IOHMM}s). In practice, to avoid this problem, researchers have used domain specific a-priori knowledge to give meaning to the hidden or state variables representing past context. In this paper we propose to use a more general type of a-priori knowledge, namely that the temporal dependencies are structured hierarchically. This implies that long-term dependencies are represented by variables with a long time scale. This principle is applied to a recurrent network which includes delays and multiple time scales. Experiments confirm the advantages of such structures. A similar approach is proposed for {HMM}s and {IOHMM}s.},
2249 topics={LongTerm},cat={C},
2250 }
2251
2252 @article{Erhan+al-2010,
2253   author   = {Erhan, Dumitru and Bengio, Yoshua and Courville, Aaron and Manzagol, Pierre-Antoine and Vincent, Pascal and Bengio, Samy},
2254   title    = {Why Does Unsupervised Pre-training Help Deep Learning?},
2255   journal  = {Journal of Machine Learning Research},
2256   year     = {2010},
2257   volume   = {11},
2258   pages    = {625--660},
2259   abstract = {Much recent research has been devoted to learning algorithms for deep architectures such as Deep Belief Networks and stacks of auto-encoder variants, with impressive results obtained in several areas, mostly on vision and language datasets. The best results obtained on supervised learning tasks involve an unsupervised learning component, usually in an unsupervised pre-training phase. Even though these new algorithms have enabled training deep models, many questions remain as to the nature of this difficult learning problem. The main question investigated here is the following: why does unsupervised pre-training work and why does it work so well? Answering these questions is important if learning in deep architectures is to be further improved. We propose several explanatory hypotheses and test them through extensive simulations. We empirically show the influence of pre-training with respect to architecture depth, model capacity, and number of training examples. The experiments confirm and clarify the advantage of unsupervised pre-training. The results suggest that unsupervised pre-training guides the learning towards basins of attraction of minima that are better in terms of the underlying data distribution; the evidence from these results supports a regularization explanation for the effect of pre-training.},
2260 }
2261
2262 @INPROCEEDINGS{Erhan-aistats-2010,
2263 author = {Erhan, Dumitru and Courville, Aaron and Bengio, Yoshua and Vincent, Pascal},
2264 title = {Why Does Unsupervised Pre-training Help Deep Learning?},
2265 booktitle = {Proceedings of AISTATS 2010},
2266 volume = {9},
2267 year = {2010},
2268 pages = {201--208},
2269 abstract = {Much recent research has been devoted to learning algorithms for deep architectures such as Deep Belief Networks and stacks of auto-encoder variants with impressive results being obtained in several areas, mostly on vision and language datasets. The best results obtained on supervised learning tasks often involve an unsupervised learning component, usually in an unsupervised pre-training phase. The main question investigated here is the following: why does unsupervised pre-training work so well? Through extensive experimentation, we explore several possible explanations discussed in the literature including its action as a regularizer (Erhan et al. 2009) and as an aid to optimization (Bengio et al. 2007). Our results build on the work of Erhan et al. 2009, showing that unsupervised pre-training appears to play predominantly a regularization role in subsequent supervised training. However our results in an online setting, with a virtually unlimited data stream, point to a somewhat more nuanced interpretation of the roles of optimization and regularization in the unsupervised pre-training effect.}
2270 }
2271
2272 @MASTERSTHESIS{Erhan-MSc,
2273 author = {Erhan, Dumitru},
2274 keywords = {Apprentissage multit{\^{a}}che, Filtrage collaboratif, M{\'{e}}thodes {\`{a}} noyaux, QSAR, R{\'{e}}seaux de neurones},
2275 title = {Collaborative filtering techniques for drug discovery},
2276 year = {2006},
2277 school = {Universit{\'{e}} de Montr{\'{e}}al},
2278 abstract = {Cette th{\`{e}}se examine le probl{\`{e}}me d'apprendre plusieurs t{\^{a}}ches simultan{\'{e}}ment,
2279 afin de transf{\'{e}}rer les connaissances apprises {\`{a}} une nouvelle t{\^{a}}che. Si
2280 on suppose que les t{\^{a}}ches partagent une repr{\'{e}}sentation et qu'il est possible de
2281 d{\'{e}}couvrir cette repr{\'{e}}sentation efficacement, cela peut nous servir {\`{a}} construire un
2282 meilleur mod{\`{e}}le de la nouvelle t{\^{a}}che. Il existe plusieurs variantes de
2283 cette m{\'{e}}thode: transfert inductif, apprentissage multit{\^{a}}che, filtrage
2284 collaboratif etc. Nous avons {\'{e}}valu{\'{e}} plusieurs algorithmes d'apprentissage
2285 supervis{\'{e}} pour d{\'{e}}couvrir des repr{\'{e}}sentations partag{\'{e}}es parmi les
2286 t{\^{a}}ches d{\'{e}}finies dans un probl{\`{e}}me de chimie computationnelle. Nous avons
2287 formul{\'{e}} le probl{\`{e}}me dans un cadre d'apprentissage automatique,
2288 fait l'analogie avec les algorithmes standards de filtrage collaboratif et construit les
2289 hypoth{\`{e}}ses g{\'{e}}n{\'{e}}rales qui devraient {\^{e}}tre test{\'{e}}es pour valider l'utilisation des
2290 algorithmes multit{\^{a}}che. Nous avons aussi {\'{e}}valu{\'{e}} la performance des algorithmes
2291 d'apprentissage utilis{\'{e}}s et d{\'{e}}montrons qu'il est, en effet, possible de trouver une
2292 repr{\'{e}}sentation partag{\'{e}}e pour le probl{\`{e}}me consid{\'{e}}r{\'{e}}. Du point de vue
2293 th{\'{e}}orique, notre apport est une modification d'un algorithme
2294 standard---les machines {\`{a}} vecteurs de support---qui produit des r{\'{e}}sultats
2295 comparables aux meilleurs algorithmes disponibles et qui utilise {\`{a}} fond les
2296 concepts de l'apprentissage multit{\^{a}}che. Du point de vue pratique, notre
2297 apport est l'utilisation de notre algorithme par les compagnies
2298 pharmaceutiques dans leur d{\'{e}}couverte de nouveaux m{\'{e}}dicaments.}
2299 }
2300
2301 @INPROCEEDINGS{Erhan2009,
2302 author = {Erhan, Dumitru and Manzagol, Pierre-Antoine and Bengio, Yoshua and Bengio, Samy and Vincent, Pascal},
2303 keywords = {Deep Networks},
2304 title = {The Difficulty of Training Deep Architectures and the Effect of Unsupervised Pre-Training},
2305 year = {2009},
2306 pages = {153--160},
2307 crossref = {xAISTATS2009-shorter},
2308 abstract = {Whereas theoretical work suggests that deep architectures might be more efficient at representing highly-varying functions, training deep architectures was unsuccessful until the recent advent of algorithms based on unsupervised pretraining. Even though these new algorithms have enabled training deep models, many questions remain as to the nature of this difficult learning problem. Answering these questions is important if learning in deep architectures is to be further improved. We attempt to shed some light on these questions through extensive simulations. The experiments confirm and clarify the advantage of unsupervised pre-training. They demonstrate the robustness of the training procedure with respect to the random initialization, the positive effect of pre-training in terms of optimization and its role as a regularizer. We empirically show the influence of pre-training with respect to architecture depth, model capacity, and number of training examples.}
2309 }
2310
2311 @ARTICLE{gasser+eck+port:1999,
2312 author = {Gasser, M. and Eck, Douglas and Port, Robert},
2313 title = {Meter as Mechanism: A Neural Network Model that Learns Metrical Patterns},
2314 journal = {Connection Science},
2315 volume = {11},
2316 number = {2},
2317 year = {1999},
2318 pages = {187--216},
2319 abstract = {One kind of prosodic structure that apparently underlies both music and some examples of speech production is meter. Yet detailed measurements of the timing of both music and speech show that the nested periodicities that define metrical structure can be quite noisy in time. What kind of system could produce or perceive such variable metrical timing patterns? And what would it take to be able to store and reproduce particular metrical patterns from long-term memory? We have developed a network of coupled oscillators that both produces and perceives patterns of pulses that conform to particular meters. In addition, beginning with an initial state with no biases, it can learn to prefer the particular meter that it has been previously exposed to.},
2320 own={Have},
2321 source={OwnPublication},
2322 sourcetype={Journal},
2323 }
2324
2325 @TECHREPORT{gasser+eck+port:tr-1996,
2326 author = {Gasser, M. and Eck, Douglas and Port, Robert},
2327 title = {Meter as Mechanism: A Neural Network that Learns Metrical Patterns},
2328 number = {180},
2329 year = {1996},
2330 institution = {Indiana University Cognitive Science Program},
2331 source={OwnPublication},
2332 sourcetype={TechReport},
2333 }
2334
2335 @INPROCEEDINGS{gasser+eck:1996,
2336 author = {Gasser, M. and Eck, Douglas},
2338 title = {Representing Rhythmic Patterns in a Network of Oscillators},
2339 booktitle = {{The Proceedings of the International Conference on Music Perception and Cognition}},
2340 number = {4},
2341 year = {1996},
2342 pages = {361--366},
2343 publisher = {Lawrence Erlbaum Associates},
2344 url = {http://www.iro.umontreal.ca/~eckdoug/papers/1996_gasser_icmpc.pdf},
2345 abstract = {This paper describes an evolving computational model of the perception and production of simple rhythmic patterns. The model consists of a network of oscillators of different resting frequencies which couple with input patterns and with each other. Oscillators whose frequencies match periodicities in the input tend to become activated. Metrical structure is represented explicitly in the network in the form of clusters of oscillators whose frequencies and phase angles are constrained to maintain the harmonic relationships that characterize meter. Rests in rhythmic patterns are represented by explicit rest oscillators in the network, which become activated when an expected beat in the pattern fails to appear. The model makes predictions about the relative difficulty of patterns and the effect of deviations from periodicity in the input.},
2346 source={OwnPublication},
2347 sourcetype={Conference},
2348 }
2349
2350 @INPROCEEDINGS{gers+eck+schmidhuber:icann2001,
2351 author = {Gers, F. A. and Eck, Douglas and Schmidhuber, Juergen},
2352 editor = {Dorffner, Georg},
2353 title = {Applying {LSTM} to Time Series Predictable Through Time-Window Approaches},
2354 booktitle = {{Artificial Neural Networks -- ICANN 2001 (Proceedings)}},
2355 year = {2001},
2356 pages = {669--676},
2357 publisher = {Springer},
2358 url = {http://www.iro.umontreal.ca/~eckdoug/papers/2001_gers_icann.pdf},
2359 abstract = {Long Short-Term Memory ({LSTM}) is able to solve many time series tasks unsolvable by feed-forward networks using fixed size time windows. Here we find that {LSTM}'s superiority does {\em not} carry over to certain simpler time series tasks solvable by time window approaches: the Mackey-Glass series and the Santa Fe {FIR} laser emission series (Set A). This suggests to use {LSTM} only when simpler traditional approaches fail.},
2360 source={OwnPublication},
2361 sourcetype={Conference},
2362 }
2363
2364 @TECHREPORT{gers+eck+schmidhuber:tr-2000,
2365 author = {Gers, F. A. and Eck, Douglas and Schmidhuber, Juergen},
2366 title = {Applying {LSTM} to Time Series Predictable Through Time-Window Approaches},
2367 number = {IDSIA-22-00},
2368 year = {2000},
2369 institution = {IDSIA},
2370 abstract = {Long Short-Term Memory ({LSTM}) is able to solve many time series tasks unsolvable by feed-forward networks using fixed size time windows. Here we find that {LSTM}'s superiority does {\em not} carry over to certain simpler time series tasks solvable by time window approaches: the Mackey-Glass series and the Santa Fe {FIR} laser emission series (Set A). This suggests to use {LSTM} only when simpler traditional approaches fail.\\ {\em Note: See the 2001 ICANN conference proceeding by the same title for a newer version of this paper.}},
2371 ps={ftp://ftp.idsia.ch/pub/techrep/IDSIA-22-00.ps.gz},
2372 source={OwnPublication},
2373 sourcetype={TechReport},
2374 }
2375
2376 @INPROCEEDINGS{gers+perez+eck+schmidhuber:esann2002,
2377 author = {Gers, F. A. and Perez-Ortiz, J. A. and Eck, Douglas and Schmidhuber, Juergen},
2378 title = {{DEKF-LSTM}},
2379 booktitle = {Proceedings of the 10th European Symposium on Artificial Neural Networks, ESANN 2002},
2380 year = {2002},
2381 source={OwnPublication},
2382 sourcetype={Conference},
2383 }
2384
2385 @INPROCEEDINGS{gers+perez+eck+schmidhuber:icannA2002,
2386 author = {Gers, F. A. and Perez-Ortiz, J. A. and Eck, Douglas and Schmidhuber, Juergen},
2387 editor = {Dorronsoro, J.},
2388 title = {Learning Context Sensitive Languages with {LSTM} Trained with {Kalman} Filters},
2389 booktitle = {{Artificial Neural Networks -- ICANN 2002 (Proceedings)}},
2390 year = {2002},
2391 pages = {655--660},
2392 publisher = {Springer},
2393 abstract = {Unlike traditional recurrent neural networks, the Long Short-Term Memory ({LSTM}) model generalizes well when presented with training sequences derived from regular and also simple nonregular languages. Our novel combination of {LSTM} and the decoupled extended Kalman filter, however, learns even faster and generalizes even better, requiring only the 10 shortest exemplars ($n \leq 10$) of the context sensitive language $a^n b^n c^n$ to deal correctly with values of $n$ up to 1000 and more. Even when we consider the relatively high update complexity per timestep, in many cases the hybrid offers faster learning than {LSTM} by itself.},
2394 source={OwnPublication},
2395 sourcetype={Conference},
2396 }
2397
2398 @PHDTHESIS{Ghosn-Phd-2003,
2399 author = {Ghosn, Joumana},
2400 title = {Apprentissage multi-t{\^{a}}ches et partage de connaissances},
2401 year = {2003},
2402 school = {Universit{\'{e}} de Montr{\'{e}}al}
2403 }
2404
2405 @INPROCEEDINGS{ghosn97,
2406 author = {Ghosn, Joumana and Bengio, Yoshua},
2407 title = {Multi-Task Learning for Stock Selection},
2408 year = {1997},
2409 pages = {946--952},
2410 publisher = {MIT Press, Cambridge, MA},
2411 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/multitask-nips97.pdf},
2412 crossref = {NIPS9},
2413 abstract = {Artificial Neural Networks can be used to predict future returns of stocks in order to take financial decisions. Should one build a separate network for each stock or share the same network for all the stocks? In this paper we also explore other alternatives, in which some layers are shared and others are not shared. When the prediction of future returns for different stocks are viewed as different tasks, sharing some parameters across stocks is a form of multi-task learning. In a series of experiments with Canadian stocks, we obtain yearly returns that are more than 14\% above various benchmarks.},
2414 topics={MultiTask,Finance},cat={C},
2415 }
2416
2417 @TECHREPORT{Gingras-asynchronous-TR96,
2418 author = {Gingras, Fran{\c c}ois and Bengio, Yoshua},
2419 title = {Handling asynchronous or missing financial data with recurrent networks},
2420 number = {1020},
2421 year = {1996},
2422 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
2423 topics={Finance,Missing},cat={T},
2424 }
2425
2426 @TECHREPORT{Gingras-financial-TR99,
2427 author = {Gingras, Fran{\c c}ois and Bengio, Yoshua and Nadeau, Claude},
2428 title = {On Out-of-Sample Statistics for Financial Time-Series},
2429 number = {2585},
2430 year = {1999},
2431 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
2432 topics={Comparative,Finance},cat={T},
2433 }
2434
2435 @INPROCEEDINGS{gingras2000,
2436 author = {Gingras, Fran{\c c}ois and Bengio, Yoshua and Nadeau, Claude},
2437 title = {On Out-of-Sample Statistics for Time-Series},
2438 booktitle = {Computational Finance 2000},
2439 year = {2000},
2440 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/out-err-cf2000.pdf},
2441 abstract = {This paper studies an out-of-sample statistic for time-series prediction that is analogous to the widely used $R^2$ in-sample statistic. We propose and study methods to estimate the variance of this out-of-sample statistic. We suggest that the out-of-sample statistic is more robust to distributional and asymptotic assumptions behind many tests for in-sample statistics. Furthermore we argue that it may be more important in some cases to choose a model that generalizes as well as possible rather than choose the parameters that are closest to the true parameters. Comparative experiments are performed on a financial time-series (daily and monthly returns of the TSE300 index). The experiments are performed for varying prediction horizons and we study the relation between predictability (out-of-sample $R^2$), variability of the out-of-sample $R^2$ statistic, and the prediction horizon.},
2442 topics={Comparative,Finance},cat={C},
2443 }
2444
2445 @INPROCEEDINGS{GlorotAISTATS2010,
2446 author = {Glorot, Xavier and Bengio, Yoshua},
2447 title = {Understanding the difficulty of training deep feedforward neural networks},
2448 booktitle = {Proceedings of AISTATS 2010},
2449 volume = {9},
2450 year = {2010},
2451 pages = {249--256},
2452 abstract = {Whereas before 2006 it appears that deep multi-layer neural networks were not successfully trained, since then several algorithms have been shown to successfully train them, with experimental results showing the superiority of deeper vs less deep architectures. All these experimental results were obtained with new initialization or training mechanisms. Our objective here is to understand better why standard gradient descent from random initialization is doing so poorly with deep neural networks, to better understand these recent relative successes and help design better algorithms in the future. We first observe the influence of the non-linear activations functions. We find that the logistic sigmoid activation is unsuited for deep networks with random initialization because of its mean value, which can drive especially the top hidden layer into saturation. Surprisingly, we find that saturated units can move out of saturation by themselves, albeit slowly, and explaining the plateaus sometimes seen when training neural networks. We find that a new non-linearity that saturates less can often be beneficial. Finally, we study how activations and gradients vary across layers and during training, with the idea that training may be more difficult when the singular values of the Jacobian associated with each layer are far from 1. Based on these considerations, we propose a new initialization scheme that brings substantially faster convergence.}
2453 }
2454
2455 @INPROCEEDINGS{Gori89,
2456 author = {Gori, Marco and Bengio, Yoshua and De Mori, Renato},
2457 title = {{BPS}: a learning algorithm for capturing the dynamic nature of speech},
2458 booktitle = {International Joint Conference on Neural Networks},
2459 volume = {2},
2460 year = {1989},
2461 pages = {417--424},
2462 publisher = {IEEE, New York},
2463 topics={Speech},cat={C},
2464 }
2465
2466 @INCOLLECTION{Grandvalet+Bengio-ssl-2006,
2467 author = {Grandvalet, Yves and Bengio, Yoshua},
2468 editor = {Chapelle, Olivier and {Sch{\"{o}}lkopf}, Bernhard and Zien, Alexander},
2469 title = {Entropy Regularization},
2470 booktitle = {Semi-Supervised Learning},
2471 year = {2006},
2472 pages = {151--168},
2473 publisher = {{MIT} Press},
2474 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/entropy_regularization_2006.pdf},
2475 abstract = {The problem of semi-supervised induction consists in learning a decision rule from
2476 labeled and unlabeled data. This task can be undertaken by discriminative methods,
2477 provided that learning criteria are adapted consequently. In this chapter, we motivate the use of entropy regularization as a means to benefit from unlabeled data in
2478 the framework of maximum a posteriori estimation. The learning criterion is derived
2479 from clearly stated assumptions and can be applied to any smoothly parametrized
2480 model of posterior probabilities. The regularization scheme favors low density separation, without any modeling of the density of input features. The contribution
2481 of unlabeled data to the learning criterion induces local optima, but this problem
2482 can be alleviated by deterministic annealing. For well-behaved models of posterior
2483 probabilities, deterministic annealing {EM} provides a decomposition of the learning
2484 problem in a series of concave subproblems. Other approaches to the semi-supervised
2485 problem are shown to be close relatives or limiting cases of entropy regularization.
2486 A series of experiments illustrates the good behavior of the algorithm in terms of
2487 performance and robustness with respect to the violation of the postulated low density separation assumption. The minimum entropy solution benefits from unlabeled
2488 data and is able to challenge mixture models and manifold learning in a number of
2489 situations.},
2490 cat={B},topics={Unsupervised},
2491 }
2492
2493 @INPROCEEDINGS{graves+eck+schmidhuber:bio-adit2004,
2494 author = {Graves, A. and Eck, Douglas and Beringer, N. and Schmidhuber, Juergen},
2495 title = {Biologically Plausible Speech Recognition with {LSTM} Neural Nets},
2496 booktitle = {Proceedings of the First Int'l Workshop on Biologically Inspired Approaches to Advanced Information Technology (Bio-ADIT)},
2497 year = {2004},
2498 pages = {127--136},
2499 url = {http://www.iro.umontreal.ca/~eckdoug/papers/2004_bioadit.pdf},
2500 abstract = {Long Short-Term Memory ({LSTM}) recurrent neural networks ({RNN}s) are local in space and time and closely related to a biological model of memory in the prefrontal cortex. Not only are they more biologically plausible than previous artificial {RNN}s, they also outperformed them on many artificially generated sequential processing tasks. This encouraged us to apply {LSTM} to more realistic problems, such as the recognition of spoken digits. Without any modification of the underlying algorithm, we achieved results comparable to state-of-the-art Hidden {Markov} Model ({HMM}) based recognisers on both the {TIDIGITS} and TI46 speech corpora. We conclude that {LSTM} should be further investigated as a biologically plausible basis for a bottom-up, neural net-based approach to speech recognition.},
2501 source={OwnPublication},
2502 sourcetype={Conference},
2503 }
2504
2505 @TECHREPORT{graves+eck+schmidhuber:tr-digits2003,
2506 author = {Graves, A. and Eck, Douglas and Schmidhuber, Juergen},
2507 title = {Comparing {LSTM} Recurrent Networks and Spiking Recurrent Networks on the Recognition of Spoken Digits},
2508 number = {IDSIA-13-03},
2509 year = {2003},
2510 institution = {IDSIA},
2511 abstract = {One advantage of spiking recurrent neural networks ({SNN}s) is an ability to categorise data using a synchrony-based latching mechanism. This is particularly useful in problems where timewarping is encountered, such as speech recognition. Differentiable recurrent neural networks ({RNN}s) by contrast fail at tasks involving difficult timewarping, despite having sequence learning capabilities superior to {SNN}s. In this paper we demonstrate that Long Short-Term Memory ({LSTM}) is an {RNN} capable of robustly categorizing timewarped speech data, thus combining the most useful features of both paradigms. We compare its performance to {SNN}s on two variants of a spoken digit identification task, using data from an international competition. The first task (described in Nature (Nadis 2003)) required the categorisation of spoken digits with only a single training exemplar, and was specifically designed to test robustness to timewarping. Here {LSTM} performed better than all the {SNN}s in the competition. The second task was to predict spoken digits using a larger training set. Here {LSTM} greatly outperformed an {SNN}-like model found in the literature. These results suggest that {LSTM} has a place in domains that require the learning of large timewarped datasets, such as automatic speech recognition.},
2512 source={OwnPublication},
2513 sourcetype={TechReport},
2514 }
2515
2516 @INPROCEEDINGS{haffner-98,
2517 author = {Haffner, Patrick and Bottou, {L{\'{e}}on} and Howard, Paul G. and Simard, Patrice and Bengio, Yoshua and {LeCun}, Yann},
2518 title = {Browsing through High Quality Document Images with {DjVu}},
2519 booktitle = {Proc. of Advances in Digital Libraries 98},
2520 year = {1998},
2521 pages = {309--318},
2522 publisher = {IEEE},
2523 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/haffner-98.ps.gz},
2524 topics={HighDimensional},cat={C},
2525 }
2526
2527 @INPROCEEDINGS{Hamel+al-2009,
2528 author = {Hamel, Philippe and Wood, Sean and Eck, Douglas},
2529 title = {Automatic Identification of Instrument Classes in Polyphonic and Poly-Instrument Audio},
2530 booktitle = {10th International Society for Music Information Retrieval Conference},
2531 year = {2009},
2532 pages = {399--404},
2533 url = {http://ismir2009.ismir.net/proceedings/PS3-2.pdf},
2534 abstract = {We present and compare several models for automatic identification of instrument classes in polyphonic and poly-instrument audio. The goal is to be able to identify which categories of instrument (Strings, Woodwind, Guitar, Piano, etc.) are present in a given audio example. We use a machine learning approach to solve this task. We constructed a system to generate a large database of musically relevant poly-instrument audio. Our database is generated from hundreds of instruments classified in 7 categories. Musical audio examples are generated by mixing multi-track MIDI files with thousands of instrument combinations. We compare three different classifiers: a Support Vector Machine ({SVM}), a Multilayer Perceptron (MLP) and a Deep Belief Network (DBN). We show that the DBN tends to outperform both the {SVM} and the MLP in most cases.}
2535 }
2536
2537 @MISC{Hugo+al-snowbird-2007,
2538 author = {Larochelle, Hugo and Bengio, Yoshua and Erhan, Dumitru},
2539 title = {Generalization to a zero-data task: an empirical study},
2540 year = {2007},
2541 howpublished = {Talk and poster presented at the Learning Workshop (Snowbird), San Juan, Puerto Rico, 2007}
2542 }
2543
2544 @INPROCEEDINGS{hyper:2000:ijcnn,
2545 author = {Bengio, Yoshua},
2546 title = {Continuous Optimization of Hyper-Parameters},
2547 booktitle = {International Joint Conference on Neural Networks 2000},
2548 volume = {I},
2549 year = {2000},
2550 pages = {305--310},
2551 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/hyper-ijcnn2000.pdf},
2552 abstract = {Many machine learning algorithms can be formulated as the minimization of a training criterion which involves a hyper-parameter. This hyper-parameter is usually chosen by trial and error with a model selection criterion. In this paper we present a methodology to optimize several hyper-parameters, based on the computation of the gradient of a model selection criterion with respect to the hyper-parameters. In the case of a quadratic training criterion, the gradient of the selection criterion with respect to the hyper-parameters is efficiently computed by back-propagating through a Cholesky decomposition. In the more general case, we show that the implicit function theorem can be used to derive a formula for the hyper-parameter gradient involving second derivatives of the training criterion.},
2553 topics={ModelSelection},cat={C},
2554 }
2555
2556 @INPROCEEDINGS{ICML01,
2557 editor = {Brodley, Carla E. and Danyluk, Andrea Pohoreckyj},
2558 title = {Proceedings of the Eighteenth International Conference on Machine Learning (ICML'01)},
2559 booktitle = {Proceedings of the Eighteenth International Conference on Machine Learning (ICML'01)},
2560 year = {2001},
2561 publisher = {Morgan Kaufmann}
2562 }
2563
2564 @INPROCEEDINGS{ICML01-short,
2565 editor = {Brodley, Carla E. and Danyluk, Andrea Pohoreckyj},
2566 title = {Proceedings of the Eighteenth International Conference on Machine Learning (ICML'01)},
2567 booktitle = {ICML'01},
2568 year = {2001},
2569 publisher = {Morgan Kaufmann}
2570 }
2571
2572
2573 @INPROCEEDINGS{ICML02,
2574 editor = {Sammut, Claude and Hoffmann, Achim G.},
2575 title = {Proceedings of the Nineteenth International Conference on Machine Learning (ICML'02)},
2576 booktitle = {Proceedings of the Nineteenth International Conference on Machine Learning (ICML'02)},
2577 year = {2002},
2578 publisher = {Morgan Kaufmann}
2579 }
2580
2581 @INPROCEEDINGS{ICML02-short,
2582 editor = {Sammut, Claude and Hoffmann, Achim G.},
2583 title = {Proceedings of the Nineteenth International Conference on Machine Learning (ICML'02)},
2584 booktitle = {ICML'02},
2585 year = {2002},
2586 publisher = {Morgan Kaufmann}
2587 }
2588
2589
2590 @INPROCEEDINGS{ICML03,
2591 editor = {Fawcett, Tom and Mishra, Nina},
2592 title = {Proceedings of the Twentieth International Conference on Machine Learning (ICML'03)},
2593 booktitle = {Proceedings of the Twentieth International Conference on Machine Learning (ICML'03)},
2594 year = {2003},
2595 publisher = {AAAI Press}
2596 }
2597
2598 @INPROCEEDINGS{ICML03-short,
2599 editor = {Fawcett, Tom and Mishra, Nina},
2600 title = {Proceedings of the Twentieth International Conference on Machine Learning (ICML'03)},
2601 booktitle = {ICML'03},
2602 year = {2003},
2603 publisher = {AAAI Press}
2604 }
2605
2606
2607 @INPROCEEDINGS{ICML04,
2608 editor = {Brodley, Carla E.},
2609 title = {Proceedings of the Twenty-first International Conference on Machine Learning (ICML'04)},
2610 booktitle = {Proceedings of the Twenty-first International Conference on Machine Learning (ICML'04)},
2611 year = {2004},
2612 publisher = {ACM}
2613 }
2614
2615 @INPROCEEDINGS{ICML04-short,
2616 editor = {Brodley, Carla E.},
2617 title = {Proceedings of the Twenty-first International Conference on Machine Learning (ICML'04)},
2618 booktitle = {ICML'04},
2619 year = {2004},
2620 publisher = {ACM}
2621 }
2622
2623
2624 @INPROCEEDINGS{ICML05-short,
2625 editor = {De Raedt, Luc and Wrobel, Stefan},
2626 title = {Proceedings of the Twenty-second International Conference on Machine Learning (ICML'05)},
2627 booktitle = {ICML'05},
2628 year = {2005},
2629 publisher = {ACM}
2630 }
2631
2632
2633 @INPROCEEDINGS{ICML06-short,
2634 editor = {Cohen, William W. and Moore, Andrew},
2635 title = {Proceedings of the Twenty-third International Conference on Machine Learning (ICML'06)},
2636 booktitle = {ICML'06},
2637 year = {2006},
2638 publisher = {ACM}
2639 }
2640
2641
2642 @INPROCEEDINGS{ICML07-short,
2643 editor = {Ghahramani, Zoubin},
2644 title = {Proceedings of the 24th International Conference on Machine Learning (ICML'07)},
2645 booktitle = {ICML'07},
2646 year = {2007},
2647 publisher = {ACM}
2648 }
2649
2650
2651 @INPROCEEDINGS{ICML08-short,
2652 editor = {Cohen, William W. and McCallum, Andrew and Roweis, Sam T.},
2653 title = {Proceedings of the Twenty-fifth International Conference on Machine Learning (ICML'08)},
2654 booktitle = {ICML'08},
2655 year = {2008},
2656 publisher = {ACM}
2657 }
2658
2659
2660 @INPROCEEDINGS{ICML09-short,
2661 editor = {Bottou, {L{\'{e}}on} and Littman, Michael},
2662 title = {Proceedings of the Twenty-sixth International Conference on Machine Learning (ICML'09)},
2663 booktitle = {ICML'09},
2664 year = {2009},
2665 publisher = {ACM}
2666 }
2667
2668
2669 @INPROCEEDINGS{ICML96,
2670 editor = {Saitta, L.},
2671 title = {Proceedings of the Thirteenth International Conference on Machine Learning (ICML'96)},
2672 booktitle = {Proceedings of the Thirteenth International Conference on Machine Learning (ICML'96)},
2673 year = {1996},
2674 publisher = {Morgan Kaufmann}
2675 }
2676
2677 @INPROCEEDINGS{ICML96-short,
2678 editor = {Saitta, L.},
2679 title = {Proceedings of the Thirteenth International Conference on Machine Learning (ICML'96)},
2680 booktitle = {ICML'96},
2681 year = {1996},
2682 publisher = {Morgan Kaufmann}
2683 }
2684
2685
2686 @INPROCEEDINGS{ICML97,
2687 editor = {Fisher, Douglas H.},
2688 title = {{}Proceedings of the Fourteenth International Conference on Machine Learning (ICML'97)},
2689 booktitle = {Proceedings of the Fourteenth International Conference on Machine Learning (ICML'97)},
2690 year = {1997},
2691 publisher = {Morgan Kaufmann}
2692 }
2693
2694 @INPROCEEDINGS{ICML97-short,
2695 editor = {Fisher, Douglas H.},
2696 title = {{}Proceedings of the Fourteenth International Conference on Machine Learning (ICML'97)},
2697 booktitle = {ICML'97},
2698 year = {1997},
2699 publisher = {Morgan Kaufmann}
2700 }
2701
2702
2703 @INPROCEEDINGS{ICML98,
2704 editor = {Shavlik, Jude W.},
2705 title = {Proceedings of the Fifteenth International Conference on Machine Learning (ICML'98)},
2706 booktitle = {Proceedings of the Fifteenth International Conference on Machine Learning (ICML'98)},
2707 year = {1998},
2708 publisher = {Morgan Kaufmann}
2709 }
2710
2711 @INPROCEEDINGS{ICML98-short,
2712 editor = {Shavlik, Jude W.},
2713 title = {Proceedings of the Fifteenth International Conference on Machine Learning (ICML'98)},
2714 booktitle = {ICML'98},
2715 year = {1998},
2716 publisher = {Morgan Kaufmann}
2717 }
2718
2719
2720 @INPROCEEDINGS{ICML99,
2721 editor = {Bratko, Ivan and Dzeroski, Saso},
2722 title = {Proceedings of the Sixteenth International Conference on Machine Learning (ICML'99)},
2723 booktitle = {Proceedings of the Sixteenth International Conference on Machine Learning (ICML'99)},
2724 year = {1999},
2725 publisher = {Morgan Kaufmann}
2726 }
2727
2728 @INPROCEEDINGS{ICML99-short,
2729 editor = {Bratko, Ivan and Dzeroski, Saso},
2730 title = {Proceedings of the Sixteenth International Conference on Machine Learning (ICML'99)},
2731 booktitle = {ICML'99},
2732 year = {1999},
2733 publisher = {Morgan Kaufmann}
2734 }
2735
2736
2737 @INCOLLECTION{jaeger+eck:2007,
2738 author = {Jaeger, H. and Eck, Douglas},
2739 title = {Can't get you out of my head: {A} connectionist model of cyclic rehearsal},
2740 booktitle = {Modeling Communications with Robots and Virtual Humans},
2741 series = {{LNCS}},
2742 year = {2007},
2743 publisher = {Springer-Verlag},
2744 url = {http://www.iro.umontreal.ca/~eckdoug/papers/2007_jaeger_eck.pdf},
2745 source={OwnPublication},
2746 sourcetype={Chapter},
2747 }
2748
2749 @MISC{James+al-snowbird-2008,
2750 author = {Bergstra, James and Bengio, Yoshua and Louradour, Jerome},
2751 title = {Image Classification using Higher-Order Neural Models},
2752 year = {2008},
2753 howpublished = {The Learning Workshop (Snowbird, Utah)},
2754 url = {http://snowbird.djvuzone.org/2007/abstracts/161.pdf}
2755 }
2756
2757
2758 @INPROCEEDINGS{Kegl+Bertin+Eck-2008,
2759 author = {K{\'{e}}gl, Bal{\'{a}}zs and Bertin-Mahieux, Thierry and Eck, Douglas},
2760 title = {Metropolis-Hastings Sampling in a FilterBoost Music Classifier},
2761 booktitle = {Music and machine learning workshop (ICML08)},
2762 year = {2008}
2763 }
2764
2765 @INPROCEEDINGS{kegl2005b,
2766 author = {K{\'{e}}gl, Bal{\'{a}}zs},
2767 title = {Generalization Error and Algorithmic Convergence of Median Boosting},
2768 year = {2005},
2769 crossref = {NIPS17-shorter},
2770 abstract = {We have recently proposed an extension of ADABOOST to regression that uses the median of the base regressors as the final regressor. In this paper we extend theoretical results obtained for ADABOOST to median boosting and to its localized variant. First, we extend recent results on efficient margin maximizing to show that the algorithm can converge to the maximum achievable margin within a preset precision in a finite number of steps. Then we provide confidence-interval-type bounds on the generalization error.}
2771 }
2772
2773 @ARTICLE{lacoste+eck:eurasip,
2774 author = {Lacoste, Alexandre and Eck, Douglas},
2775 title = {A Supervised Classification Algorithm For Note Onset Detection},
2776 journal = {EURASIP Journal on Applied Signal Processing},
2777 volume = {2007},
2778 number = {ID 43745},
2779 year = {2007},
2780 pages = {1--13},
2781 source={OwnPublication},
2782 sourcetype={Journal},
2783 }
2784
2785 @MASTERSTHESIS{Lajoie2009,
2786 author = {Lajoie, Isabelle},
2787 keywords = {apprentissage non-supervis{\'{e}}, architecture profonde, auto-encodeur d{\'{e}}bruiteur, machine de {Boltzmann} restreinte, r{\'{e}}seau de neurones artificiel},
2788 title = {Apprentissage de repr{\'{e}}sentations sur-compl{\`{e}}tes par entra{\^{\i}}nement d'auto-encodeurs},
2789 year = {2009},
2790 school = {Universit{\'{e}} de Montr{\'{e}}al},
2791 abstract = {Les avanc{\'{e}}s dans le domaine de l’intelligence artificielle, permettent {\`{a}} des syst{\`{e}}mes
2792 informatiques de r{\'{e}}soudre des t{\^{a}}ches de plus en plus complexes li{\'{e}}es par exemple {\`{a}}
2793 la vision, {\`{a}} la compr{\'{e}}hension de signaux sonores ou au traitement de la langue. Parmi
2794 les mod{\`{e}}les existants, on retrouve les R{\'{e}}seaux de Neurones Artificiels (RNA), dont la
2795 popularit{\'{e}} a fait un grand bond en avant avec la d{\'{e}}couverte de Hinton et al. [22], soit
2796 l’utilisation de Machines de {Boltzmann} Restreintes (RBM) pour un pr{\'{e}}-entra{\^{\i}}nement
2797 non-supervis{\'{e}} couche apr{\`{e}}s couche, facilitant grandement l’entra{\^{\i}}nement supervis{\'{e}} du
2798 r{\'{e}}seau {\`{a}} plusieurs couches cach{\'{e}}es (DBN), entra{\^{\i}}nement qui s’av{\'{e}}rait jusqu’alors tr{\`{e}}s
2799 difficile {\`{a}} r{\'{e}}ussir. Depuis cette d{\'{e}}couverte, des chercheurs ont {\'{e}}tudi{\'{e}} l’efficacit{\'{e}} de nouvelles strat{\'{e}}gies de pr{\'{e}}-entra{\^{\i}}nement, telles que l’empilement d’auto-encodeurs traditionnels (SAE) [5, 38], et l’empilement d’auto-encodeur d{\'{e}}bruiteur (SDAE) [44].
2800 C’est dans ce contexte qu’a d{\'{e}}but{\'{e}} la pr{\'{e}}sente {\'{e}}tude. Apr{\`{e}}s un bref passage en revue des notions de base du domaine de l’apprentissage machine et des m{\'{e}}thodes de
2801 pr{\'{e}}-entra{\^{\i}}nement employ{\'{e}}es jusqu’{\`{a}} pr{\'{e}}sent avec les modules RBM, AE et DAE, nous
2802 avons approfondi notre compr{\'{e}}hension du pr{\'{e}}-entra{\^{\i}}nement de type SDAE, explor{\'{e}} ses
2803 diff{\'{e}}rentes propri{\'{e}}t{\'{e}}s et {\'{e}}tudi{\'{e}} des variantes de SDAE comme strat{\'{e}}gie d’initialisation
2804 d’architecture profonde. Nous avons ainsi pu, entre autres choses, mettre en lumi{\`{e}}re
2805 l’influence du niveau de bruit, du nombre de couches et du nombre d’unit{\'{e}}s cach{\'{e}}es
2806 sur l’erreur de g{\'{e}}n{\'{e}}ralisation du SDAE. Nous avons constat{\'{e}} une am{\'{e}}lioration de la
2807 performance sur la t{\^{a}}che supervis{\'{e}}e avec l’utilisation des bruits poivre et sel (PS) et
2808 gaussien (GS), bruits s’av{\'{e}}rant mieux justifi{\'{e}}s que celui utilis{\'{e}} jusqu’{\`{a}} pr{\'{e}}sent, soit le
2809 masque {\`{a}} z{\'{e}}ro (MN). De plus, nous avons d{\'{e}}montr{\'{e}} que la performance profitait d’une
2810 emphase impos{\'{e}}e sur la reconstruction des donn{\'{e}}es corrompues durant l’entra{\^{\i}}nement
2811 des diff{\'{e}}rents DAE. Nos travaux ont aussi permis de r{\'{e}}v{\'{e}}ler que le DAE {\'{e}}tait en mesure d’apprendre, sur des images naturelles, des filtres semblables {\`{a}} ceux retrouv{\'{e}}s dans
2812 les cellules V1 du cortex visuel, soit des filtres d{\'{e}}tecteurs de bordures. Nous aurons par
2813 ailleurs pu montrer que les repr{\'{e}}sentations apprises du SDAE, compos{\'{e}}es des caract{\'{e}}ristiques ainsi extraites, s’av{\'{e}}raient fort utiles {\`{a}} l’apprentissage d’une machine {\`{a}} vecteurs de
2814 support ({SVM}) lin{\'{e}}aire ou {\`{a}} noyau gaussien, am{\'{e}}liorant grandement sa performance de
2815 g{\'{e}}n{\'{e}}ralisation. Aussi, nous aurons observ{\'{e}} que similairement au DBN, et contrairement
2816 au SAE, le SDAE poss{\'{e}}dait une bonne capacit{\'{e}} en tant que mod{\`{e}}le g{\'{e}}n{\'{e}}rateur. Nous
2817 avons {\'{e}}galement ouvert la porte {\`{a}} de nouvelles strat{\'{e}}gies de pr{\'{e}}-entra{\^{\i}}nement et d{\'{e}}couvert le potentiel de l’une d’entre elles, soit l’empilement d’auto-encodeurs rebruiteurs
2818 (SRAE).}
2819 }
2820
2821 @INPROCEEDINGS{lamere+eck:ismir2007,
2822 author = {Lamere, Paul and Eck, Douglas},
2823 editor = {},
2824 title = {Using 3D Visualizations to Explore and Discover Music},
2825 booktitle = {{Proceedings of the 8th International Conference on Music Information Retrieval ({ISMIR} 2007)}},
2826 year = {2007},
2827 publisher = {},
2828 source={OwnPublication},
2829 sourcetype={Conference},
2830 }
2831
2832 @ARTICLE{Larochelle+al-2010,
2833 author = {Larochelle, Hugo and Bengio, Yoshua and Turian, Joseph},
2834 title = {Tractable Multivariate Binary Density Estimation and the Restricted {Boltzmann} Forest},
2835 journal = {Neural Computation},
2836 year = {2010},
2837 note = {To appear}
2838 }
2839
2840 @INPROCEEDINGS{Larochelle+Bengio-2008,
2841 author = {Larochelle, Hugo and Bengio, Yoshua},
2842 title = {Classification using Discriminative Restricted {Boltzmann} Machines},
2843 year = {2008},
2844 pages = {536--543},
2845 crossref = {ICML08-shorter},
2846 abstract = {Recently, many applications for Restricted {Boltzmann} Machines (RBMs) have been developed for a large variety of learning problems. However, RBMs are usually used as feature extractors for another learning algorithm or to provide a good initialization
2847 for deep feed-forward neural network classifiers, and are not considered as a standalone solution to classification problems. In
2848 this paper, we argue that RBMs provide a self-contained framework for deriving competitive non-linear classifiers. We present an evaluation of different learning algorithms for
2849 RBMs which aim at introducing a discriminative component to RBM training and improve their performance as classifiers. This
2850 approach is simple in that RBMs are used directly to build a classifier, rather than as a stepping stone. Finally, we demonstrate how discriminative RBMs can also be successfully employed in a semi-supervised setting.}
2851 }
2852
2853 @INPROCEEDINGS{Larochelle-2009,
2854 author = {Larochelle, Hugo and Erhan, Dumitru and Vincent, Pascal},
2855 title = {Deep Learning using Robust Interdependent Codes},
2856 booktitle = {Proceedings of the Twelfth International Conference on Artificial Intelligence and Statistics (AISTATS 2009)},
2857 year = {2009},
2858 pages = {312--319},
2859 date = "April 16-18, 2009",
2860 }
2861
2862 @ARTICLE{Larochelle-jmlr-2009,
2863 author = {Larochelle, Hugo and Bengio, Yoshua and Louradour, Jerome and Lamblin, Pascal},
2864 title = {Exploring Strategies for Training Deep Neural Networks},
2865 volume = {10},
2866 year = {2009},
2867 pages = {1--40},
2868 journal = {Journal of Machine Learning Research},
2869 abstract = {Deep multi-layer neural networks have many levels of non-linearities allowing them to compactly represent highly non-linear and highly-varying functions. However, until recently it was not clear how to train such deep networks, since gradient-based optimization starting from random initialization often appears to get stuck in poor solutions. Hinton et al. recently proposed a greedy layer-wise unsupervised learning procedure relying on the training algorithm of restricted {Boltzmann} machines (RBM) to initialize the parameters of a deep belief network (DBN), a generative model with many layers of hidden causal variables. This was followed by the proposal of another greedy layer-wise procedure, relying on the usage of autoassociator networks. In the context of the above optimization problem, we study these algorithms empirically to better understand their success. Our experiments confirm the hypothesis that the greedy layer-wise unsupervised training strategy helps the optimization by initializing weights in a region near a good local minimum, but also implicitly acts as a sort of regularization that brings better generalization and encourages internal distributed representations that are high-level abstractions of the input. We also present a series of experiments aimed at evaluating the link between the performance of deep neural networks and practical aspects of their topology, for example, demonstrating cases where the addition of more depth helps. Finally, we empirically explore simple variants of these training algorithms, such as the use of different RBM input unit distributions, a simple way of combining gradient estimators to improve performance, as well as on-line versions of those algorithms.}
2870 }
2871
2872 @PHDTHESIS{Larochelle-PhD-2009,
2873 author = {Larochelle, Hugo},
2874 keywords = {apprentissage non-supervis{\'{e}}, architecture profonde, autoassociateur, autoencodeur, machine de {Boltzmann} restreinte, r{\'{e}}seau de neurones artificiel},
2875 title = {{\'{E}}tude de techniques d'apprentissage non-supervis{\'{e}} pour l'am{\'{e}}lioration de l'entra{\^{\i}}nement supervis{\'{e}} de mod{\`{e}}les connexionnistes},
2876 year = {2009},
2877 school = {University of Montr{\'{e}}al},
2878 abstract = {Le domaine de l'intelligence artificielle a pour objectif le d{\'{e}}veloppement de syst{\`{e}}mes informatiques capables de simuler des comportements normalement associ{\'{e}}s {\`{a}} l'intelligence humaine. On aimerait entre autres pouvoir construire une machine qui puisse
2879 r{\'{e}}soudre des t{\^{a}}ches li{\'{e}}es {\`{a}} la vision (e.g., la reconnaissance d'objet), au traitement de la langue (e.g., l'identification du sujet d'un texte) ou au traitement de signaux sonores (e.g., la reconnaissance de la parole).
2880 Une approche d{\'{e}}velopp{\'{e}}e afin de r{\'{e}}soudre ce genre de t{\^{a}}ches est bas{\'{e}}e sur l'apprentissage automatique de mod{\`{e}}les {\`{a}} partir de donn{\'{e}}es {\'{e}}tiquet{\'{e}}es refl{\'{e}}tant le comportement intelligent {\`{a}} {\'{e}}muler. Entre autre, il a {\'{e}}t{\'{e}} propos{\'{e}} de mod{\'{e}}liser le calcul n{\'{e}}cessaire {\`{a}} la
2881 r{\'{e}}solution d'une t{\^{a}}che {\`{a}} l'aide d'un r{\'{e}}seau de neurones artificiel, dont il est possible d'adapter le comportement {\`{a}} l'aide de la r{\'{e}}tropropagation [99, 131] d'un gradient informatif sur les erreurs commises par le r{\'{e}}seau. Populaire durant les ann{\'{e}}es 80, cette
2882 approche sp{\'{e}}cifique a depuis perdu partiellement de son attrait, suite au d{\'{e}}veloppement des m{\'{e}}thodes {\`{a}} noyau. Celles-ci sont souvent plus stables, plus faciles {\`{a}} utiliser et leur performance est souvent au moins aussi {\'{e}}lev{\'{e}}e pour une vaste gamme de probl{\`{e}}mes.
2883 Les m{\'{e}}thodes d'apprentissage automatique ont donc progress{\'{e}} dans leur fonctionnement, mais aussi dans la complexit{\'{e}} des probl{\`{e}}mes auxquels elles se sont attaqu{\'{e}}. Ainsi, plus r{\'{e}}cemment, des travaux [12, 15] ont commenc{\'{e}} {\`{a}} {\'{e}}mettre des doutes sur la capacit{\'{e}} des machines {\`{a}} noyau {\`{a}} pouvoir efficacement r{\'{e}}soudre des probl{\`{e}}mes de la complexit{\'{e}} requise par l'intelligence artificielle. Parall{\`{e}}lement, Hinton et al. [81] faisaient une perc{\'{e}}e dans l'apprentissage automatique de r{\'{e}}seaux de neurones, en proposant une proc{\'{e}}dure permettant l'entra{\^{\i}}nement de r{\'{e}}seaux de neurones d'une plus grande complexit{\'{e}} (i.e., avec plus de couches de neurones cach{\'{e}}es) qu'il n'{\'{e}}tait possible auparavant.
2884 C'est dans ce contexte qu'ont {\'{e}}t{\'{e}} conduits les travaux de cette th{\`{e}}se. Cette th{\`{e}}se d{\'{e}}bute par une exposition des principes de base de l'apprentissage automatique (chapitre 1) et une discussion des obstacles {\`{a}} l'obtention d'un mod{\`{e}}le ayant une bonne performance
2885 de g{\'{e}}n{\'{e}}ralisation (chapitre 2). Puis, sont pr{\'{e}}sent{\'{e}}es les contributions apport{\'{e}}es dans le cadre de cinq articles, contributions qui sont toutes bas{\'{e}}es sur l'utilisation d'une certaine
2886 forme d'apprentissage non-supervis{\'{e}}.
2887 Le premier article (chapitre 4) propose une m{\'{e}}thode d'entra{\^{\i}}nement pour un type sp{\'{e}}cifique de r{\'{e}}seau {\`{a}} une seule couche cach{\'{e}}e (la machine de {Boltzmann} restreinte) bas{\'{e}}e sur une combinaison des apprentissages supervis{\'{e}} et non-supervis{\'{e}}. Cette m{\'{e}}thode permet d'obtenir une meilleure performance de g{\'{e}}n{\'{e}}ralisation qu'un r{\'{e}}seau de neurones standard ou qu'une machine {\`{a}} vecteurs de support {\`{a}} noyau, et met en {\'{e}}vidence de fa{\c c}on
2888 explicite les b{\'{e}}n{\'{e}}fices qu'apporte l'apprentissage non-supervis{\'{e}} {\`{a}} l'entra{\^{\i}}nement d'un r{\'{e}}seau de neurones.
2889 Ensuite, dans le second article (chapitre 6), on {\'{e}}tudie et {\'{e}}tend la proc{\'{e}}dure d'entra{\^{\i}}nement propos{\'{e}}e par Hinton et al. [81]. Plus sp{\'{e}}cifiquement, on y propose une approche diff{\'{e}}rente mais plus flexible pour initialiser un r{\'{e}}seau {\`{a}} plusieurs couches cach{\'{e}}es, bas{\'{e}}e sur un r{\'{e}}seau autoassociateur. On y explore aussi l'impact du nombre de couches et de neurones par couche sur la performance d'un r{\'{e}}seau et on y d{\'{e}}crit diff{\'{e}}rentes variantes mieux adapt{\'{e}}es {\`{a}} l'apprentissage en ligne ou pour donn{\'{e}}es {\`{a}} valeurs continues.
2890 Dans le troisi{\`{e}}me article (chapitre 8), on explore plut{\^{o}}t la performance de r{\'{e}}seaux profonds sur plusieurs probl{\`{e}}mes de classification diff{\'{e}}rents. Les probl{\`{e}}mes choisis ont la propri{\'{e}}t{\'{e}} d'avoir {\'{e}}t{\'{e}} g{\'{e}}n{\'{e}}r{\'{e}}s {\`{a}} partir de plusieurs facteurs de variation. Cette propri{\'{e}}t{\'{e}}, qui caract{\'{e}}rise les probl{\`{e}}mes li{\'{e}}s {\`{a}} l'intelligence artificielle, pose difficult{\'{e}} aux machines {\`{a}} noyau, tel que confirm{\'{e}} par les exp{\'{e}}riences de cet article.
2891 Le quatri{\`{e}}me article (chapitre 10) pr{\'{e}}sente une am{\'{e}}lioration de l'approche bas{\'{e}}e sur les r{\'{e}}seaux autoassociateurs. Cette am{\'{e}}lioration applique une modification simple {\`{a}} la proc{\'{e}}dure d'entra{\^{\i}}nement d'un r{\'{e}}seau autoassociateur, en « bruitant » les entr{\'{e}}es du r{\'{e}}seau afin que celui-ci soit forc{\'{e}} {\`{a}} la d{\'{e}}bruiter.
2892 Le cinqui{\`{e}}me et dernier article (chapitre 12) apporte une autre am{\'{e}}lioration aux r{\'{e}}seaux autoassociateurs, en permettant des interactions d'inhibition ou d'excitation entre les neurones cach{\'{e}}s de ces r{\'{e}}seaux. On y d{\'{e}}montre que de telles interactions peuvent
2893 {\^{e}}tre apprises et sont b{\'{e}}n{\'{e}}fiques {\`{a}} la performance d'un r{\'{e}}seau profond.}
2894 }
2895
2896 @INPROCEEDINGS{Larochelle2008,
2897 author = {Larochelle, Hugo and Erhan, Dumitru and Bengio, Yoshua},
2898 title = {Zero-data Learning of New Tasks},
2899 booktitle = {AAAI Conference on Artificial Intelligence},
2900 year = {2008},
2901 url = {http://www-etud.iro.umontreal.ca/~larocheh/publications/aaai2008_zero-data.pdf},
2902 abstract = {Recently, many applications for Restricted {Boltzmann} Machines (RBMs) have been developed for a large variety of learning problems. However, RBMs are usually used as feature extractors for another learning algorithm or to provide a good initialization
2903 for deep feed-forward neural network classifiers, and are not considered as a standalone solution to classification problems. In
2904 this paper, we argue that RBMs provide a self-contained framework for deriving competitive non-linear classifiers. We present an evaluation of different learning algorithms for
2905 RBMs which aim at introducing a discriminative component to RBM training and improve their performance as classifiers. This
2906 approach is simple in that RBMs are used directly to build a classifier, rather than as a stepping stone. Finally, we demonstrate how discriminative RBMs can also be successfully employed in a semi-supervised setting.}
2907 }
2908
2909 @INPROCEEDINGS{LarochelleH2007,
2910 author = {Larochelle, Hugo and Erhan, Dumitru and Courville, Aaron and Bergstra, James and Bengio, Yoshua},
2911 title = {An Empirical Evaluation of Deep Architectures on Problems with Many Factors of Variation},
2912 year = {2007},
2913 pages = {473--480},
2914 crossref = {ICML07-shorter},
2915 abstract = {Recently, several learning algorithms relying on models with deep architectures have been proposed. Though they have demonstrated impressive performance, to date, they have only been evaluated on relatively simple problems such as digit recognition in a controlled environment, for which many machine learning algorithms already report reasonable results. Here, we present a series of experiments which indicate that these models show promise in solving harder learning problems that exhibit many factors of variation. These models are compared with well-established algorithms such as Support Vector Machines and single hidden-layer feed-forward neural networks.}
2916 }
2917
2918 @MASTERSTHESIS{Latendresse-MSc,
2919 author = {Latendresse, Simon},
2920 title = {L'utilisation d'hyper-param{\`{e}}tres pour la s{\'{e}}lection de variables},
2921 year = {1999},
2922 school = {Universit{\'{e}} de Montr{\'{e}}al, Dept. IRO},
2923 note = {(in French)}
2924 }
2925
2926 @MASTERSTHESIS{Lauzon99,
2927 author = {Lauzon, Vincent-Philippe},
2928 title = {Mod{\`{e}}les statistiques comme algorithmes d'apprentissage et {MMCC}s; pr{\'{e}}diction de s{\'{e}}ries financi{\`{e}}res},
2929 year = {1999},
2930 school = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
2931 crossref = {DIRO}
2932 }
2933
2934 @INPROCEEDINGS{lecun-93,
2935 author = {{LeCun}, Yann and Bengio, Yoshua and Henderson, Donnie and Weisbuch, A. and Weissman, H. and Jackel, L.},
2936 title = {On-line handwriting recognition with neural networks: spatial representation versus temporal representation},
2937 booktitle = {Proc. International Conference on handwriting and drawing},
2938 year = {1993},
2939 publisher = {{\'{E}}cole Nationale Sup{\'{e}}rieure des T{\'{e}}l{\'{e}}communications},
2940 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/lecun-93.ps.gz},
2941 topics={PriorKnowledge,Speech},cat={C},
2942 }
2943
2944 @INPROCEEDINGS{lecun-99,
2945 author = {{LeCun}, Yann and Haffner, Patrick and Bottou, {L{\'{e}}on} and Bengio, Yoshua},
2946 editor = {Forsyth, D.},
2947 title = {Object Recognition with Gradient-Based Learning},
2948 booktitle = {Shape, Contour and Grouping in Computer Vision},
2949 year = {1999},
2950 pages = {319--345},
2951 publisher = {Springer},
2952 url = {orig/lecun-99.ps.gz},
2953 topics={PriorKnowledge,Speech},cat={B},
2954 }
2955
2956 @TECHREPORT{lecun-99b,
2957 author = {{LeCun}, Yann and Haffner, Patrick and Bottou, {L{\'{e}}on} and Bengio, Yoshua},
2958 title = {Gradient-Based Learning for Object Detection, Segmentation and Recognition},
2959 year = {1999},
2960 institution = {AT\&T Labs},
2961 url = {orig/lecun-99b.ps.gz},
2962 topics={Speech},cat={T},
2963 }
2964
2965 @INPROCEEDINGS{lecun-bengio-94,
2966 author = {{LeCun}, Yann and Bengio, Yoshua},
2967 title = {Word-level training of a handwritten word recognizer based on convolutional neural networks},
2968 booktitle = {Proc. of the International Conference on Pattern Recognition},
2969 volume = {II},
2970 year = {1994},
2971 pages = {88--92},
2972 publisher = {IEEE},
2973 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/icpr-word.ps},
2974 abstract = {We introduce a new approach for on-line recognition of handwritten words written in unconstrained mixed style. Words are represented by low resolution ``annotated images'' where each pixel contains information about trajectory direction and curvature. The recognizer is a convolution network which can be spatially replicated. From the network output, a hidden {Markov} model produces word scores. The entire system is globally trained to minimize word-level errors.},
2975 topics={Speech},cat={C},
2976 }
2977
2978 @INCOLLECTION{lecun-bengio-95a,
2979 author = {{LeCun}, Yann and Bengio, Yoshua},
2980 editor = {Arbib, M. A.},
2981 title = {Convolutional Networks for Images, Speech, and Time-Series},
2982 booktitle = {The Handbook of Brain Theory and Neural Networks},
2983 year = {1995},
2984 pages = {255--257},
2985 publisher = {MIT Press},
2986 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/handbook-convo.pdf},
2987 topics={PriorKnowledge,Speech},cat={C},
2988 }
2989
2990 @INCOLLECTION{lecun-bengio-95b,
2991 author = {{LeCun}, Yann and Bengio, Yoshua},
2992 editor = {Arbib, M. A.},
2993 title = {Pattern Recognition and Neural Networks},
2994 booktitle = {The Handbook of Brain Theory and Neural Networks},
2995 year = {1995},
2996 pages = {711--714},
2997 publisher = {MIT Press},
2998 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/handbook-patrec.pdf},
2999 topics={PriorKnowledge,Speech},cat={B},
3000 }
3001
3002 @ARTICLE{LeCun98,
3003 author = {{LeCun}, Yann and Bottou, {L{\'{e}}on} and Bengio, Yoshua and Haffner, Patrick},
3004 title = {Gradient-Based Learning Applied to Document Recognition},
3005 journal = {Proceedings of the IEEE},
3006 volume = {86},
3007 number = {11},
3008 year = {1998},
3009 pages = {2278--2324},
3010 abstract = {Multilayer Neural Networks trained with the backpropagation algorithm constitute the best example of a successful Gradient-Based Learning technique. Given an appropriate network architecture, Gradient-Based Learning algorithms can be used to synthesize a complex decision surface that can classify high-dimensional patterns such as handwritten characters, with minimal preprocessing. This paper reviews various methods applied to handwritten character recognition and compares them on a standard handwritten digit recognition task. Convolutional Neural Networks, that are specifically designed to deal with the variability of 2D shapes, are shown to outperform all other techniques.
3011 Real-life document recognition systems are composed of multiple modules including field extraction, segmentation, recognition, and language modeling. A new learning paradigm, called Graph Transformer Networks (GTN), allows such multi-module systems to be trained globally using Gradient-Based methods so as to minimize an overall performance measure.
3012 Two systems for on-line handwriting recognition are described. Experiments demonstrate the advantage of global training, and the flexibility of Graph Transformer Networks.
3013 A Graph Transformer Network for reading bank checks is also described. It uses Convolutional Neural Network character recognizers combined with global training techniques to provide record accuracy on business and personal checks. It is deployed commercially and reads several million checks per day.},
3014 topics={PriorKnowledge,Speech},cat={C},
3015 }
3016
3017 @INPROCEEDINGS{Lecun_icassp97,
3018 author = {{LeCun}, Yann and Bottou, {L{\'{e}}on} and Bengio, Yoshua},
3019 title = {Reading Checks with graph transformer networks},
3020 booktitle = {International Conference on Acoustics, Speech and Signal Processing},
3021 volume = {1},
3022 year = {1997},
3023 pages = {151--154},
3024 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/lecun-bottou-bengio-97.ps.gz},
3025 topics={Speech},cat={C},
3026 }
3027
3028 @ARTICLE{LeRoux+Bengio-2010,
3029 author = {Le Roux, Nicolas and Bengio, Yoshua},
3030 title = {Deep Belief Networks are Compact Universal Approximators},
3031 journal = {Neural Computation},
3032 year = {2010},
3033 note = {To appear}
3034 }
3035
3036 @TECHREPORT{LeRoux-Bengio-2007-TR,
3037 author = {Le Roux, Nicolas and Bengio, Yoshua},
3038 title = {Representational Power of Restricted {B}oltzmann Machines and Deep Belief Networks},
3039 number = {1294},
3040 year = {2007},
3041 institution = {D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
3042 abstract = {Deep Belief Networks (DBN) are generative neural network models with
3043 many layers of hidden explanatory factors, recently introduced by Hinton et al.,
3044 along with a greedy layer-wise unsupervised learning algorithm. The building
3045 block of a DBN is a probabilistic model called a Restricted {Boltzmann} Machine
3046 (RBM), used to represent one layer of the model. Restricted {Boltzmann} Machines
3047 are interesting because inference is easy in them, and because they have been
3048 successfully used as building blocks for training deeper models.
3049 We first prove that adding hidden units yields strictly improved modeling
3050 power, while a second theorem shows that RBMs are universal approximators of
3051 discrete distributions. We then study the question of whether DBNs with more
3052 layers are strictly more powerful in terms of representational power. This
3053 suggests a new and less greedy criterion for training RBMs within DBNs.}
3054 }
3055
3056 @ARTICLE{LeRoux-Bengio-2008,
3057 author = {Le Roux, Nicolas and Bengio, Yoshua},
3058 title = {Representational Power of Restricted {B}oltzmann Machines and Deep Belief Networks},
3059 journal = {Neural Computation},
3060 volume = {20},
3061 number = {6},
3062 year = {2008},
3063 pages = {1631--1649},
3064 abstract = {Deep Belief Networks (DBN) are generative neural network models with many layers of hidden explanatory factors, recently introduced by Hinton et al., along with a greedy layer-wise unsupervised learning algorithm. The building block of a DBN is a probabilistic model called a Restricted {Boltzmann} Machine (RBM), used to represent one layer of the model. Restricted {Boltzmann} Machines are interesting because inference is easy in them, and because they have been successfully used as building blocks for training deeper models. We first prove that adding hidden units yields strictly improved modelling power, while a second theorem shows that RBMs are universal approximators of discrete distributions. We then study the question of whether DBNs with more layers are strictly more powerful in terms of representational power. This suggests a new and less greedy criterion for training RBMs within DBNs.}
3065 }
3066
3067 @INPROCEEDINGS{LeRoux-continuous,
3068 author = {Le Roux, Nicolas and Bengio, Yoshua},
3069 title = {Continuous Neural Networks},
3070 booktitle = {Proceedings of the Eleventh International Conference on Artificial Intelligence and Statistics (AISTATS'07)},
3071 year = {2007},
3072 publisher = {Omnipress},
3073 abstract = {This article extends neural networks to the case of an uncountable number of hidden units, in several ways. In the first approach proposed, a finite parametrization is possible, allowing gradient-based learning. While having the same number of parameters as an ordinary neural network, its internal structure suggests that it can represent some smooth functions much more compactly. Under mild assumptions, we also find better error bounds than with ordinary neural networks. Furthermore, this parametrization may help reducing the problem of saturation of the neurons. In a second approach, the input-to-hidden weights are fully non-parametric, yielding a kernel machine for which we demonstrate a simple kernel formula. Interestingly, the resulting kernel machine can be made hyperparameter-free and still generalizes in spite of an absence of explicit regularization.}
3074 }
3075
3076 @PHDTHESIS{LeRoux-PhD-2008,
3077 author = {Le Roux, Nicolas},
3078 title = {Avanc{\'{e}}es th{\'{e}}oriques sur la repr{\'{e}}sentation et l'optimisation des r{\'{e}}seaux de neurones},
3079 year = {2008},
3080 school = {Universit{\'{e}} de Montr{\'{e}}al},
3081 abstract = {Les r{\'{e}}seaux de neurones artificiels ont {\'{e}}t{\'{e}} abondamment utilis{\'{e}}s dans la communaut{\'{e}} de l'apprentissage machine depuis les ann{\'{e}}es 80. Bien qu'ils aient {\'{e}}t{\'{e}} {\'{e}}tudi{\'{e}}s pour la premi{\`{e}}re fois il y a cinquante ans par Rosenblatt [68], ils ne furent r{\'{e}}ellement populaires qu'apr{\`{e}}s l'apparition de la r{\'{e}}tropropagation du gradient, en 1986 [71].
3082 En 1989, il a {\'{e}}t{\'{e}} prouv{\'{e}} [44] qu'une classe sp{\'{e}}cifique de r{\'{e}}seaux de neurones (les r{\'{e}}seaux de neurones {\`{a}} une couche cach{\'{e}}e) {\'{e}}tait suffisamment puissante pour pouvoir approximer presque n'importe quelle fonction avec une pr{\'{e}}cision arbitraire : le th{\'{e}}or{\`{e}}me d'approximation universelle. Toutefois, bien que ce th{\'{e}}or{\`{e}}me e{\^{u}}t pour cons{\'{e}}quence un int{\'{e}}r{\^{e}}t accru pour les r{\'{e}}seaux de neurones, il semblerait qu'aucun effort n'ait {\'{e}}t{\'{e}} fait pour profiter de cette propri{\'{e}}t{\'{e}}.
3083 En outre, l'optimisation des r{\'{e}}seaux de neurones {\`{a}} une couche cach{\'{e}}e n'est pas convexe. Cela a d{\'{e}}tourn{\'{e}} une grande partie de la communaut{\'{e}} vers d'autres algorithmes, comme par exemple les machines {\`{a}} noyau (machines {\`{a}} vecteurs de support et r{\'{e}}gression
3084 {\`{a}} noyau, entre autres).
3085 La premi{\`{e}}re partie de cette th{\`{e}}se pr{\'{e}}sentera les concepts d'apprentissage machine g{\'{e}}n{\'{e}}raux n{\'{e}}cessaires {\`{a}} la compr{\'{e}}hension des algorithmes utilis{\'{e}}s. La deuxi{\`{e}}me partie se focalisera plus sp{\'{e}}cifiquement sur les m{\'{e}}thodes {\`{a}} noyau et les r{\'{e}}seaux de neurones. La troisi{\`{e}}me partie de ce travail visera ensuite {\`{a}} {\'{e}}tudier les limitations des machines {\`{a}} noyaux et {\`{a}} comprendre les raisons pour lesquelles elles sont inadapt{\'{e}}es {\`{a}} certains probl{\`{e}}mes que nous avons {\`{a}} traiter.
3086 La quatri{\`{e}}me partie pr{\'{e}}sente une technique permettant d'optimiser les r{\'{e}}seaux de neurones {\`{a}} une couche cach{\'{e}}e de mani{\`{e}}re convexe. Bien que cette technique s'av{\`{e}}re difficilement exploitable pour des probl{\`{e}}mes de grande taille, une version approch{\'{e}}e permet d'obtenir une bonne solution dans un temps raisonnable.
3087 La cinqui{\`{e}}me partie se concentre sur les r{\'{e}}seaux de neurones {\`{a}} une couche cach{\'{e}}e infinie. Cela leur permet th{\'{e}}oriquement d'exploiter la propri{\'{e}}t{\'{e}} d'approximation universelle et ainsi d'approcher facilement une plus grande classe de fonctions.
3088 Toutefois, si ces deux variations sur les r{\'{e}}seaux de neurones {\`{a}} une couche cach{\'{e}}e leur conf{\`{e}}rent des propri{\'{e}}t{\'{e}}s int{\'{e}}ressantes, ces derniers ne peuvent extraire plus que des concepts de bas niveau. Les m{\'{e}}thodes {\`{a}} noyau souffrant des m{\^{e}}mes limites, aucun de
3089 ces deux types d'algorithmes ne peut appr{\'{e}}hender des probl{\`{e}}mes faisant appel {\`{a}} l'apprentissage de concepts de haut niveau.
3090 R{\'{e}}cemment sont apparus les Deep Belief Networks [39] qui sont des r{\'{e}}seaux de neurones {\`{a}} plusieurs couches cach{\'{e}}es entra{\^{\i}}n{\'{e}}s de mani{\`{e}}re efficace. Cette profondeur leur permet d'extraire des concepts de haut niveau et donc de r{\'{e}}aliser des t{\^{a}}ches hors
3091 de port{\'{e}}e des algorithmes conventionnels. La sixi{\`{e}}me partie {\'{e}}tudie des propri{\'{e}}t{\'{e}}s de ces r{\'{e}}seaux profonds.
3092 Les probl{\`{e}}mes que l'on rencontre actuellement n{\'{e}}cessitent non seulement des algorithmes capables d'extraire des concepts de haut niveau, mais {\'{e}}galement des m{\'{e}}thodes d'optimisation capables de traiter l'immense quantit{\'{e}} de donn{\'{e}}es parfois disponibles, si possible en temps r{\'{e}}el. La septi{\`{e}}me partie est donc la pr{\'{e}}sentation d'une nouvelle technique permettant une optimisation plus rapide.}
3093 }
3094
3095 @ARTICLE{lheureux-04,
3096 author = {{L'Heureux}, Pierre-Jean and Carreau, Julie and Bengio, Yoshua and Delalleau, Olivier and Yue, Shi Yi},
3097 title = {Locally Linear Embedding for dimensionality reduction in {QSAR}},
3098 journal = {Journal of Computer-Aided Molecular Design},
3099 volume = {18},
3100 year = {2004},
3101 pages = {475--482},
3102 abstract = {Current practice in Quantitative Structure Activity Relationship (QSAR) methods usually involves generating a great number of chemical descriptors and then cutting them back with variable selection techniques. Variable selection is an effective method to reduce the dimensionality but may discard some valuable information. This paper introduces Locally Linear Embedding ({LLE}), a local non-linear dimensionality reduction technique, that can statistically discover a low-dimensional representation of the chemical data. {LLE} is shown to create more stable representations than other non-linear dimensionality
3103 reduction algorithms, and to be capable of capturing non-linearity in chemical data.},
3104 topics={Bioinformatic},cat={J},
3105 }
3106
3107 @TECHREPORT{lm-TR00,
3108 author = {Bengio, Yoshua and Ducharme, R{\'{e}}jean and Vincent, Pascal},
3109 title = {A Neural Probabilistic Language Model},
3110 number = {1178},
3111 year = {2000},
3112 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
3113 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TR1178.pdf},
3114 abstract = {A goal of statistical language modeling is to learn the joint probability function of sequences of words in a language. This is intrinsically difficult because of the curse of dimensionality: a word sequence on which the model will be tested is likely to be different from all the word sequences seen during training. Traditional but very successful approaches based on n-grams obtain generalization by concatenating very short overlapping sequences seen in the training set. We propose to fight the curse of dimensionality by learning a distributed representation for words which allows each training sentence to inform the model about an exponential number of semantically neighboring sentences. The model learns simultaneously (1) a distributed representation for each word along with (2) the probability function for word sequences, expressed in terms of these representations. Generalization is obtained because a sequence of words that has never been seen before gets high probability if it is made of words that are similar (in the sense of having a nearby representation) to words forming an already seen sentence. Training such large models (with millions of parameters) within a reasonable time is itself a significant challenge. We report on experiments using neural networks for the probability function, showing on two text corpora that the proposed approach very significantly improves on a state-of-the-art trigram model, and that the proposed approach allows to take advantage of much longer contexts.},
3115 topics={Markov,Unsupervised,Language},cat={T},
3116 }
3117
3118 @INPROCEEDINGS{Maillet+al-2009,
3119 author = {Maillet, Fran{\c c}ois and Eck, Douglas and Desjardins, Guillaume and Lamere, Paul},
3120 title = {Steerable Playlist Generation by Learning Song Similarity from Radio Station Playlists},
3121 booktitle = {Proceedings of the 10th International Conference on Music Information Retrieval},
3122 year = {2009},
3123 url = {http://www-etud.iro.umontreal.ca/~mailletf/papers/ismir09-playlist.pdf},
3124 abstract = {This paper presents an approach to generating steerable playlists. We first demonstrate a method for learning song transition probabilities from audio features extracted from songs played in professional radio station playlists. We then show that by using this learnt similarity function as a prior, we are able to generate steerable playlists by choosing the next song to play not simply based on that prior, but on a tag cloud that the user is able to manipulate to express the high-level characteristics of the music he wishes to listen to.}
3125 }
3126
3127 @INPROCEEDINGS{manzagol+bertinmahieux+eck:ismir2008,
3128 author = {Manzagol, Pierre-Antoine and Bertin-Mahieux, Thierry and Eck, Douglas},
3129 title = {On the Use of Sparse Time-Relative Auditory Codes for Music},
3130 booktitle = {Proceedings of the 9th International Conference on Music Information Retrieval ({ISMIR} 2008)},
3131 year = {2008},
3132 abstract = {Many if not most audio features used in MIR research are inspired by work done in speech recognition and are variations on the spectrogram. Recently, much attention has been given to new representations of audio that are sparse and time-relative. These representations are efficient and able to avoid the time-frequency trade-off of a spectrogram. Yet little work with music streams has been conducted and these features remain mostly unused in the MIR community. In this paper we further explore the use of these features for musical signals. In particular, we investigate their use on realistic music examples (i.e. released commercial music) and their use as input features for supervised learning. Furthermore, we identify three specific issues related to these features which will need to be further addressed in order to obtain the full benefit for MIR applications.},
3133 source={OwnPublication},
3134 sourcetype={Conference},
3135 }
3136
3137 @MASTERSTHESIS{Manzagol-Msc-2007,
3138 author = {Manzagol, Pierre-Antoine},
3139 keywords = {Algorithme d'apprentissage, m{\'{e}}thode de second ordre, gradient naturel, approximation stochastique},
3140 title = {TONGA - Un algorithme de gradient naturel pour les probl{\`{e}}mes de grande taille},
3141 year = {2007},
3142 school = {Universit{\'{e}} de Montr{\'{e}}al},
3143 abstract = {Les syst{\`{e}}mes adaptatifs sont confront{\'{e}}s {\`{a}} des donn{\'{e}}es qui {\'{e}}voluent rapidement en quantit{\'{e}} et en complexit{\'{e}}. Les avanc{\'{e}}es mat{\'{e}}rielles de l'informatique ne suffisent pas {\`{a}} compenser cet essor. Une mise {\`{a}} l'{\'{e}}chelle des techniques d'apprentissage est n{\'{e}}cessaire. D'une part, les mod{\`{e}}les doivent gagner en capacit{\'{e}} de repr{\'{e}}sentation. De l'autre, les algorithmes d'apprentissage doivent devenir plus efficaces.
3144 Nos travaux se situent dans ce contexte des probl{\`{e}}mes de grande taille et portent sur l'am{\'{e}}lioration des algorithmes d'apprentissage. Deux {\'{e}}l{\'{e}}ments de r{\'{e}}ponse sont d{\'{e}}j{\`{a}} connus. Il s'agit des m{\'{e}}thodes de second ordre et de l'approximation stochastique. Or, les m{\'{e}}thodes de second ordre poss{\`{e}}dent des complexit{\'{e}}s en calculs et en m{\'{e}}moire qui sont prohibitives dans le cadre des probl{\`{e}}mes de grande taille. {\'{E}}galement, il est notoirement difficile de concilier ces m{\'{e}}thodes avec l'approximation stochastique. TONGA est un algorithme d'apprentissage con{\c c}u pour faire face {\`{a}} ces difficult{\'{e}}s. Il s'agit d'une implantation stochastique et adapt{\'{e}}e aux probl{\`{e}}mes de grande taille d'une m{\'{e}}thode de second ordre, le gradient naturel. Dans ce m{\'{e}}moire, nous examinons de pr{\`{e}}s ce nouvel algorithme d'apprentissage en le comparant sur plusieurs probl{\`{e}}mes au gradient stochastique, la technique d'optimisation commun{\'{e}}ment utilis{\'{e}}e dans le cadre des probl{\`{e}}mes de grande taille. Nos exp{\'{e}}riences montrent que TONGA est au moins tout aussi efficace que le gradient stochastique, ce qui est un accomplissement en soi. Dans certains cas, TONGA offre une convergence nettement sup{\'{e}}rieure {\`{a}} celle du gradient stochastique.}
3145 }
3146
3147 @INPROCEEDINGS{matic-94,
3148 author = {Matic, N. and Henderson, Donnie and {LeCun}, Yann and Bengio, Yoshua},
3149 title = {Pen-based visitor registration system (PENGUIN)},
3150 booktitle = {Conference Record of the Twenty-Eighth Asilomar Conference on Signals, Systems and Computers},
3151 year = {1994},
3152 publisher = {IEEE},
3153 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/matic-94.tiff},
3154 abstract = {We describe a new electronic pen-based visitors registration system (PENGUIN) whose goal is to expand and modernize the visitor sign-in procedure at Bell Laboratories. The system uses a pen-interface (i.e. tablet-display) in what is essentially a form filling application. Our pen-interface is coupled with a powerful and accurate on-line handwriting recognition module. A database of AT\&T employees (the visitors' hosts) and country names is used to check the recognition module outputs, in order to find the best match. The system provides assistance to the guard at one of the guard stations in routing visitors to their hosts. All the entered data are stored electronically. Initial testing shows that PENGUIN system performs reliably and with high accuracy. It retrieves the correct host name with 97\% accuracy and the correct visitors citizenship with 99\% accuracy. The system is robust and easy to use for both visitors and guards.},
3155 topics={Speech},cat={C},
3156 }
3157
3158 @UNPUBLISHED{mirex2005artist,
3159 author = {Bergstra, James and Casagrande, Norman and Eck, Douglas},
3160 title = {Artist Recognition: A Timbre- and Rhythm-Based Multiresolution Approach},
3161 year = {2005},
3162 note = {{MIREX} artist recognition contest},
3163 source={OwnPublication},
3164 sourcetype={Other},
3165 }
3166
3167 @UNPUBLISHED{mirex2005genre,
3168 author = {Bergstra, James and Casagrande, Norman and Eck, Douglas},
3169 title = {Genre Classification: Timbre- and Rhythm-Based Multiresolution Audio Classification},
3170 year = {2005},
3171 note = {{MIREX} genre classification contest},
3172 source={OwnPublication},
3173 sourcetype={Other},
3174 }
3175
3176 @UNPUBLISHED{mirex2005note,
3177 author = {Lacoste, Alexandre and Eck, Douglas},
3178 title = {Onset Detection with Artificial Neural Networks},
3179 year = {2005},
3180 note = {{MIREX} note onset detection contest},
3181 source={OwnPublication},
3182 sourcetype={Other},
3183 }
3184
3185 @UNPUBLISHED{mirex2005tempo,
3186 author = {Eck, Douglas and Casagrande, Norman},
3187 title = {A Tempo-Extraction Algorithm Using an Autocorrelation Phase Matrix and Shannon Entropy},
3188 year = {2005},
3189 note = {{MIREX} tempo extraction contest (www.music-ir.org/\-evaluation/\-mirex-results)},
3190 source={OwnPublication},
3191 sourcetype={Other},
3192 }
3193
3194 @INPROCEEDINGS{mitacs-insurance01,
3195 author = {Bengio, Yoshua and Chapados, Nicolas and Dugas, Charles and Ghosn, Joumana and Takeuchi, Ichiro and Vincent, Pascal},
3196 title = {High-Dimensional Data Inference for Automobile Insurance Premia Estimation},
3197 booktitle = {Presented at the 2001 MITACS Annual Meeting},
3198 year = {2001},
3199 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/mitacs_insurance.ps},
3200 topics={HighDimensional,Mining},cat={C},
3201 }
3202
3203 @INPROCEEDINGS{Morin+al-2005,
3204 author = {Morin, Frederic and Bengio, Yoshua},
3205 editor = {Cowell, Robert G. and Ghahramani, Zoubin},
3206 title = {Hierarchical Probabilistic Neural Network Language Model},
3207 booktitle = {Proceedings of the Tenth International Workshop on Artificial Intelligence and Statistics},
3208 year = {2005},
3209 pages = {246--252},
3210 publisher = {Society for Artificial Intelligence and Statistics},
3211 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/hierarchical-nnlm-aistats05.pdf},
3212 abstract = {In recent years, variants of a neural network architecture for statistical language modeling have been proposed and successfully applied, e.g. in the language modeling component of speech recognizers. The main advantage of these architectures is that they learn an embedding for words (or other symbols) in a continuous space that helps to smooth the language model and provide good generalization even when the number of training examples is insufficient. However, these models are extremely slow in comparison to the more commonly used n-gram models, both for training and recognition. As an alternative to an importance sampling method proposed to speed-up training, we introduce a hierarchical decomposition of the conditional probabilities that yields a speed-up of about 200 both during training and recognition. The hierarchical decomposition is a binary hierarchical clustering constrained by the prior knowledge extracted from the WordNet semantic hierarchy.},
3213 topics={Language},cat={C},
3214 }
3215
3216 @TECHREPORT{Nadeau-inference-TR99,
3217 author = {Nadeau, Claude and Bengio, Yoshua},
3218 title = {Inference and the Generalization Error},
3219 number = {99s-45},
3220 year = {1999},
3221 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
3222 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/techrep.pdf},
3223 abstract = {We perform a theoretical investigation of the variance of the cross-validation estimate of the generalization error that takes into account the variability due to the choice of training sets and test examples. This allows us to propose two new estimators of this variance. We show, via simulations, that these new statistics perform well relative to the statistics considered in (Dietterich, 1998). In particular, tests of hypothesis based on these don't tend to be too liberal like other tests currently available, and have good power.},
3224 topics={Comparative},cat={T},
3225 }
3226
3227 @INPROCEEDINGS{nadeau:2000:nips,
3228 author = {Nadeau, Claude and Bengio, Yoshua},
3229 title = {Inference for the Generalization Error},
3230 year = {2000},
3231 pages = {307--313},
3232 crossref = {NIPS12-shorter},
3233 abstract = {In order to compare learning algorithms, experimental results reported in the machine learning literature often use statistical tests of significance. Unfortunately, most of these tests do not take into account the variability due to the choice of training set. We perform a theoretical investigation of the variance of the cross-validation estimate of the generalization error that takes into account the variability due to the choice of training sets. This allows us to propose two new ways to estimate this variance. We show, via simulations, that these new statistics perform well relative to the statistics considered by Dietterich (Dietterich, 1998).},
3234 topics={Comparative},cat={C},
3235 }
3236
3237 @ARTICLE{nadeau:2001,
3238 author = {Nadeau, Claude and Bengio, Yoshua},
3239 title = {Inference for the Generalization Error},
3240 journal = {Machine Learning},
3241 year = {2001},
3242 abstract = {In order to compare learning algorithms, experimental results reported in the machine learning literature often use statistical tests of significance to support the claim that a new learning algorithm generalizes better. Such tests should take into account the variability due to the choice of training set and not only that due to the test examples, as is often the case. This could lead to gross underestimation of the variance of the cross-validation estimator, and to the wrong conclusion that the new algorithm is significantly better when it is not. We perform a theoretical investigation of the variance of a cross-validation estimator of the generalization error that takes into account the variability due to the randomness of the training set as well as test examples. Our analysis shows that all the variance estimators that are based only on the results of the cross-validation experiment must be biased. This analysis allows us to propose new estimators of this variance. We show, via simulations, that tests of hypothesis about the generalization error using those new variance estimators have better properties than tests involving variance estimators currently in use and listed in (Dietterich, 1998). In particular, the new tests have correct size and good power. That is, the new tests do not reject the null hypothesis too often when the hypothesis is true, but they tend to frequently reject the null hypothesis when the latter is false.},
3243 topics={Comparative},cat={J},
3244 }
3245
3246 @ARTICLE{NC06,
3247 author = {Bengio, Yoshua and Monperrus, Martin and Larochelle, Hugo},
3248 title = {Nonlocal Estimation of Manifold Structure},
3249 journal = {Neural Computation},
3250 volume = {18},
3251 year = {2006},
3252 pages = {2509--2528},
3253 abstract = {We claim and present arguments to the effect that a large class of manifold
3254 learning algorithms that are essentially local and can be framed as
3255 kernel learning algorithms will suffer from the curse of dimensionality, at
3256 the dimension of the true underlying manifold. This observation suggests
3257 to explore non-local manifold learning algorithms which attempt to discover
3258 shared structure in the tangent planes at different positions. A criterion for
3259 such an algorithm is proposed and experiments estimating a tangent plane
3260 prediction function are presented, showing its advantages with respect to
3261 local manifold learning algorithms: it is able to generalize very far from
3262 training data (on learning handwritten character image rotations), where a
3263 local non-parametric method fails.},
3264 topics={HighDimensional,Kernel,Unsupervised},cat={J},
3265 }
3266
3267 @INPROCEEDINGS{NIPS1-short,
3268 editor = {Touretzky, D. S.},
3269 title = {Advances in Neural Information Processing Systems 1 (NIPS'88)},
3270 booktitle = {NIPS 1},
3271 year = {-1},
3272 publisher = {Morgan Kaufmann}
3273 }
3274
3275
3276 @INPROCEEDINGS{NIPS10-short,
3277 editor = {Jordan, M.I. and Kearns, M.J. and Solla, S.A.},
3278 title = {Advances in Neural Information Processing Systems 10 (NIPS'97)},
3279 booktitle = {NIPS 10},
3280 year = {-1},
3281 publisher = {MIT Press}
3282 }
3283
3284
3285 @INPROCEEDINGS{NIPS11,
3286 editor = {Kearns, M.J. and Solla, S.A.},
3287 title = {Advances in Neural Information Processing Systems 11 (NIPS'98)},
3288 booktitle = {Advances in Neural Information Processing Systems 11 (NIPS'98)},
3289 year = {-1},
3290 publisher = {MIT Press}
3291 }
3292
3293 @INPROCEEDINGS{NIPS11-short,
3294 editor = {Kearns, M.J. and Solla, S.A.},
3295 title = {Advances in Neural Information Processing Systems 11 (NIPS'98)},
3296 booktitle = {NIPS 11},
3297 year = {-1},
3298 publisher = {MIT Press}
3299 }
3300
3301
3302 @INPROCEEDINGS{NIPS12-short,
3303 editor = {Solla, S.A. and Leen, T. K.},
3304 title = {Advances in Neural Information Processing Systems 12 (NIPS'99)},
3305 booktitle = {NIPS 12},
3306 year = {-1},
3307 publisher = {MIT Press}
3308 }
3309
3310
3311 @INPROCEEDINGS{NIPS13-short,
3312 editor = {Leen, T. K. and Dietterich, T.G.},
3313 title = {Advances in Neural Information Processing Systems 13 (NIPS'00)},
3314 booktitle = {NIPS 13},
3315 year = {-1},
3316 publisher = {MIT Press}
3317 }
3318
3319
3320 @INPROCEEDINGS{NIPS14,
3321 editor = {Dietterich, T.G. and Becker, S. and Ghahramani, Zoubin},
3322 title = {Advances in Neural Information Processing Systems 14 (NIPS'01)},
3323 booktitle = {Advances in Neural Information Processing Systems 14 (NIPS'01)},
3324 year = {-1},
3325 publisher = {MIT Press}
3326 }
3327
3328 @INPROCEEDINGS{NIPS14-short,
3329 editor = {Dietterich, T.G. and Becker, S. and Ghahramani, Zoubin},
3330 title = {Advances in Neural Information Processing Systems 14 (NIPS'01)},
3331 booktitle = {NIPS 14},
3332 year = {-1},
3333 publisher = {MIT Press}
3334 }
3335
3336
3337 @INPROCEEDINGS{NIPS15-short,
3338 editor = {Becker, S. and Thrun, Sebastian},
3339 title = {Advances in Neural Information Processing Systems 15 (NIPS'02)},
3340 booktitle = {NIPS 15},
3341 year = {-1},
3342 publisher = {MIT Press}
3343 }
3344
3345
3346 @INPROCEEDINGS{NIPS16-short,
3347 editor = {Becker, S. and Saul, L. and {Sch{\"{o}}lkopf}, Bernhard},
3348 title = {Advances in Neural Information Processing Systems 16 (NIPS'03)},
3349 booktitle = {NIPS 16},
3350 year = {-1}
3351 }
3352
3353
3354 @INPROCEEDINGS{NIPS17-short,
3355 editor = {Saul, Lawrence K. and Weiss, Yair and Bottou, {L{\'{e}}on}},
3356 title = {Advances in Neural Information Processing Systems 17 (NIPS'04)},
3357 booktitle = {NIPS 17},
3358 year = {-1}
3359 }
3360
3361
3362 @INPROCEEDINGS{NIPS18-short,
3363 editor = {Weiss, Yair and {Sch{\"{o}}lkopf}, Bernhard and Platt, John},
3364 title = {Advances in Neural Information Processing Systems 18 (NIPS'05)},
3365 booktitle = {NIPS 18},
3366 year = {-1},
3367 publisher = {MIT Press}
3368 }
3369
3370
3371 @INPROCEEDINGS{NIPS19-short,
3372 editor = {{Sch{\"{o}}lkopf}, Bernhard and Platt, John and Hoffman, Thomas},
3373 title = {Advances in Neural Information Processing Systems 19 (NIPS'06)},
3374 booktitle = {NIPS 19},
3375 year = {-1},
3376 publisher = {MIT Press}
3377 }
3378
3379
3380 @INPROCEEDINGS{NIPS2-short,
3381 editor = {Touretzky, D. S.},
3382 title = {Advances in Neural Information Processing Systems 2 (NIPS'89)},
3383 booktitle = {NIPS 2},
3384 year = {-1},
3385 publisher = {Morgan Kaufmann}
3386 }
3387
3388
3389 @INPROCEEDINGS{NIPS20-short,
3390 editor = {Platt, John and Koller, D. and Singer, Yoram and Roweis, S.},
3391 title = {Advances in Neural Information Processing Systems 20 (NIPS'07)},
3392 booktitle = {NIPS 20},
3393 year = {-1},
3394 publisher = {MIT Press}
3395 }
3396
3397
3398 @INPROCEEDINGS{NIPS2003_AA65,
3399 author = {Bengio, Yoshua and Grandvalet, Yves},
3400 keywords = {cross validation, error bars, generalization error inference, k-fold cross-validation, model selection, statistical comparison of algorithms, variance estimate},
3401 title = {No Unbiased Estimator of the Variance of K-Fold Cross-Validation},
3402 year = {2004},
3403 publisher = {MIT Press},
3404 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/var-kfold-part1-nips.pdf},
3405 crossref = {NIPS16},
3406 abstract = {Most machine learning researchers perform quantitative experiments to estimate generalization error and compare algorithm performances. In order to draw statistically convincing conclusions, it is important to estimate the uncertainty of such estimates. This paper studies the estimation of uncertainty around the K-fold cross-validation estimator. The main theorem shows that there exists no universal unbiased estimator of the variance of K-fold cross-validation. An analysis based on the eigendecomposition of the covariance matrix of errors helps to better understand the nature of the problem and shows that naive estimators may grossly underestimate variance, as confirmed by numerical experiments.},
3407 topics={Comparative},cat={C},
3408 }
3409
3410 @INCOLLECTION{NIPS2005_424,
3411 author = {Bengio, Yoshua and Delalleau, Olivier and Le Roux, Nicolas},
3412 title = {The Curse of Highly Variable Functions for Local Kernel Machines},
3413 year = {2006},
3414 pages = {107--114},
3415 crossref = {NIPS18-shorter},
3416 abstract = {We present a series of theoretical arguments supporting the claim that a
3417 large class of modern learning algorithms that rely solely on the smoothness
3418 prior -- with similarity between examples expressed with a local
3419 kernel -- are sensitive to the curse of dimensionality, or more precisely
3420 to the variability of the target. Our discussion covers supervised, semi-supervised
3421 and unsupervised learning algorithms. These algorithms are
3422 found to be local in the sense that crucial properties of the learned function
3423 at x depend mostly on the neighbors of x in the training set. This
3424 makes them sensitive to the curse of dimensionality, well studied for
3425 classical non-parametric statistical learning. We show in the case of the
3426 Gaussian kernel that when the function to be learned has many variations,
3427 these algorithms require a number of training examples proportional to
3428 the number of variations, which could be large even though there may exist
3429 short descriptions of the target function, i.e. their Kolmogorov complexity
3430 may be low. This suggests that there exist non-local learning
3431 algorithms that at least have the potential to learn about such structured
3432 but apparently complex functions (because locally they have many variations),
3433 while not using very specific prior domain knowledge.},
3434 topics={HighDimensional,Kernel,Unsupervised},cat={C},
3435 }
3436
3437 @INPROCEEDINGS{NIPS2005_456,
3438 author = {K{\'{e}}gl, Bal{\'{a}}zs and Wang, Ligen},
3439 title = {Boosting on Manifolds: Adaptive Regularization of Base Classifiers},
3440 year = {2005},
3441 pages = {665--672},
3442 crossref = {NIPS17-shorter},
3443 abstract = {In this paper we propose to combine two powerful ideas, boosting and manifold learning. On the one hand, we improve ADABOOST by incorporating knowledge on the structure of the data into base classifier design and selection. On the other hand, we use ADABOOST's efficient learning mechanism to significantly improve supervised and semi-supervised algorithms proposed in the context of manifold learning. Beside the specific manifold-based penalization, the resulting algorithm also accommodates the boosting of a large family of regularized learning algorithms.},
3444 topics={Boosting},cat={C},
3445 }
3446
3447 @INCOLLECTION{NIPS2005_519,
3448 author = {Grandvalet, Yves and Bengio, Yoshua},
3449 title = {Semi-supervised Learning by Entropy Minimization},
3450 year = {2005},
3451 pages = {529--536},
3452 crossref = {NIPS17-shorter},
3453 abstract = {We consider the semi-supervised learning problem, where a decision rule is to be learned from labeled and unlabeled data. In this framework, we motivate minimum entropy regularization, which enables to incorporate unlabeled data in the standard supervised learning. Our approach includes other approaches to the semi-supervised problem as particular or limiting cases. A series of experiments illustrates that the proposed solution benefits from unlabeled data. The method challenges mixture models when the data are sampled from the distribution class spanned by the generative model. The performances are definitely in favor of minimum entropy regularization when generative models are misspecified, and the weighting of unlabeled data provides robustness to the violation of the ``cluster assumption''. Finally, we also illustrate that the method can also be far superior to manifold learning in high dimension spaces.},
3454 topics={Unsupervised},cat={C},
3455 }
3456
3457 @INPROCEEDINGS{NIPS2005_539,
3458 author = {Bengio, Yoshua and Larochelle, Hugo and Vincent, Pascal},
3459 title = {Non-Local Manifold Parzen Windows},
3460 year = {2006},
3461 crossref = {NIPS18-shorter},
3462 abstract = {To escape from the curse of dimensionality, we claim that one can learn
3463 non-local functions, in the sense that the value and shape of the learned
3464 function at x must be inferred using examples that may be far from x.
3465 With this objective, we present a non-local non-parametric density estimator.
3466 It builds upon previously proposed Gaussian mixture models with
3467 regularized covariance matrices to take into account the local shape of
3468 the manifold. It also builds upon recent work on non-local estimators of
3469 the tangent plane of a manifold, which are able to generalize in places
3470 with little training data, unlike traditional, local, non-parametric models.},
3471 topics={HighDimensional,Kernel,Unsupervised},cat={C},
3472 }
3473
3474 @INPROCEEDINGS{NIPS2005_583,
3475 author = {Bengio, Yoshua and Le Roux, Nicolas and Vincent, Pascal and Delalleau, Olivier and Marcotte, Patrice},
3476 title = {Convex Neural Networks},
3477 year = {2006},
3478 pages = {123--130},
3479 crossref = {NIPS18-shorter},
3480 abstract = {Convexity has recently received a lot of attention in the machine learning
3481 community, and the lack of convexity has been seen as a major disadvantage
3482 of many learning algorithms, such as multi-layer artificial neural
3483 networks. We show that training multi-layer neural networks in which the
3484 number of hidden units is learned can be viewed as a convex optimization
3485 problem. This problem involves an infinite number of variables, but can be
3486 solved by incrementally inserting a hidden unit at a time, each time finding
3487 a linear classifier that minimizes a weighted sum of errors.},
3488 topics={Boosting},cat={C},
3489 }
3490
3491 @INPROCEEDINGS{NIPS2005_663,
3492 author = {Rivest, Fran{\c c}ois and Bengio, Yoshua and Kalaska, John},
3493 title = {Brain Inspired Reinforcement Learning},
3494 year = {2005},
3495 pages = {1129--1136},
3496 crossref = {NIPS17-shorter},
3497 abstract = {Successful application of reinforcement learning algorithms often involves considerable hand-crafting of the necessary non-linear features to reduce the complexity of the value functions and hence to promote convergence of the algorithm. In contrast, the human brain readily and autonomously finds the complex features when provided with sufficient training. Recent work in machine learning and neurophysiology has demonstrated the role of the basal ganglia and the frontal cortex in mammalian reinforcement learning. This paper develops and explores new reinforcement learning algorithms inspired by neurological evidence that provides potential new approaches to the feature construction problem. The algorithms are compared and evaluated on the Acrobot task.},
3498 topics={BioRules},cat={C},
3499 }
3500
3501 @INCOLLECTION{NIPS2005_691,
3502 author = {Bengio, Yoshua and Monperrus, Martin},
3503 title = {Non-Local Manifold Tangent Learning},
3504 year = {2005},
3505 pages = {129--136},
3506 crossref = {NIPS17-shorter},
3507 abstract = {We claim and present arguments to the effect that a large class of manifold learning algorithms that are essentially local and can be framed as kernel learning algorithms will suffer from the curse of dimensionality, at the dimension of the true underlying manifold. This observation suggests to explore non-local manifold learning algorithms which attempt to discover shared structure in the tangent planes at different positions. A criterion for such an algorithm is proposed and experiments estimating a tangent plane prediction function are presented, showing its advantages with respect to local manifold learning algorithms: it is able to generalize very far from training data (on learning handwritten character image rotations), where a local non-parametric method fails.},
3508 topics={HighDimensional,Unsupervised},cat={C},
3509 }
3510
3511 @INPROCEEDINGS{NIPS2005_874,
3512 author = {K{\'{e}}gl, Bal{\'{a}}zs},
3513 title = {Generalization Error and Algorithmic Convergence of Median Boosting},
3514 year = {2005},
3515 pages = {657--664},
3516 crossref = {NIPS17-shorter},
3517 abstract = {We have recently proposed an extension of ADABOOST to regression that uses the median of the base regressors as the final regressor. In this paper we extend theoretical results obtained for ADABOOST to median boosting and to its localized variant. First, we extend recent results on efficient margin maximizing to show that the algorithm can converge to the maximum achievable margin within a preset precision in a finite number of steps. Then we provide confidence-interval-type bounds on the generalization error.},
3518 topics={Boosting},cat={C},
3519 }
3520
3521 @INPROCEEDINGS{NIPS2007-56,
3522 author = {Le Roux, Nicolas and Manzagol, Pierre-Antoine and Bengio, Yoshua},
3523 title = {Topmoumoute Online Natural Gradient Algorithm},
3524 year = {2008},
3525 crossref = {NIPS20-shorter},
3526 abstract = {Guided by the goal of obtaining an optimization algorithm that is both fast and yielding good generalization, we study the descent direction maximizing the decrease in generalization error or the probability of not increasing generalization error. The surprising result is that from both the Bayesian and frequentist perspectives this can yield the natural gradient direction. Although that direction can be very expensive to compute we develop an efficient, general, online approximation to the natural gradient descent which is suited to large scale problems. We report experimental results showing much faster convergence in computation time and in number of iterations with TONGA (Topmoumoute Online Natural Gradient Algorithm) than with stochastic gradient descent, even on very large datasets.}
3527 }
3528
3529 @INPROCEEDINGS{NIPS2007-812,
3530 author = {Chapados, Nicolas and Bengio, Yoshua},
3531 title = {Augmented Functional Time Series Representation and Forecasting with Gaussian Processes},
3532 year = {2008},
3533 pages = {265--272},
3534 crossref = {NIPS20-shorter},
3535 abstract = {We introduce a functional representation of time series which allows forecasts to be performed over an unspecified horizon with progressively-revealed information sets. By virtue of using Gaussian processes, a complete covariance matrix between forecasts at several time-steps is available. This information is put to use in an application to actively trade price spreads between commodity futures contracts. The approach delivers impressive out-of-sample risk-adjusted returns after transaction costs on a portfolio of 30 spreads.}
3536 }
3537
3538 @INPROCEEDINGS{NIPS2007-925,
3539 author = {Le Roux, Nicolas and Bengio, Yoshua and Lamblin, Pascal and Joliveau, Marc and K{\'{e}}gl, Bal{\'{a}}zs},
3540 title = {Learning the 2-D Topology of Images},
3541 year = {2008},
3542 pages = {841--848},
3543 crossref = {NIPS20-shorter},
3544 abstract = {We study the following question: is the two-dimensional structure of images a very strong prior or is it something that can be learned with a few examples of natural images? If someone gave us a learning task involving images for which the two-dimensional topology of pixels was not known, could we discover it automatically and exploit it? For example suppose that the pixels had been permuted in a fixed but unknown way, could we recover the relative two-dimensional location of pixels on images? The surprising result presented here is that not only the answer is yes but that about as few as a thousand images are enough to approximately recover the relative locations of about a thousand pixels. This is achieved using a manifold learning algorithm applied to pixels associated with a measure of distributional similarity between pixel intensities. We compare different topology-extraction approaches and show how having the two-dimensional topology can be exploited.}
3545 }
3546
3547 @INPROCEEDINGS{NIPS21,
3548 editor = {Koller, D. and Schuurmans, Dale and Bengio, Yoshua and Bottou, {L{\'{e}}on}},
3549 title = {Advances in Neural Information Processing Systems 21 (NIPS'08)},
3550 booktitle = {Advances in Neural Information Processing Systems 21 (NIPS'08)},
3551 year = {-1},
3552 publisher = {Nips Foundation (http://books.nips.cc)}
3553 }
3554
3555 @INPROCEEDINGS{NIPS21-short,
3556 editor = {Koller, D. and Schuurmans, Dale and Bengio, Yoshua and Bottou, {L{\'{e}}on}},
3557 title = {Advances in Neural Information Processing Systems 21 (NIPS'08)},
3558 booktitle = {NIPS 21},
3559 year = {-1},
3560 publisher = {Nips Foundation (http://books.nips.cc)}
3561 }
3562
3563
3564 @INPROCEEDINGS{NIPS22-short,
3565 editor = {Bengio, Yoshua and Schuurmans, Dale and Williams, Christopher and Lafferty, John and Culotta, Aron},
3566 title = {Advances in Neural Information Processing Systems 22 (NIPS'09)},
3567 booktitle = {NIPS 22},
3568 year = {-1}
3569 }
3570
3571
3572 @INPROCEEDINGS{NIPS3,
3573 editor = {Lippmann, R. P. and Moody, J. E. and Touretzky, D. S.},
3574 title = {Advances in Neural Information Processing Systems 3 (NIPS'90)},
3575 booktitle = {Advances in Neural Information Processing Systems 3 (NIPS'90)},
3576 year = {-1},
3577 publisher = {Morgan Kaufmann}
3578 }
3579
3580 @INPROCEEDINGS{NIPS3-short,
3581 editor = {Lippmann, R. P. and Moody, J. E. and Touretzky, D. S.},
3582 title = {Advances in Neural Information Processing Systems 3 (NIPS'90)},
3583 booktitle = {NIPS 3},
3584 year = {-1},
3585 publisher = {Morgan Kaufmann}
3586 }
3587
3588
3589 @INPROCEEDINGS{NIPS4-short,
3590 editor = {Moody, J. E. and Hanson, S. J. and Lippmann, R. P.},
3591 title = {Advances in Neural Information Processing Systems 4 (NIPS'91)},
3592 booktitle = {NIPS 4},
3593 year = {-1},
3594 publisher = {Morgan Kaufmann}
3595 }
3596
3597
3598 @INPROCEEDINGS{NIPS5,
3599 editor = {Giles, C.L. and Hanson, S. J. and Cowan, J. D.},
3600 title = {Advances in Neural Information Processing Systems 5 (NIPS'92)},
3601 booktitle = {Advances in Neural Information Processing Systems 5 (NIPS'92)},
3602 year = {-1},
3603 publisher = {Morgan Kaufmann}
3604 }
3605
3606 @INPROCEEDINGS{NIPS5-short,
3607 editor = {Giles, C.L. and Hanson, S. J. and Cowan, J. D.},
3608 title = {Advances in Neural Information Processing Systems 5 (NIPS'92)},
3609 booktitle = {NIPS 5},
3610 year = {-1},
3611 publisher = {Morgan Kaufmann}
3612 }
3613
3614
3615 @INPROCEEDINGS{NIPS6-short,
3616 editor = {Cowan, J. D. and Tesauro, G. and Alspector, J.},
3617 title = {Advances in Neural Information Processing Systems 6 (NIPS'93)},
3618 booktitle = {NIPS 6},
3619 year = {-1},
3620 publisher = {MIT Press}
3621 }
3622
3623
3624 @INPROCEEDINGS{NIPS7-short,
3625 editor = {Tesauro, G. and Touretzky, D. S. and Leen, T. K.},
3626 title = {Advances in Neural Information Processing Systems 7 (NIPS'94)},
3627 booktitle = {NIPS 7},
3628 year = {-1},
3629 publisher = {MIT Press}
3630 }
3631
3632
3633 @INPROCEEDINGS{NIPS8-short,
3634 editor = {Touretzky, D. S. and Mozer, M. and Hasselmo, M.E.},
3635 title = {Advances in Neural Information Processing Systems 8 (NIPS'95)},
3636 booktitle = {NIPS 8},
3637 year = {-1},
3638 publisher = {MIT Press}
3639 }
3640
3641
3642 @INPROCEEDINGS{NIPS9-short,
3643 editor = {Mozer, M. and Jordan, M.I. and Petsche, T.},
3644 title = {Advances in Neural Information Processing Systems 9 (NIPS'96)},
3645 booktitle = {NIPS 9},
3646 year = {-1},
3647 publisher = {MIT Press}
3648 }
3649
3650
3651 @INPROCEEDINGS{nnlm:2001:nips,
3652 author = {Bengio, Yoshua and Ducharme, R{\'{e}}jean and Vincent, Pascal},
3653 title = {A Neural Probabilistic Language Model},
3654 year = {2001},
3655 crossref = {NIPS13-shorter},
3656 abstract = {A goal of statistical language modeling is to learn the joint probability function of sequences of words. This is intrinsically difficult because of the curse of dimensionality: we propose to fight it with its own weapons. In the proposed approach one learns simultaneously (1) a distributed representation for each word (i.e. a similarity between words) along with (2) the probability function for word sequences, expressed with these representations. Generalization is obtained because a sequence of words that
3657 has never been seen before gets high probability if it is made of words that are similar to words forming an already seen sentence. We report on experiments using neural networks for the probability function, showing on two text corpora that the proposed approach very significantly improves on a state-of-the-art trigram model.},
3658 topics={Markov,Unsupervised,Language},cat={C},
3659 }
3660
3661 @INPROCEEDINGS{nsvn:2000:ijcnn,
3662 author = {Vincent, Pascal and Bengio, Yoshua},
3663 title = {A Neural Support Vector Network Architecture with Adaptive Kernels},
3664 booktitle = {International Joint Conference on Neural Networks 2000},
3665 volume = {V},
3666 year = {2000},
3667 pages = {187--192},
3668 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/nsvn.pdf},
3669 abstract = {In the Support Vector Machines ({SVM}) framework, the positive-definite kernel can be seen as representing a fixed similarity measure between two patterns, and a discriminant function is obtained by taking a linear combination of the kernels computed at training examples called support vectors. Here we investigate learning architectures in which the kernel functions can be replaced by more general similarity measures that can have arbitrary internal parameters. The training criterion used in {SVM}s is not appropriate for this purpose so we adopt the simple criterion that is generally used when training neural networks for classification tasks. Several experiments are performed which show that such Neural Support Vector Networks perform similarly to {SVM}s while requiring significantly fewer support vectors, even when the similarity measure has no internal parameters.},
3670 topics={Kernel},cat={C},
3671 }
3672
3673 @INPROCEEDINGS{Ouimet+al-2005,
3674 author = {Ouimet, Marie and Bengio, Yoshua},
3675 editor = {Cowell, Robert G. and Ghahramani, Zoubin},
3676 title = {Greedy Spectral Embedding},
3677 booktitle = {Proceedings of the Tenth International Workshop on Artificial Intelligence and Statistics},
3678 year = {2005},
3679 pages = {253--260},
3680 publisher = {Society for Artificial Intelligence and Statistics},
3681 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/greedy-kernel-aistats05.pdf},
3682 abstract = {Spectral dimensionality reduction methods and spectral clustering methods require computation of the principal eigenvectors of an n X n matrix where n is the number of examples. Following up on previously proposed techniques to speed-up kernel methods by focusing on a subset of m examples, we study a greedy selection procedure for this subset, based on the feature space distance between a candidate example and the span of the previously chosen ones. In the case of kernel {PCA} or spectral clustering this reduces computation to $O(m^2 n)$. For the same computational complexity, we can also compute the feature space projection of the non-selected examples on the subspace spanned by the selected examples, to estimate the embedding function based on all the data, which yields considerably better estimation of the embedding function. This algorithm can be formulated in an online setting and we can bound the error on the approximation of the Gram matrix.},
3683 topics={HighDimensional,kenel},cat={C},
3684 }
3685
3686 @MASTERSTHESIS{Ouimet-Msc-2004,
3687 author = {Ouimet, Marie},
3688 keywords = {algorithmes voraces, apprentissage non-supervis{\'{e}}, m{\'{e}}thodes spectrales, noyaux, r{\'{e}}duction de dimensionnalit{\'{e}}},
3689 title = {R{\'{e}}duction de dimensionnalit{\'{e}} non lin{\'{e}}aire et vorace},
3690 year = {2004},
3691 school = {Universit{\'{e}} de Montr{\'{e}}al},
3692 abstract = {Les m{\'{e}}thodes spectrales de r{\'{e}}duction de dimensionnalit{\'{e}} et les m{\'{e}}thodes de segmentation spectrale exigent le calcul des vecteurs propres principaux d'une matrice de taille n x n o{\`{u}} n est le nombre d'exemples. Des techniques ont {\'{e}}t{\'{e}} propos{\'{e}}es dans la litt{\'{e}}rature pour acc{\'{e}}l{\'{e}}rer les m{\'{e}}thodes {\`{a}} noyau en se concentrant sur un sous-ensemble de m exemples. Nous proposons une proc{\'{e}}dure vorace pour la s{\'{e}}lection de ce sous-ensemble, qui est bas{\'{e}}e sur la distance dans l'espace des caract{\'{e}}ristiques entre un exemple candidat et le sous-espace g{\'{e}}n{\'{e}}r{\'{e}} par les exemples pr{\'{e}}c{\'{e}}demment choisis. Dans le cas de l'ACP {\`{a}} noyau ou de la segmentation spectrale, nous obtenons un algorithme en O(m*m*n), o{\`{u}} m << n, qui, contrairement aux techniques pr{\'{e}}c{\'{e}}demment propos{\'{e}}es, peut se formuler de fa{\c c}on en-ligne. Pour la m{\^{e}}me complexit{\'{e}} en temps, nous pouvons {\'{e}}galement calculer la projection des exemples non choisis sur le sous-espace engendr{\'{e}} par les exemples choisis dans l'espace des caract{\'{e}}ristiques. En repr{\'{e}}sentant ainsi les exemples par leur projection nous obtenons une approximation de plus faible rang de la matrice de Gram sur toutes les donn{\'{e}}es. Nous pouvons {\'{e}}galement borner l'erreur correspondant {\`{a}} cette approximation de la matrice de Gram.}
3693 }
3694
3695 @ARTICLE{paiement+bengio+eck:aij,
3696 author = {Paiement, Jean-Fran{\c c}ois and Bengio, Samy and Eck, Douglas},
3697 title = {Probabilistic Models for Melodic Prediction},
3698 journal = {Artificial Intelligence Journal},
3699 volume = {173},
3700 year = {2009},
3701 pages = {1266--1274},
3702 source={OwnPublication},
3703 sourcetype={Journal},
3704 }
3705
3706 @inproceedings{paiement+eck+bengio+barber:icml2005,
3707   author     = {Paiement, Jean-Fran{\c c}ois and Eck, Douglas and Bengio, Samy and Barber, D.},
3708   title      = {A graphical model for chord progressions embedded in a psychoacoustic space},
3709   crossref   = {ICML05},
3710   publisher  = {ACM Press},
3711   year       = {2005},
3712   pages      = {641--648},
3713   sourcetype = {Conference},
3714   source     = {OwnPublication},
3715 }
3716
3717 @INPROCEEDINGS{paiement+eck+bengio:ccai2006,
3718 author = {Paiement, Jean-Fran{\c c}ois and Eck, Douglas and Bengio, Samy},
3719 editor = {Lamontagne, Luc and Marchand, Mario},
3720 title = {Probabilistic Melodic Harmonization},
3721 booktitle = {Canadian Conference on AI},
3722 series = {Lecture Notes in Computer Science},
3723 volume = {4013},
3724 year = {2006},
3725 pages = {218--229},
3726 publisher = {Springer},
3727 source={OwnPublication},
3728 sourcetype={Conference},
3729 }
3730
3731 @INPROCEEDINGS{paiement+eck+bengio:ismir2005,
3732 author = {Paiement, Jean-Fran{\c c}ois and Eck, Douglas and Bengio, Samy},
3733 title = {A Probabilistic Model for Chord Progressions},
3734 booktitle = {Proceedings of the 6th International Conference on Music Information Retrieval ({ISMIR} 2005)},
3735 year = {2005},
3736 pages = {312--319},
3737 source={OwnPublication},
3738 sourcetype={Conference},
3739 }
3740
3741 @INPROCEEDINGS{paiement+grandvalet+bengio+eck:icml2008,
3742 author = {Paiement, Jean-Fran{\c c}ois and Grandvalet, Yves and Bengio, Samy and Eck, Douglas},
3743 title = {A generative model for rhythms},
3744 year = {2008},
3746 crossref = {ICML06-shorter},
3747 source={OwnPublication},
3748 sourcetype={Conference},
3749 }
3750
3751 @unpublished{paiement+grandvalet+bengio+eck:nipsworkshop2007,
3752   author     = {Paiement, Jean-Fran{\c c}ois and Grandvalet, Yves and Bengio, Samy and Eck, Douglas},
3753   title      = {A generative model for rhythms},
3754   note       = {NIPS 2007 Workshop on Music, Brain and Cognition},
3755   year       = {2007},
3756   sourcetype = {Workshop},
3757   source     = {OwnPublication},
3758   optannote  = {""},
3759   optkey     = {""},
3760   optmonth   = {""},
3761 }
3762
3763 @MASTERSTHESIS{Paiement-Msc-2003,
3764 author = {Paiement, Jean-Fran{\c c}ois},
3765 keywords = {algorithmes, apprentissage, apprentissage non supervis{\'{e}}, forage de donn{\'{e}}es, noyaux, r{\'{e}}duction de dimensions, statistique, Statistiques},
3766 title = {G{\'{e}}n{\'{e}}ralisation d'algorithmes de r{\'{e}}duction de dimension},
3767 year = {2003},
3768 school = {Universit{\'{e}} de Montr{\'{e}}al},
3769 abstract = {On pr{\'{e}}sente tout d'abord la notion de vari{\'{e}}t{\'{e}} comme r{\'{e}}gion de faible dimension contenant des observations situ{\'{e}}es dans un espace de haute dimension. Cette d{\'{e}}finition justifie l'{\'{e}}laboration d'algorithmes permettant d'exprimer les donn{\'{e}}es dans un syst{\`{e}}me de coordonn{\'{e}}es de dimension {\'{e}}gale {\`{a}} celle de la vari{\'{e}}t{\'{e}} sur laquelle les donn{\'{e}}es sont approximativement situ{\'{e}}es.
3770 La notion de noyau comme mesure de similarit{\'{e}} est par la suite formalis{\'{e}}e. On constate que l'application d'un noyau {\`{a}} deux observations correspond {\`{a}} l'{\'{e}}valuation d'un produit scalaire dans un espace de Hilbert appel{\'{e}} espace de caract{\'{e}}ristiques.
3771 Une m{\'{e}}thode de r{\'{e}}duction de dimension lin{\'{e}}aire est expos{\'{e}}e ainsi que ses limites. Des algorithmes non lin{\'{e}}aires de r{\'{e}}duction de dimension et de segmentation permettent de s'affranchir de ces limites. Ces derniers ne fournissent cependant pas d'extension directe {\`{a}} des points hors {\'{e}}chantillon.
3772 L'{\'{e}}tape fondamentale au sein des algorithmes pr{\'{e}}sent{\'{e}}s est la solution d'un syst{\`{e}}me de vecteurs propres d'une matrice sym{\'{e}}trique cr{\'{e}}{\'{e}}e {\`{a}} partir d'un noyau d{\'{e}}pendant des donn{\'{e}}es. On con{\c c}oit ce probl{\`{e}}me comme le fait de trouver les fonctions propres d'un op{\'{e}}rateur lin{\'{e}}aire d{\'{e}}fini {\`{a}} partir du m{\^{e}}me noyau. On utilise alors la formulation de Nystr{\"{o}}m, pr{\'{e}}sente dans l'espace en composantes principales {\`{a}} noyaux, afin de r{\'{e}}duire la dimension des points hors {\'{e}}chantillon sur la base des plongements obtenus {\`{a}} l'aide des algorithmes d{\'{e}}j{\`{a}} mentionn{\'{e}}s.
3773 La qualit{\'{e}} de la projection g{\'{e}}n{\'{e}}r{\'{e}}e est compar{\'{e}}e {\`{a}} la perturbation intrins{\`{e}}que des algorithmes si on substitue certaines observations par d'autres tir{\'{e}}es de la m{\^{e}}me distribution.}
3774 }
3775
3776 @ARTICLE{perez+gers+schmidhuber+eck:2002,
3777 author = {Perez-Ortiz, J. A. and Gers, F. A. and Eck, Douglas and Schmidhuber, Juergen},
3778 title = {{K}alman filters improve {LSTM} network performance in problems unsolvable by traditional recurrent nets},
3779 journal = {Neural Networks},
3780 volume = {16},
3781 number = {2},
3782 year = {2003},
3783 pages = {241--250},
3784 abstract = {The Long Short-Term Memory ({LSTM}) network trained by gradient descent solves difficult problems which traditional recurrent neural networks in general cannot. We have recently observed that the decoupled extended Kalman filter training algorithm allows for even better performance, reducing significantly the number of training steps when compared to the original gradient descent training algorithm. In this paper we present a set of experiments which are unsolvable by classical recurrent networks but which are solved elegantly and robustly and quickly by {LSTM} combined with Kalman filters.},
3785 source={OwnPublication},sourcetype={Journal},
3786 }
3787
3788 @article{perez+gers+schmidhuber+eck:2003,
3789   author     = {Perez-Ortiz, J. A. and Gers, F. A. and Eck, Douglas and Schmidhuber, Juergen},
3790   title      = {{K}alman filters improve {LSTM} network performance in problems unsolvable by traditional recurrent nets},
3791   journal    = {Neural Networks},
3792   year       = {2003},
3793   volume     = {16},
3794   number     = {2},
3795   pages      = {241--250},
3796   abstract   = {The Long Short-Term Memory ({LSTM}) network trained by gradient descent solves difficult problems which traditional recurrent neural networks in general cannot. We have recently observed that the decoupled extended Kalman filter training algorithm allows for even better performance, reducing significantly the number of training steps when compared to the original gradient descent training algorithm. In this paper we present a set of experiments which are unsolvable by classical recurrent networks but which are solved elegantly and robustly and quickly by {LSTM} combined with Kalman filters.},
3797   sourcetype = {Journal},
3798   source     = {OwnPublication},
3799 }
3800
3801 @INPROCEEDINGS{perez+schmidhuber+gers+eck:icannB2002,
3802 author = {Perez-Ortiz, J. A. and Schmidhuber, Juergen and Gers, F. A. and Eck, Douglas},
3803 editor = {Dorronsoro, J.},
3804 title = {Improving Long-Term Online Prediction with {Decoupled Extended Kalman Filters}},
3805 booktitle = {Artificial Neural Networks -- {ICANN} 2002 (Proceedings)},
3806 year = {2002},
3807 pages = {1055--1060},
3808 publisher = {Springer},
3809 abstract = {Long Short-Term Memory ({LSTM}) recurrent neural networks ({RNN}s) outperform traditional {RNN}s when dealing with sequences involving not only short-term but also long-term dependencies. The decoupled extended Kalman filter learning algorithm ({DEKF}) works well in online environments and reduces significantly the number of training steps when compared to the standard gradient-descent algorithms. Previous work on {LSTM}, however, has always used a form of gradient descent and has not focused on true online situations. Here we combine {LSTM} with {DEKF} and show that this new hybrid improves upon the original learning algorithm when applied to online processing.},
3810 source={OwnPublication},
3811 sourcetype={Conference},
3812 }
3813
3814 @TECHREPORT{Pigeon-Bengio-96-aH-TR,
3815 author = {Pigeon, Steven and Bengio, Yoshua},
3816 title = {A Memory-Efficient Huffman Adaptive Coding Algorithm for Very Large Sets of Symbols},
3817 number = {1081},
3818 year = {1997},
3819 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
3820 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/HuffAdapt.pdf},
3821 abstract = {The problem of computing the minimum redundancy codes as we observe symbols one by one has received a lot of attention. However, existing algorithm implicitly assumes that either we have a small alphabet --- quite typically 256 symbols --- or that we have an arbitrary amount of memory at our disposal for the creation of the tree. In real life applications one may need to encode symbols coming from a much larger alphabet, for e.g. coding integers. We now have to deal not with hundreds of symbols but possibly with millions of symbols. While other algorithms use a space proportional to the number of observed symbol, we here propose one that uses space proportional to the number of frequency classes, which is, quite interestingly, always smaller or equal to the number of observed symbols.},
3822 topics={Compression},cat={T},
3823 }
3824
3825 @INPROCEEDINGS{Pigeon-dcc98,
3826 author = {Pigeon, Steven and Bengio, Yoshua},
3827 editor = {{IEEE Computer Society}},
3828 title = {A Memory-Efficient Adaptive Huffman Coding Algorithm for Very Large Sets of Symbols},
3829 booktitle = {Data Compression Conference},
3830 year = {1998},
3831 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/dcc98.pdf},
3832 abstract = {The problem of computing the minimum redundancy codes as we observe symbols one by one has received a lot of attention. However, existing algorithms implicitly assumes that either we have a small alphabet --- quite typically 256 symbols --- or that we have an arbitrary amount of memory at our disposal for the creation of the tree. In real life applications one may need to
3833 encode symbols coming from a much larger alphabet, for e.g. coding integers. We now have to deal not with hundreds of symbols but possibly with millions of symbols. While other algorithms use a space proportional to the number of observed symbols, we here propose one that uses space proportional to the number of frequency classes, which is, quite interestingly, always smaller or equal to the size of the alphabet.},
3834 topics={Compression},cat={C},
3835 }
3836
3837 @INPROCEEDINGS{Pigeon-dcc99,
3838 author = {Pigeon, Steven and Bengio, Yoshua},
3839 editor = {{IEEE Computer Society}},
3840 title = {Binary Pseudowavelets and Applications to Bilevel Image Processing},
3841 booktitle = {Data Compression Conference},
3842 year = {1999},
3843 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/dcc99.pdf},
3844 abstract = {This paper shows the existence of binary pseudowavelets, bases on the binary domain that exhibit some of the properties of wavelets, such as multiresolution reconstruction and compact support. The binary pseudowavelets are defined on _n (binary vectors of length n) and are operated upon with the binary operators logical and and exclusive or. The forward transform, or analysis, is the decomposition of a binary vector into its constituent binary pseudowavelets. Binary pseudowavelets allow multiresolution, progressive reconstruction of binary vectors by using progressively more coefficients in the inverse transform. Binary pseudowavelets bases, being sparse matrices, also provide for fast transforms; moreover pseudowavelets rely on hardware-friendly operations for efficient software and hardware implementation.},
3845 topics={Compression},cat={C},
3846 }
3847
3848 @TECHREPORT{Pigeon-Huffman-TR98,
3849 author = {Pigeon, Steven and Bengio, Yoshua},
3850 title = {A Memory-Efficient Adaptive Huffman Coding for Very Large Sets of Symbols revisited},
3851 number = {1095},
3852 year = {1998},
3853 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
3854 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TechRep_AdaptativeHuffman2.pdf},
3855 abstract = {While algorithm M (presented in A Memory-Efficient Huffman Adaptive Coding Algorithm for Very Large Sets of Symbols, by Steven Pigeon \& Yoshua Bengio, Universit{\'{e}} de Montr{\'{e}}al technical report \#1081 [1]) converges to the entropy of the signal, it also assumes that the characteristics of the signal are stationary, that is, that they do not change over time and that successive adjustments, ever decreasing in their magnitude, will lead to a reasonable approximation of the entropy. While this is true for some data, it is clearly not true for some other. We present here a modification of the M algorithm that allows negative updates. Negative updates are used to maintain a window over the source. Symbols enter the window at its right and will leave it at its left, after w steps (the window width). The algorithm presented here allows us to update correctly the weights of the symbols in the symbol tree. Here, we will also have negative migration or demotion, while we only had positive migration or promotion in M. This algorithm will be called M+.},
3856 topics={Compression},cat={T},
3857 }
3858
3859 @PHDTHESIS{Pigeon-Phd-2001,
3860 author = {Pigeon, Steven},
3861 keywords = {algorithmes, codes adaptatifs, codes de Golomb, codes universels, Compression de donn{\'{e}}es, compression LZ78, LZW, ondelettes, pseudo-ondelettes},
3862 title = {Contributions {\`{a}} la compression de donn{\'{e}}es},
3863 year = {2001},
3864 school = {Universit{\'{e}} de Montr{\'{e}}al},
3865 abstract = {L'objectif de cette th{\`{e}}se est de pr{\'{e}}senter nos contributions {\`{a}} la compression de donn{\'{e}}es. Le texte entier n'est pas consacr{\'{e}} {\`{a}} nos seules contributions. Une large part est consacr{\'{e}}e au mat{\'{e}}riel introductif et {\`{a}} la recension de litt{\'{e}}rature sur les sujets qui sont pertinents {\`{a}} nos contributions. Le premier chapitre de contribution, le chapitre "Contribution au codage des entiers" se concentre sur le probl{\`{e}}me de la g{\'{e}}n{\'{e}}ration de codes efficaces pour les entiers. Le chapitre "Codage Huffman Adaptatif" pr{\'{e}}sente deux nouveaux algorithmes pour la g{\'{e}}n{\'{e}}ration dynamique de codes structur{\'{e}}s en arbre, c'est-{\`{a}}-dire des codes de type Huffman. Le chapitre "LZW avec une perte" explore le probl{\`{e}}me de la compression d'images comportant un petit nombre de couleurs distinctes et propose une extension avec perte d'un algorithme originalement sans perte, LZW. Enfin, le dernier chapitre de contribution, le chapitre "Les pseudo-ondelettes binaires" pr{\'{e}}sente une solution originale au probl{\`{e}}me de l'analyse multir{\'{e}}solution des images monochromes, c'est-{\`{a}}-dire des images n'ayant que deux couleurs, conventionnellement noir et blanc. Ce type d'image correspond par exemple aux images textuelles telles que produites par un processus de transmission de type facsimil{\'{e}}.}
3866 }
3867
3868 @article{Pigeon98,
3869   author  = {Pigeon, Steven and Bengio, Yoshua},
3870   title   = {Memory-Efficient Adaptive Huffman Coding},
3871   journal = {Dr. Dobb's Journal},
3872   year    = {1998},
3873   volume  = {290},
3874   pages   = {131--135},
3875   cat     = {J}, topics = {Compression},
3876 }
3877
3878 @inproceedings{probnn:2000:ijcnn,
3879   author    = {Bengio, Yoshua},
3880   title     = {Probabilistic Neural Network Models for Sequential Data},
3881   booktitle = {International Joint Conference on Neural Networks 2000},
3882   year      = {2000},
3883   volume    = {V},
3884   pages     = {79--84},
3885   url       = {http://www.iro.umontreal.ca/~lisa/pointeurs/81_01.PDF},
3886   abstract  = {It has already been shown how Artificial Neural Networks ({ANN}s) can be incorporated into probabilistic models.
3887 In this paper we review some of the approaches which have been proposed to incorporate them into probabilistic
3888 models of sequential data, such as Hidden {Markov} Models ({HMM}s). We also discuss new developments and new
3889 ideas in this area, in particular how {ANN}s can be used to model high-dimensional discrete and continuous data to
3890 deal with the curse of dimensionality, and how the ideas proposed in these models could be applied to statistical
3891 language modeling to represent longer-term context than allowed by trigram models, while keeping word-order
3892 information.},
3893   cat       = {C}, topics = {Markov},
3894 }
3895
3896 @unpublished{pugin+burgoyne+eck+fujinaga:nipsworkshop2007,
3897   author     = {Pugin, L. and Burgoyne, J. A. and Eck, Douglas and Fujinaga, I.},
3898   title      = {Book-adaptive and book-dependant models to accelerate digitalization of early music},
3899   note       = {NIPS 2007 Workshop on Music, Brain and Cognition},
3900   year       = {2007},
3901   sourcetype = {Workshop},
3902   source     = {OwnPublication},
3903   optannote  = {""},
3904   optkey     = {""},
3905   optmonth   = {""},
3906 }
3907
3908 @inproceedings{Rahim-97,
3909   author    = {Rahim, Mazin and Bengio, Yoshua and {LeCun}, Yann},
3910   title     = {Discriminative feature and model design for automatic speech recognition},
3911   booktitle = {Proceedings of Eurospeech 1997},
3912   pages     = {75--78},
3913   year      = {1997},
3914   url       = {http://www.iro.umontreal.ca/~lisa/pointeurs/rahim-bengio-lecun-97.ps.gz},
3915   abstract  = {A system for discriminative feature and model design is presented for automatic speech recognition. Training based on minimum classification error with a single objective function is applied for designing a set of parallel networks performing feature transformation and a set of hidden {Markov} models performing speech recognition. This paper compares the use of linear and non-linear functional transformations when applied to conventional recognition features, such as spectrum or cepstrum. It also provides a framework for integrated feature and model training when using class-specific transformations. Experimental results on telephone-based connected digit recognition are presented.},
3916   cat       = {C}, topics = {Speech},
3917 }
3918
3919 @article{Rivest-2009,
3920   author   = {Rivest, Fran{\c c}ois and Kalaska, John and Bengio, Yoshua},
3921   title    = {Alternative Time Representations in Dopamine Models},
3922   journal  = {Journal of Computational Neuroscience},
3923   year     = {2009},
3924   volume   = {28},
3925   number   = {1},
3926   pages    = {107--130},
3927   abstract = {Dopaminergic neuron activity has been modeled during learning and appetitive behavior, most commonly using the temporal-difference (TD) algorithm. However, a proper representation of elapsed time and of the exact task is usually required for the model to work. Most models use timing elements such as delay-line representations of time that are not biologically realistic for intervals in the range of seconds. The interval-timing literature provides several alternatives. One of them is that timing could emerge from general network dynamics, instead of coming from a dedicated circuit. Here, we present a general rate-based learning model based on long short-term memory ({LSTM}) networks that learns a time representation when needed. Using a na{\"{\i}}ve network learning its environment in conjunction with TD, we reproduce dopamine activity in appetitive trace conditioning with a constant CS-US interval, including probe trials with unexpected delays. The proposed model learns a representation of the environment dynamics in an adaptive biologically plausible framework, without recourse to delay lines or other special-purpose circuits. Instead, the model predicts that the task-dependent representation of time is learned by experience, is encoded in ramp-like changes in single-neuron activity distributed across small neural networks, and reflects a temporal integration mechanism resulting from the inherent dynamics of recurrent loops within the network. The model also reproduces the known finding that trace conditioning is more difficult than delay conditioning and that the learned representation of the task can be highly dependent on the types of trials experienced during training. Finally, it suggests that the phasic dopaminergic signal could facilitate learning in the cortex.}
3928 }
3929
3930 @article{schmidhuber+gers+eck:2002,
3931   author     = {Schmidhuber, Juergen and Gers, F. A. and Eck, Douglas},
3932   title      = {Learning Nonregular Languages: A Comparison of Simple Recurrent Networks and {LSTM}},
3933   journal    = {Neural Computation},
3934   year       = {2002},
3935   volume     = {14},
3936   number     = {9},
3937   pages      = {2039--2041},
3938   abstract   = {In response to Rodriguez' recent article (Rodriguez 2001) we compare the performance of simple recurrent nets and {\em ``Long Short-Term Memory''} ({LSTM}) recurrent nets on context-free and context-sensitive languages.},
3939   sourcetype = {Journal},
3940   source     = {OwnPublication},
3941 }
3942
3943 @TECHREPORT{Schwenk-Bengio-97-TR,
3944 author = {Schwenk, Holger and Bengio, Yoshua},
3945 title = {Adaptive Boosting of Neural Networks for Character Recognition},
3946 number = {1072},
3947 year = {1997},
3948 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
3949 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/AdaBoostTR.pdf},
3950 abstract = {``Boosting'' is a general method for improving the performance of any learning algorithm that consistently generates classifiers which need to perform only slightly better than random guessing. A recently proposed and very promising boosting algorithm is AdaBoost [5]. It has been applied with great success to several benchmark machine learning problems using rather simple learning algorithms [4], in particular decision trees [1, 2, 6]. In this paper we use AdaBoost to improve the performances of neural networks applied to character recognition tasks. We compare training methods based on sampling the training set and weighting the cost function. Our system achieves about 1.4\% error on a data base of online handwritten digits from more than 200 writers. Adaptive boosting of a multi-layer network achieved 2\% error on the UCI Letters offline characters data set.},
3951 topics={Boosting,Speech},cat={T},
3952 }
3953
3954 @INPROCEEDINGS{Schwenk-nips10,
3955 author = {Schwenk, Holger and Bengio, Yoshua},
3956 title = {Training Methods for Adaptive Boosting of Neural Networks for Character Recognition},
3957 year = {1998},
3958 crossref = {NIPS10-shorter},
3959 abstract = {``Boosting'' is a general method for improving the performance of any learning algorithm that consistently generates classifiers which need to perform only slightly better than random guessing. A recently proposed and very promising boosting algorithm is AdaBoost [5]. It has been applied with great success to several benchmark machine learning problems using rather simple learning algorithms [4], in particular decision trees [1, 2, 6]. In this paper we use AdaBoost to improve the performances of neural networks applied to character recognition tasks. We compare training methods based on sampling the training set and weighting the cost function. Our system achieves about 1.4\% error on a data base of online handwritten digits from more than 200 writers. Adaptive boosting of a multi-layer network achieved 2\% error on the UCI Letters offline characters data set.},
3960 topics={Boosting,Speech},cat={C},
3961 }
3962
3963 @ARTICLE{Schwenk2000,
3964 author = {Schwenk, Holger and Bengio, Yoshua},
3965 title = {Boosting Neural Networks},
3966 journal = {Neural Computation},
3967 volume = {12},
3968 number = {8},
3969 year = {2000},
3970 pages = {1869--1887},
3971 abstract = {``Boosting'' is a general method for improving the performance of learning algorithms. A recently proposed boosting algorithm is AdaBoost. It has been applied with great success to several benchmark machine learning problems using mainly decision trees as base classifiers. In this paper we investigate whether AdaBoost also works as well with neural networks, and we discuss the advantages and drawbacks of different versions of the AdaBoost algorithm. In particular, we compare training methods based on sampling the training set and weighting the cost function. The results suggest that random resampling of the training data is not the main explanation of the success of the improvements brought by AdaBoost. This is in contrast to Bagging which directly aims at reducing variance and for which random resampling is essential to obtain the reduction in generalization error. Our system achieves about 1.4\% error on a data set of online handwritten digits from more than 200 writers. A boosted multi-layer network achieved 1.5\% error on the UCI Letters and 8.1\% error on the UCI satellite data set, which is significantly better than boosted decision trees.},
3972 topics={Boosting},cat={J},
3973 }
3974
3975 @inproceedings{secondorder:2001:nips,
3976   author   = {Dugas, Charles and Bengio, Yoshua and Belisle, Francois and Nadeau, Claude and Garcia, Rene},
3977   title    = {Incorporating Second-Order Functional Knowledge for Better Option Pricing},
3978   crossref = {NIPS13-shorter},
3979   year     = {2001},
3980   abstract = {Incorporating prior knowledge of a particular task into the architecture of a learning algorithm can greatly improve generalization performance. We study here a case where we know that the function to be learned is non-decreasing in two of its arguments and convex in one of them. For this purpose we propose a class of functions similar to multi-layer neural networks but (1) that has those properties, (2) is a universal approximator of continuous functions with these and other properties. We apply this new class of functions to the task of modeling the price of call options. Experiments show improvements on regressing the price of call options using the new types of function classes that incorporate the a priori constraints.},
3981   cat      = {C}, topics = {Finance},
3982 }
3983
3984 @ARTICLE{Sonnenburg+al-2007,
3985 author = {Sonnenburg, Soeren and Braun, M. L. and Ong, C. S. and Bengio, Samy and Bottou, L. and Holmes, G. and {LeCun}, Yann and M{\"{u}}ller, K.-R. and Pereira, F. and Rasmussen, C. E. and R{\"{a}}tsch, G. and Sch{\"{o}}lkopf, B. and Smola, A. and Vincent, Pascal and Weston, J. and Williamson, R. C.},
3986 title = {The Need for Open Source Software in Machine Learning},
3987 year = {2007},
3988 note = {institution: Fraunhofer Publica [http://publica.fraunhofer.de/oai.har] (Germany)},
3989 journal = {Journal of Machine Learning Research},
3990 abstract = {all authors: Sonnenburg, S. and Braun, M.L. and Ong, C.S. and Bengio, S. and Bottou, L. and Holmes, G. and {LeCun}, Y. and M{\"{u}}ller, K.-R. and Pereira, F. and Rasmussen, C.E. and R{\"{a}}tsch, G. and Sch{\"{o}}lkopf, B. and Smola, A. and Vincent, P. and Weston, J. and Williamson, R.C.
3991
3992 Open source tools have recently reached a level of maturity which makes them suitable for building large-scale real-world systems. At the same time, the field of machine learning has developed a large body of powerful learning algorithms for diverse applications. However, the true potential of these methods is not used, since existing implementations are not openly shared, resulting in software with low usability, and weak interoperability. We argue that this situation can be significantly improved by increasing incentives for researchers to publish their software under an open source model. Additionally, we outline the problems authors are faced with when trying to publish algorithmic implementations of machine learning methods. We believe that a resource of peer reviewed software accompanied by short articles would be highly valuable to both the machine learning and the general scientific community.}
3993 }
3994
3995 @ARTICLE{Takeuchi-Bengio-Kanamori-2002,
3996 author = {Takeuchi, Ichiro and Bengio, Yoshua and Kanamori, Takafumi},
3997 title = {Robust Regression with Asymmetric Heavy-Tail Noise Distributions},
3998 journal = {Neural Computation},
3999 volume = {14},
4000 number = {10},
4001 year = {2002},
4002 pages = {2469--2496},
4003 abstract = {In the presence of a heavy-tail noise distribution, regression becomes much more difficult. Traditional robust regression methods assume that the noise distribution is symmetric and they down-weight the influence of so-called outliers. When the noise distribution is asymmetric these methods yield biased regression estimators. Motivated by data-mining problems for the insurance industry, we propose in this paper a new approach to robust regression that is tailored to deal with the case where the noise distribution is asymmetric. The main idea is to learn most of the parameters of the model using conditional quantile estimators (which are biased but robust estimators of the regression), and to learn a few remaining parameters to combine and correct these estimators, to unbiasedly minimize the average squared error. Theoretical analysis and experiments show the clear advantages of the approach. Results are on artificial data as well as real insurance data, using both linear and neural-network predictors.},
4004 topics={Mining},cat={J},
4005 }
4006
4007 @ARTICLE{Thierry+al-2008,
4008 author = {Bertin-Mahieux, Thierry and Eck, Douglas and Maillet, Fran{\c c}ois and Lamere, Paul},
4009 title = {Autotagger: A Model For Predicting Social Tags from Acoustic Features on Large Music Databases},
4010 journal = {Journal of New Music Research},
4011 year = {2008},
4012 abstract = {Social tags are user-generated keywords associated with some resource on the Web. In the case of music, social tags have become an important component of ``Web 2.0'' recommender systems, allowing users to generate playlists based on use-dependent terms such as chill or jogging that have been applied to particular songs. In this paper, we propose a method for predicting these social tags directly from MP3 files. Using a set of 360 classifiers trained using the online ensemble learning algorithm FilterBoost, we map audio features onto social tags collected from the Web. The resulting automatic tags (or autotags) furnish information about music that is otherwise untagged or poorly tagged, allowing for insertion of previously unheard music into a social recommender. This avoids the ``cold-start problem'' common in such systems. Autotags can also be used to smooth the tag space from which similarities and
4013 recommendations are made by providing a set of comparable baseline tags for all tracks in a recommender system. Because the words we learn are the same as those used by people who label their music collections, it is easy to integrate our predictions into existing similarity and prediction methods based on web data.}
4014 }
4015
4016 @ARTICLE{Thivierge+al-2007,
4017 author = {Thivierge, J.-P. and Rivest, Fran{\c c}ois and Monchi, O.},
4018 title = {Spiking Neurons, Dopamine, and Plasticity: Timing Is Everything, But Concentration Also Matters},
4019 journal = {Synapse},
4020 volume = {61},
4021 year = {2007},
4022 pages = {375--390},
4023 abstract = {While both dopamine (DA) fluctuations and spike-timing-dependent plasticity (STDP) are known to influence long-term corticostriatal plasticity, little attention has been devoted to the interaction between these two fundamental mechanisms. Here, a theoretical framework is proposed to account for experimental results specifying the role of presynaptic activation, postsynaptic activation, and concentrations of extracellular DA in synaptic plasticity. Our starting point was an explicitly-implemented multiplicative rule linking STDP to Michaelis-Menten equations that models the dynamics of extracellular DA fluctuations. This rule captures a wide range of results on conditions leading to long-term potentiation and depression in simulations that manipulate the frequency of induced corticostriatal stimulation and DA release. A well-documented biphasic function relating DA concentrations to synaptic plasticity emerges naturally from simulations involving a multiplicative rule linking DA and neural activity. This biphasic function is found consistently across different neural coding schemes employed (voltage-based vs. spike-based models). By comparison, an additive rule fails to capture these results. The proposed framework is the first to generate testable predictions on the dual influence of DA concentrations and STDP on long-term plasticity, suggesting a way in which the biphasic influence of DA concentrations can modulate the direction and magnitude of change induced by STDP, and raising the possibility that DA concentrations may inverse the LTP/LTD components of the STDP rule.}
4024 }
4025
4026 @TECHREPORT{tonga-tr,
4027 author = {Le Roux, Nicolas and Manzagol, Pierre-Antoine and Bengio, Yoshua},
4028 title = {Topmoumoute online natural gradient algorithm},
4029 number = {1299},
4030 year = {2007},
4031 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4032 abstract = {Guided by the goal of obtaining an optimization algorithm that is
4033 both fast and yielding good generalization, we study the descent direction maximizing the decrease in generalization error or the probability of not increasing generalization error. The surprising result is that from both the Bayesian and frequentist perspectives this can yield the natural gradient direction. Although that direction can be very expensive to compute we develop an efficient, general, online approximation to the natural gradient descent which is suited to large scale problems. We report experimental results showing much faster convergence in computation time and in number of iterations with TONGA (Topmoumoute Online Natural Gradient Algorithm) than with stochastic gradient descent, even on very large datasets.}
4034 }
4035
4036 @TECHREPORT{TR1197,
4037 author = {Vincent, Pascal and Bengio, Yoshua},
4038 title = {K-Local Hyperplane and Convex Distance Nearest Neighbor Algorithms},
4039 number = {1197},
4040 year = {2001},
4041 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4042 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TR1197.pdf},
4043 abstract = {Guided by an initial idea of building a complex (non linear) decision surface with maximal local margin in input space, we give a possible geometrical intuition as to why K-Nearest Neighbor ({KNN}) algorithms often perform more poorly than {SVM}s on classification tasks. We then propose modified K-Nearest Neighbor algorithms to overcome the perceived problem. The approach is similar in spirit to Tangent Distance, but with invariances inferred from the local neighborhood rather than prior knowledge. Experimental results on real world classification tasks suggest that the modified {KNN} algorithms often give a dramatic improvement over standard {KNN} and perform as well or better than {SVM}s.},
4044 topics={Kernel},cat={T},
4045 }
4046
4047 @TECHREPORT{TR1198,
4048 author = {Takeuchi, Ichiro and Bengio, Yoshua and Kanamori, Takafumi},
4049 title = {Robust Regression with Asymmetric Heavy-Tail Noise},
4050 number = {1198},
4051 year = {2001},
4052 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4053 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TR1198.pdf},
4054 abstract = {In the presence of a heavy-tail noise distribution, regression becomes much more difficult. Traditional robust regression methods assume that the noise distribution is symmetric and they downweight the influence of so-called outliers. When the noise distribution is asymmetric these methods yield strongly biased regression estimators. Motivated by data-mining problems for the insurance industry, we propose in this paper a new approach to robust regression that is tailored to deal with the case where the noise distribution is asymmetric. The main idea is to learn most of the parameters of the model using conditional quantile estimators (which are biased but robust estimators of the regression), and to learn a few remaining parameters to combine and correct these estimators, to minimize the average squared error. Theoretical analysis and experiments show the clear advantages of the approach. Results are on artificial data as well as real insurance data, using both linear and neural-network predictors.},
4055 topics={Mining},cat={T},
4056 }
4057
4058 @TECHREPORT{TR1199,
4059 author = {Chapados, Nicolas and Bengio, Yoshua and Vincent, Pascal and Ghosn, Joumana and Dugas, Charles and Takeuchi, Ichiro and Meng, Linyan},
4060 title = {Estimating Car Insurance Premia: a Case Study in High-Dimensional Data Inference},
4061 number = {1199},
4062 year = {2001},
4063 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4064 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TR1199.pdf},
4065 abstract = {Estimating insurance premia from data is a difficult regression problem for several reasons: the large number of variables, many of which are discrete, and the very peculiar shape of the noise distribution, asymmetric with fat tails, with a large majority zeros and a few unreliable and very large values. We introduce a methodology for estimating insurance premia that has been applied in the car insurance industry. It is based on mixtures of specialized neural networks, in order to reduce the effect of outliers on the estimation. Statistical comparisons with several different alternatives, including decision trees and generalized linear models show that the proposed method is significantly more precise, allowing to identify the least and most risky contracts, and reducing the median premium by charging more to the most risky customers.},
4066 topics={HighDimensional,Mining},cat={T},
4067 }
4068
4069 @TECHREPORT{TR1200,
4070 author = {Bengio, Yoshua and Chapados, Nicolas},
4071 title = {Extending Metric-Based Model Selection and Regularization in the Absence of Unlabeled Data},
4072 number = {1200},
4073 year = {2001},
4074 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4075 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TR1200.ps},
4076 abstract = {Metric-based methods have recently been introduced for model selection and regularization, often yielding very significant improvements over all the alternatives tried (including cross-validation). However, these methods require a large set of unlabeled data, which is not always available in many applications. In this paper we extend these methods (TRI, ADJ and ADA) to the case where no unlabeled data is available. The extended methods (xTRI, xADJ, xADA) use a model of the input density directly estimated from the training set. The intuition is that the main reason why the above methods work well is that they make sure that the learned function behaves similarly on the training points and on “neighboring” points. The experiments are based on estimating a simple non-parametric density model. They show that the extended methods perform comparably to the originals even though no unlabeled data is used.},
4077 topics={ModelSelection,Finance},cat={T},
4078 }
4079
4080 @TECHREPORT{TR1215,
4081 author = {Bengio, Yoshua},
4082 title = {New Distributed Probabilistic Language Models},
4083 number = {1215},
4084 year = {2002},
4085 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4086 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TR1215.ps},
4087 abstract = {Our previous work on statistical modeling introduced the use of probabilistic feedforward neural networks with shared parameters in order to help dealing with the curse of dimensionality. This work started with the motivation to speed up the above model and to take advantage of prior knowledge e.g., in WordNet or in syntactically labeled data sets, and to better deal with polysemy. With the objective of reaching these goals, we present here a series of new statistical language models, most of which are yet untested.},
4088 topics={Markov,Language,Unsupervised},cat={T},
4089 }
4090
4091 @TECHREPORT{TR1216,
4092 author = {Bengio, Yoshua and S{\'{e}}n{\'{e}}cal, Jean-S{\'{e}}bastien},
4093 title = {Quick Training of Probabilistic Neural Nets by Importance Sampling},
4094 number = {1216},
4095 year = {2002},
4096 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4097 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/tr1216.ps},
4098 abstract = {Our previous work on statistical language modeling introduced the use of probabilistic feedforward neural networks to help dealing with the curse of dimensionality. Training this model by maximum likelihood however requires for each example to perform as many network passes as there are words in the vocabulary. Inspired by the contrastive divergence model, we propose and evaluate sampling-based methods which require network passes only for the observed “positive example” and a few sampled negative example words. A very significant speed-up is obtained with an adaptive importance sampling.},
4099 topics={Markov,Language,Unsupervised},cat={T},
4100 }
4101
4102 @TECHREPORT{TR1231,
4103 author = {Bengio, Yoshua and Kermorvant, Christopher},
4104 title = {Extracting Hidden Sense Probabilities from Bitexts},
4105 number = {1231},
4106 year = {2003},
4107 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4108 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TR1231.pdf},
4109 abstract = {We propose a probabilistic model that is inspired by Diab & Resnik’s algorithm to extract disambiguation information from aligned bilingual texts. Like Diab & Resnik’s, the proposed model uses WordNet and the fact that word ambiguities are not always the same in the two languages. The generative model introduces a dependency between two translated words through a common ancestor in WordNet’s ontology. Unlike Diab & Resnik’s algorithm it does not suppose that the translation in the source language has a single meaning.},
4110 topics={Language},cat={T},
4111 }
4112
4113 @TECHREPORT{TR1232,
4114 author = {Bengio, Yoshua and Vincent, Pascal and Paiement, Jean-Fran{\c c}ois},
4115 title = {Learning Eigenfunctions of Similarity: Linking Spectral Clustering and Kernel {PCA}},
4116 number = {1232},
4117 year = {2003},
4118 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4119 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TR1232.pdf},
4120 abstract = {In this paper, we show a direct equivalence between spectral clustering and kernel {PCA}, and how both are special cases of a more general learning problem, that of learning the principal eigenfunctions of a kernel, when the functions are from a Hilbert space whose inner product is defined with respect to a density model. This suggests a new approach to unsupervised learning in which abstractions (such as manifolds and clusters) that represent the main features of the data density are extracted. Abstractions discovered at one level can be used to build higher-level abstractions. This paper also discusses how these abstractions can be used to recover a quantitative model of the input density, which is at least useful for evaluative and comparative purposes.},
4121 topics={HighDimensional,Kernel,Unsupervised},cat={T},
4122 }
4123
4124 @TECHREPORT{TR1234,
4125 author = {Bengio, Yoshua and Grandvalet, Yves},
4126 title = {No Unbiased Estimator of the Variance of K-Fold Cross-Validation},
4127 number = {1234},
4128 year = {2003},
4129 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4130 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TR1234.pdf},
4131 abstract = {Most machine learning researchers perform quantitative experiments to estimate generalization error and compare the performance of different algorithms (in particular, their proposed algorithm). In order to be able to draw statistically convincing conclusions, it is important for them to also estimate the uncertainty around the error (or error difference) estimate. This paper studies the very commonly used K-fold cross-validation estimator of generalization performance. The main theorem shows that there exists no universal (valid under all distributions) unbiased estimator of the variance of K-fold cross-validation. The analysis that accompanies this result is based on the eigen-decomposition of the covariance matrix of errors, which has only three different eigenvalues corresponding to three degrees of freedom of the matrix and three components of the total variance. This analysis helps to better understand the nature of the problem and how it can make na{\"{\i}}ve estimators (that don’t take into account the error correlations due to the overlap between training and test sets) grossly underestimate variance. This is confirmed by numerical experiments in which the three components of the variance are compared when the difficulty of the learning problem and the number of folds are varied.},
4132 topics={Comparative},cat={T},
4133 }
4134
4135 @TECHREPORT{tr1238,
4136 author = {Bengio, Yoshua and Paiement, Jean-Fran{\c c}ois and Vincent, Pascal},
4137 title = {Out-of-Sample Extensions for {LLE}, {I}somap, {MDS}, {E}igenmaps, and Spectral Clustering},
4138 number = {1238},
4139 year = {2003},
4140 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4141 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/tr1238.pdf},
4142 abstract = {Several unsupervised learning algorithms based on an eigendecomposition provide either an embedding or a clustering only for given training points, with no straightforward extension for out-of-sample examples short of recomputing eigenvectors. This paper provides algorithms for such an extension for Local Linear Embedding ({LLE}), Isomap, Laplacian Eigenmaps, Multi-Dimensional Scaling (all algorithms which provide lower-dimensional embedding for dimensionality reduction) as well as for Spectral Clustering (which performs non-Gaussian clustering). These extensions stem from a unified framework in which these algorithms are seen as learning eigenfunctions of a kernel. {LLE} and Isomap pose special challenges as the kernel is training-data dependent. Numerical experiments on real data show that the generalizations performed have a level of error comparable to the variability of the embedding algorithms to the choice of training data.},
4143 topics={HighDimensional,Kernel,Unsupervised},cat={T},
4144 }
4145
4146 @TECHREPORT{tr1239,
4147 author = {Bengio, Yoshua and Vincent, Pascal and Paiement, Jean-Fran{\c c}ois and Delalleau, Olivier and Ouimet, Marie and Le Roux, Nicolas},
4148 title = {Spectral Clustering and Kernel {PCA} are Learning Eigenfunctions},
4149 number = {1239},
4150 year = {2003},
4151 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4152 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/tr1239.pdf},
4153 abstract = {In this paper, we show a direct equivalence between spectral clustering and kernel {PCA}, and how both are special cases of a more general learning problem, that of learning the principal eigenfunctions of a kernel, when the functions are from a function space whose scalar product is defined with respect to a density model. This defines a natural mapping for new data points, for methods that only provided an embedding, such as spectral clustering and Laplacian eigenmaps. The analysis hinges on a notion of generalization for embedding algorithms based on the estimation of underlying eigenfunctions, and suggests ways to improve this generalization by smoothing the data empirical distribution.},
4154 topics={HighDimensional,Kernel,Unsupervised},cat={T},
4155 }
4156
4157 @TECHREPORT{tr1240,
4158 author = {Vincent, Pascal and Bengio, Yoshua},
4159 title = {Locally Weighted Full Covariance Gaussian Density Estimation},
4160 number = {1240},
4161 year = {2003},
4162 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4163 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/tr1240.pdf},
4164 abstract = {We describe an interesting application of the principle of local learning to density estimation. Locally weighted fitting of a Gaussian with a regularized full covariance matrix yields a density estimator which displays improved behavior in the case where much of the probability mass is concentrated along a low dimensional manifold. While the proposed estimator is not guaranteed to integrate to 1 with a finite sample size, we prove asymptotic convergence to the true density. Experimental results illustrating the advantages of this estimator over classic non-parametric estimators are presented.},
4165 topics={HighDimensional,Kernel,Unsupervised},cat={T},
4166 }
4167
4168 @TECHREPORT{tr1247,
4169 author = {Bengio, Yoshua and Delalleau, Olivier and Le Roux, Nicolas},
4170 title = {Efficient Non-Parametric Function Induction in Semi-Supervised Learning},
4171 number = {1247},
4172 year = {2004},
4173 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4174 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/tr1247.pdf},
4175 abstract = {There has been an increase of interest for semi-supervised learning recently, because of the many datasets with large amounts of unlabeled examples and only a few labeled ones. This paper follows up on proposed non-parametric algorithms which provide an estimated continuous label for the given unlabeled examples. It extends them to function induction algorithms that correspond to the minimization of a regularization criterion applied to an out-of-sample example, and happens to have the form of a Parzen windows regressor. The advantage of the extension is that it allows predicting the label for a new example without having to solve again a linear system of dimension n (the number of unlabeled and labeled training examples), which can cost O(n^3). Experiments show that the extension works well, in the sense of predicting a label close to the one that would have been obtained if the test example had been included in the unlabeled set. This relatively efficient function induction procedure can also be used when n is large to approximate the solution by writing it only in terms of a kernel expansion with m << n terms, and reducing the linear system to m equations in m unknowns.},
4176 topics={Kernel,Unsupervised},cat={T},
4177 }
4178
4179 @TECHREPORT{tr1250,
4180 author = {Bengio, Yoshua and Monperrus, Martin},
4181 title = {Discovering shared structure in manifold learning},
4182 number = {1250},
4183 year = {2004},
4184 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4185 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/tr-tangent.pdf},
4186 abstract = {We claim and present arguments to the effect that a large class of manifold learning algorithms that are essentially local will suffer from at least four generic problems associated with (1) noise in the data, (2) curvature of the manifold, (3) dimensionality of the manifold, and (4) the presence of many manifolds with little data per manifold. This analysis suggests non-local manifold learning algorithms which attempt to discover shared structure in the tangent planes at different positions. A criterion for such an algorithm is proposed and experiments estimating a tangent plane prediction function are presented. The function has parameters that are shared across space rather than estimated based on the local neighborhood, as in current non-parametric manifold learning algorithms. The results show clearly the advantages of this approach with respect to local manifold learning algorithms.},
4187 topics={HighDimensional,Kernel,Unsupervised},cat={T},
4188 }
4189
4190 @TECHREPORT{tr1252,
4191 author = {Bengio, Yoshua and Larochelle, Hugo},
4192 title = {Implantation et analyse d'un mod{\`{e}}le graphique {\`{a}} entra{\^{\i}}nement supervis{\'{e}}, semi-supervis{\'{e}} et non-supervis{\'{e}} pour la d{\'{e}}sambigu{\"{\i}}sation s{\'{e}}mantique},
4193 number = {1252},
4194 year = {2004},
4195 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4196 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/tr1252.pdf},
4197 abstract = {La d{\'{e}}sambigu{\"{\i}}sation s{\'{e}}mantique est un sujet qui suscite beaucoup d’int{\'{e}}r{\^{e}}t dans la communaut{\'{e}} scientifique en apprentissage automatique. Quoique cette t{\^{a}}che ait {\'{e}}t{\'{e}} abord{\'{e}}e depuis les d{\'{e}}buts du traitement automatique de la langue, peu de progr{\`{e}}s ont {\'{e}}t{\'{e}} accomplis jusqu’{\`{a}} maintenant. Nous pr{\'{e}}sentons ici une application de d{\'{e}}sambigu{\"{\i}}sation bas{\'{e}}e sur un mod{\`{e}}le graphique probabiliste. Ce mod{\`{e}}le a {\'{e}}t{\'{e}} appris sur des donn{\'{e}}es {\'{e}}tiquet{\'{e}}es, non-{\'{e}}tiquet{\'{e}}es, et sur la hi{\'{e}}rarchie WordNet. Avec peu d’exemples d’apprentissage, ses performances sont comparables {\`{a}} celles de l’algorithme de Bayes na{\"{\i}}f. Il pourrait {\'{e}}ventuellement {\^{e}}tre adapt{\'{e}} {\`{a}} des corpus bi-textes.},
4198 topics={Unsupervised,Language},cat={T},
4199 }
4200
4201 @TECHREPORT{tr1281,
4202 author = {Le Roux, Nicolas and Bengio, Yoshua},
4203 title = {Continuous Neural Networks},
4204 number = {1281},
4205 year = {2006},
4206 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4207 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/continuous_nnet_tr1281.pdf},
4208 abstract = {This article extends neural networks to the case of an uncountable number of hidden units, in several
4209 ways. In the first approach proposed, a finite parametrization is possible, allowing gradient-based
4210 learning. While having the same number of parameters as an ordinary neural network, its internal
4211 structure suggests that it can represent some smooth functions much more compactly. Under mild
4212 assumptions, we also find better error bounds than with ordinary neural networks. Furthermore, this
4213 parametrization may help reducing the problem of saturation of the neurons. In a second approach, the
4214 input-to-hidden weights are fully non-parametric, yielding a kernel machine for which we demonstrate
4215 a simple kernel formula. Interestingly, the resulting kernel machine can be made hyperparameter-free
4216 and still generalizes in spite of an absence of explicit regularization.},
4217 cat={T},topics={Kernel,HighDimensional},
4218 }
4219
4220 @TECHREPORT{tr1282,
4221 author = {Bengio, Yoshua and Lamblin, Pascal and Popovici, Dan and Larochelle, Hugo},
4222 title = {Greedy Layer-Wise Training of Deep Networks},
4223 number = {1282},
4224 year = {2006},
4225 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4226 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/dbn_supervised_tr1282.pdf},
4227 abstract = {Deep multi-layer neural networks have many levels of non-linearities, which allows them to potentially
4228 represent very compactly highly non-linear and highly-varying functions. However, until recently it
4229 was not clear how to train such deep networks, since gradient-based optimization starting from random
4230 initialization appears to often get stuck in poor solutions. Hinton et al. recently introduced a greedy
4231 layer-wise unsupervised learning algorithm for Deep Belief Networks (DBN), a generative model with
4232 many layers of hidden causal variables. In the context of the above optimization problem, we study
4233 this algorithm empirically and explore variants to better understand its success and extend it to cases
4234 where the inputs are continuous or where the structure of the input distribution is not revealing enough
4235 about the variable to be predicted in a supervised task.},
4236 cat={T},topics={HighDimensional,Unsupervised},
4237 }
4238
4239 @TECHREPORT{tr1283,
4240 author = {Carreau, Julie and Bengio, Yoshua},
4241 title = {A Hybrid {Pareto} Model for Asymmetric Fat-Tail Data},
4242 number = {1283},
4243 year = {2006},
4244 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4245 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/fat_tails_tr1283.pdf},
4246 abstract = {We propose an estimator for the conditional density p(Y|X) that can adapt for asymmetric heavy tails
4247 which might depend on X. Such estimators have important applications in finance and insurance. We
4248 draw from Extreme Value Theory the tools to build a hybrid unimodal density having a parameter
4249 controlling the heaviness of the upper tail. This hybrid is a Gaussian whose upper tail has been
4250 replaced by a generalized {Pareto} tail. We use this hybrid in a multi-modal mixture in order to obtain
4251 a nonparametric density estimator that can easily adapt for heavy tailed data. To obtain a conditional
4252 density estimator, the parameters of the mixture estimator can be seen as functions of X and these
4253 functions learned. We show experimentally that this approach better models the conditional density in
4254 terms of likelihood than compared competing algorithms: conditional mixture models with other types
4255 of components and multivariate nonparametric models.},
4256 cat={T},topics={Unsupervised,Mining},
4257 }
4258
4259 @TECHREPORT{tr1284,
4260 author = {Larochelle, Hugo and Bengio, Yoshua},
4261 title = {Distributed Representation Prediction for Generalization to New Words},
4262 number = {1284},
4263 year = {2006},
4264 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4265 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/dist_rep_pred_tr1284.pdf},
4266 abstract = {Learning distributed representations of symbols (e.g. words) has been used in several Natural Language Processing
4267 systems. Such representations can capture semantic or syntactic similarities between words, which permit to fight
4268 the curse of dimensionality when considering sequences of such words. Unfortunately, because these representations
4269 are learned only for a previously determined vocabulary of words, it is not clear how to obtain representations
4270 for new words. We present here an approach which gets around this problem by considering the distributed representations
4271 as predictions from low-level or domain-knowledge features of words. We report experiments on a Part
4272 Of Speech tagging task, which demonstrates the success of this approach in learning meaningful representations and
4273 in providing improved accuracy, especially for new words.},
4274 cat={T},topics={HighDimensional,Language},
4275 }
4276
4277 @TECHREPORT{tr1285,
4278 author = {Grandvalet, Yves and Bengio, Yoshua},
4279 title = {Hypothesis Testing for Cross-Validation},
4280 number = {1285},
4281 year = {2006},
4282 institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4283 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/xv_rho_stat_tr1285.pdf},
4284 abstract = {K-fold cross-validation produces variable estimates, whose variance cannot be estimated unbiasedly. However, in practice, one would like to
4285 provide a figure related to the variability of this estimate. The first part
4286 of this paper lists a series of restrictive assumptions (on the distribution of
4287 cross-validation residuals) that allow to derive unbiased estimates. We exhibit three such estimates, corresponding to differing assumptions. Their
4288 similar expressions however entail almost identical empirical behaviors.
4289 Then, we look for a conservative test for detecting significant differences
4290 in performances between two algorithms. Our proposal is based on the
4291 derivation of the form of a t-statistic parametrized by the correlation of
4292 residuals between each validation set. Its calibration is compared to the
4293 usual t-test. While the latter is overconfident in concluding that differences are indeed significant, our test is bound to be more skeptical, with
4294 smaller type-I error.},
4295 cat={T},topics={ModelSelection,Comparative},
4296 }
4297
4298 @TECHREPORT{tr1286,
4299 author = {Erhan, Dumitru and Bengio, Yoshua and {L'Heureux}, Pierre-Jean and Yue, Shi Yi},
4300 title = {Generalizing to a Zero-Data Task: a Computational Chemistry Case Study},
4301 number = {1286},
4302 year = {2006},
4303 institution = {D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4304 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/mt_qsar_tr1286.pdf},
4305 abstract = {We investigate the problem of learning several tasks simultaneously in order to transfer the acquired
4306 knowledge to a completely new task for which no training data are available. Assuming that the tasks
4307 share some representation that we can discover efficiently, such a scenario should lead to a better model of
4308 the new task, as compared to the model that is learned by only using the knowledge of the new task. We
4309 have evaluated several supervised learning algorithms in order to discover shared representations among
4310 the tasks defined in a computational chemistry/drug discovery problem. We have cast the problem from
4311 a statistical learning point of view and set up the general hypotheses that have to be tested in order
4312 to validate the multi-task learning approach. We have then evaluated the performance of the learning
4313 algorithms and showed that it is indeed possible to learn a shared representation of the tasks that allows
4314 to generalize to a new task for which no training data are available. From a theoretical point of view,
4315 our contribution also comprises a modification to the Support Vector Machine algorithm, which can
4316 produce state-of-the-art results using multi-task learning concepts at its core. From a practical point
4317 of view, our contribution is that this algorithm can be readily used by pharmaceutical companies for
4318 virtual screening campaigns.},
4319 cat={T},topics={MultiTask,Kernel,Bioinformatic},
4320 }
4321
4322 @INPROCEEDINGS{Turian+al-2009,
4323 author = {Turian, Joseph and Bergstra, James and Bengio, Yoshua},
4324 title = {Quadratic Features and Deep Architectures for Chunking},
4325 booktitle = {North American Chapter of the Association for Computational Linguistics -- Human Language Technologies (NAACL HLT)},
4326 year = {2009},
4327 abstract = {We experiment with several chunking models. Deeper architectures achieve better generalization. Quadratic filters, a simplification of theoretical model of V1 complex cells, reliably increase accuracy. In fact, logistic regression with quadratic filters outperforms a standard single hidden layer neural network. Adding quadratic filters to logistic regression is almost as effective as feature engineering. Despite predicting each output label independently, our model is competitive with ones that use previous decisions.}
4328 }
4329
4330 @INPROCEEDINGS{Turian+al-2010,
4331 author = {Turian, Joseph and Ratinov, Lev and Bengio, Yoshua and Roth, Dan},
4332 title = {A preliminary evaluation of word representations for named-entity recognition},
4333 booktitle = {NIPS Workshop on Grammar Induction, Representation of Language and Language Learning},
4334 year = {2009},
4335 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/wordrepresentations-ner.pdf},
4336 abstract = {We use different word representations as word features for a named-entity recognition (NER) system with a linear model. This work is part of a larger empirical survey, evaluating different word representations on different NLP tasks. We evaluate Brown clusters, Collobert and Weston (2008) embeddings, and HLBL (Mnih \& Hinton, 2009) embeddings of words. All three representations improve accuracy on NER, with the Brown clusters providing a larger improvement than the two embeddings, and the HLBL embeddings more than the Collobert and Weston (2008) embeddings. We also discuss some of the practical issues in using embeddings as features. Brown clusters are simpler than embeddings because they require less hyperparameter tuning.}
4337 }
4338
4339 @INPROCEEDINGS{Turian+Ratinov+Bengio-2010,
4340 author = {Turian, Joseph and Ratinov, Lev and Bengio, Yoshua},
4341 title = {Word representations: A simple and general method for semi-supervised learning},
4342 booktitle = {Proceedings of the 48th Annual Meeting of the Association for Computational Linguistics (ACL 2010)},
4343 year = {2010}
4344 }
4345
4346 @INPROCEEDINGS{Vincent-Bengio-2003,
4347 author = {Vincent, Pascal and Bengio, Yoshua},
4348 title = {Manifold {Parzen} Windows},
4349 year = {2003},
4350 pages = {825--832},
4351 crossref = {NIPS15-shorter},
4352 abstract = {The similarity between objects is a fundamental element of many learning algorithms. Most non-parametric methods take this similarity to be fixed, but much recent work has shown the advantages of learning it, in particular to exploit the local invariances in the data or to capture the possibly non-linear manifold on which most of the data lies. We propose a new non-parametric kernel density estimation method which captures the local structure of an underlying manifold through the leading eigenvectors of regularized local covariance matrices. Experiments in density estimation show significant improvements with respect to Parzen density estimators. The density estimators can also be used within Bayes classifiers, yielding classification rates similar to {SVM}s and much superior to the Parzen classifier.},
4353 topics={HighDimensional,Kernel,Unsupervised},cat={C},
4354 }
4355
4356 @TECHREPORT{Vincent-TR1316,
4357 author = {Vincent, Pascal and Larochelle, Hugo and Bengio, Yoshua and Manzagol, Pierre-Antoine},
4358 title = {Extracting and Composing Robust Features with Denoising Autoencoders},
4359 number = {1316},
4360 year = {2008},
4361 institution = {D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4362 url = {http://www.iro.umontreal.ca/~vincentp/Publications/denoising_autoencoders_tr1316.pdf},
4363 abstract = {Previous work has shown that the difficulties in learning deep generative or discriminative models can be overcome by an initial unsupervised learning step that maps inputs to useful intermediate representations. We introduce and motivate a new training principle for unsupervised learning of a representation based on the idea of making the learned representations robust to partial corruption of the input pattern. This approach can be used to train autoencoders, and these denoising autoencoders can be stacked to initialize deep architectures. The algorithm can be motivated from a manifold learning and information theoretic perspective or from a generative model perspective. Comparative experiments clearly show the surprising advantage of corrupting the input of autoencoders on a pattern classification benchmark suite.}
4364 }
4365
4366 @PHDTHESIS{Vincent2003,
4367 author = {Vincent, Pascal},
4368 title = {Mod{\`{e}}les {\`{a}} Noyaux {\`{a}} Structure Locale},
4369 year = {2003},
4370 school = {D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4371 }
4372
4373 @ARTICLE{vincent:2001,
4374 author = {Vincent, Pascal and Bengio, Yoshua},
4375 title = {Kernel Matching Pursuit},
4376 journal = {Machine Learning},
4377 year = {2001},
4378 abstract = {We show how Matching Pursuit can be used to build kernel-based solutions to machine-learning problems while keeping control of the sparsity of the solution, and how it can be extended to use non-squared error loss functions. We also derive MDL motivated generalization bounds for this type of algorithm. Finally, links to boosting algorithms and {RBF} training procedures, as well as extensive experimental comparison with {SVM}s are given, showing comparable results with typically sparser models.},
4379 topics={HighDimensional,Kernel},cat={J},
4380 }
4381
4382 @INPROCEEDINGS{VincentPLarochelleH2008,
4383 author = {Vincent, Pascal and Larochelle, Hugo and Bengio, Yoshua and Manzagol, Pierre-Antoine},
4384 title = {Extracting and Composing Robust Features with Denoising Autoencoders},
4385 year = {2008},
4386 pages = {1096--1103},
4387 crossref = {ICML08-shorter},
4388 abstract = {Previous work has shown that the difficulties in learning deep generative or discriminative models can be overcome by an initial unsupervised learning step that maps inputs to useful intermediate representations.
4389 We introduce and motivate a new training principle for unsupervised learning of a representation based on the idea of making the learned representations robust to partial corruption of the input pattern.
4390 This approach can be used to train autoencoders, and these denoising autoencoders can be stacked to initialize deep architectures.
4391 The algorithm can be motivated from a manifold learning and information theoretic perspective or from a generative model perspective.
4392 Comparative experiments clearly show the surprising advantage of corrupting the input of autoencoders on a pattern classification benchmark suite.}
4393 }
4394
4395 @TECHREPORT{visualization_techreport,
4396 author = {Erhan, Dumitru and Bengio, Yoshua and Courville, Aaron and Vincent, Pascal},
4397 title = {Visualizing Higher-Layer Features of a Deep Network},
4398 number = {1341},
4399 year = {2009},
4400 institution = {D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4401 abstract = {Deep architectures have demonstrated state-of-the-art results in a variety of
4402 settings, especially with vision datasets. Beyond the model definitions and the quantitative analyses, there is a need for qualitative comparisons of the solutions learned by various deep architectures. The goal of this paper is to find good qualitative interpretations of high level features represented by such models. To this end, we contrast and compare several techniques applied on Stacked Denoising Autoencoders and Deep Belief Networks, trained on several vision datasets. We show that, perhaps counter-intuitively, such interpretation is possible at the unit level, that it is simple to accomplish and that the results are consistent across various techniques. We hope that such techniques will allow researchers in deep architectures to understand more of how and why deep architectures work.}
4403 }
4404
4405 @INPROCEEDINGS{xAISTATS2009-short,
4406 title = {Proc.\ AISTATS'2009},
4407 booktitle = {Proc.\ AISTATS'2009},
4408 year = {2009}
4409 }
4410
4411
4412 @misc{Yoshua+al-snowbird-2008,
4413 title = {Deep Woods},
4414 author = {Bengio, Yoshua and Larochelle, Hugo and Turian, Joseph},
4415 howpublished = {Poster presented at the Learning@Snowbird Workshop, Snowbird, USA, 2008},
4416 year = {2008}
4417 }
4418
4419 @ARTICLE{Zaccaro-et-al-2005,
4420 author = {Zaccaro, Maria Clara and Lee, Hong Boon and Pattarawarapan, Mookda and Xia, Zebin and Caron, Antoine and {L'Heureux}, Pierre-Jean and Bengio, Yoshua and Burgess, Kevin and Saragovi, H. Uri},
4421 title = {Selective Small Molecule Peptidomimetic Ligands of {TrkC} and {TrkA} Receptors Afford Discrete or Complete Neurotrophic Activities},
4422 journal = {Chemistry \& Biology},
4423 volume = {12},
4424 number = {9},
4425 year = {2005},
4426 pages = {1015--1028}
4427 }
4428
4429
4430
4431 %Cross-referenced publications:
4432 @INPROCEEDINGS{ICML09,
4433 editor = {Bottou, {L{\'{e}}on} and Littman, Michael},
4434 title = {Proceedings of the Twenty-sixth International Conference on Machine Learning (ICML'09)},
4435 booktitle = {Proceedings of the Twenty-sixth International Conference on Machine Learning (ICML'09)},
4436 year = {2009},
4437 publisher = {ACM}
4438 }
4439
4440 @INPROCEEDINGS{NIPS7,
4441 editor = {Tesauro, G. and Touretzky, D. S. and Leen, T. K.},
4442 title = {Advances in Neural Information Processing Systems 7 (NIPS'94)},
4443 booktitle = {Advances in Neural Information Processing Systems 7 (NIPS'94)},
4444 year = {1995},
4445 publisher = {MIT Press}
4446 }
4447
4448 @INPROCEEDINGS{NIPS6,
4449 editor = {Cowan, J. D. and Tesauro, G. and Alspector, J.},
4450 title = {Advances in Neural Information Processing Systems 6 (NIPS'93)},
4451 booktitle = {Advances in Neural Information Processing Systems 6 (NIPS'93)},
4452 year = {1994},
4453 publisher = {MIT Press}
4454 }
4455
4456 @INPROCEEDINGS{NIPS8,
4457 editor = {Touretzky, D. S. and Mozer, M. and Hasselmo, M. E.},
4458 title = {Advances in Neural Information Processing Systems 8 (NIPS'95)},
4459 booktitle = {Advances in Neural Information Processing Systems 8 (NIPS'95)},
4460 year = {1996},
4461 publisher = {MIT Press}
4462 }
4463
4464 @INPROCEEDINGS{NIPS19,
4465 editor = {{Sch{\"{o}}lkopf}, Bernhard and Platt, John and Hofmann, Thomas},
4466 title = {Advances in Neural Information Processing Systems 19 (NIPS'06)},
4467 booktitle = {Advances in Neural Information Processing Systems 19 (NIPS'06)},
4468 year = {2007},
4469 publisher = {MIT Press}
4470 }
4471
4472 @INPROCEEDINGS{NIPS10,
4473 editor = {Jordan, M. I. and Kearns, M. J. and Solla, S. A.},
4474 title = {Advances in Neural Information Processing Systems 10 (NIPS'97)},
4475 booktitle = {Advances in Neural Information Processing Systems 10 (NIPS'97)},
4476 year = {1998},
4477 publisher = {MIT Press}
4478 }
4479
4480 @INPROCEEDINGS{NIPS1,
4481 editor = {Touretzky, D. S.},
4482 title = {Advances in Neural Information Processing Systems 1 (NIPS'88)},
4483 booktitle = {Advances in Neural Information Processing Systems 1 (NIPS'88)},
4484 year = {1989},
4485 publisher = {Morgan Kaufmann}
4486 }
4487
4488 @INPROCEEDINGS{NIPS2,
4489 editor = {Touretzky, D. S.},
4490 title = {Advances in Neural Information Processing Systems 2 (NIPS'89)},
4491 booktitle = {Advances in Neural Information Processing Systems 2 (NIPS'89)},
4492 year = {1990},
4493 publisher = {Morgan Kaufmann}
4494 }
4495
4496 @INPROCEEDINGS{NIPS4,
4497 editor = {Moody, J. E. and Hanson, S. J. and Lippmann, R. P.},
4498 title = {Advances in Neural Information Processing Systems 4 (NIPS'91)},
4499 booktitle = {Advances in Neural Information Processing Systems 4 (NIPS'91)},
4500 year = {1992},
4501 publisher = {Morgan Kaufmann}
4502 }
4503
4504 @INPROCEEDINGS{NIPS12,
4505 editor = {Solla, S. A. and Leen, T. K.},
4506 title = {Advances in Neural Information Processing Systems 12 (NIPS'99)},
4507 booktitle = {Advances in Neural Information Processing Systems 12 (NIPS'99)},
4508 year = {2000},
4509 publisher = {MIT Press}
4510 }
4511
4512 @INPROCEEDINGS{NIPS16,
4513 editor = {Becker, S. and Saul, L. and {Sch{\"{o}}lkopf}, Bernhard},
4514 title = {Advances in Neural Information Processing Systems 16 (NIPS'03)},
4515 booktitle = {Advances in Neural Information Processing Systems 16 (NIPS'03)},
4516 year = {2004}
4517 }
4518
4519 @INPROCEEDINGS{NIPS22,
4520 editor = {Bengio, Yoshua and Schuurmans, Dale and Williams, Christopher and Lafferty, John and Culotta, Aron},
4521 title = {Advances in Neural Information Processing Systems 22 (NIPS'09)},
4522 booktitle = {Advances in Neural Information Processing Systems 22 (NIPS'09)},
4523 year = {2009}
4524 }
4525
4526 @INPROCEEDINGS{NIPS20,
4527 editor = {Platt, John and Koller, D. and Singer, Yoram and Roweis, S.},
4528 title = {Advances in Neural Information Processing Systems 20 (NIPS'07)},
4529 booktitle = {Advances in Neural Information Processing Systems 20 (NIPS'07)},
4530 year = {2008},
4531 publisher = {MIT Press}
4532 }
4533
4534 @inproceedings{xAISTATS2009,
4535 booktitle = {Proceedings of the Twelfth International Conference on Artificial Intelligence and Statistics (AISTATS 2009)},
4536 title = {Proceedings of the Twelfth International Conference on Artificial Intelligence and Statistics (AISTATS 2009)},
4537 year = {2009}
4538 }
4539
4540 @INPROCEEDINGS{NIPS9,
4541 editor = {Mozer, M. and Jordan, M. I. and Petsche, T.},
4542 title = {Advances in Neural Information Processing Systems 9 (NIPS'96)},
4543 booktitle = {Advances in Neural Information Processing Systems 9 (NIPS'96)},
4544 year = {1997},
4545 publisher = {MIT Press}
4546 }
4547
4548 @INPROCEEDINGS{NIPS17,
4549 editor = {Saul, Lawrence K. and Weiss, Yair and Bottou, {L{\'{e}}on}},
4550 title = {Advances in Neural Information Processing Systems 17 (NIPS'04)},
4551 booktitle = {Advances in Neural Information Processing Systems 17 (NIPS'04)},
4552 year = {2005}
4553 }
4554
4555 @INPROCEEDINGS{ICML08,
4556 editor = {Cohen, William W. and McCallum, Andrew and Roweis, Sam T.},
4557 title = {Proceedings of the Twenty-fifth International Conference on Machine Learning (ICML'08)},
4558 booktitle = {Proceedings of the Twenty-fifth International Conference on Machine Learning (ICML'08)},
4559 year = {2008},
4560 publisher = {ACM}
4561 }
4562
4563 @INPROCEEDINGS{ICML07,
4564 editor = {Ghahramani, Zoubin},
4565 title = {Proceedings of the 24th International Conference on Machine Learning (ICML'07)},
4566 booktitle = {Proceedings of the 24th International Conference on Machine Learning (ICML'07)},
4567 year = {2007},
4568 publisher = {ACM}
4569 }
4570
4571 @TECHREPORT{DIRO,
4572 title = {DIRO},
4574 institution = {D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
4575 }
4576
4577 @INPROCEEDINGS{NIPS18,
4578 editor = {Weiss, Yair and {Sch{\"{o}}lkopf}, Bernhard and Platt, John},
4579 title = {Advances in Neural Information Processing Systems 18 (NIPS'05)},
4580 booktitle = {Advances in Neural Information Processing Systems 18 (NIPS'05)},
4581 year = {2006},
4582 publisher = {MIT Press}
4583 }
4584
4585 @INPROCEEDINGS{NIPS13,
4586 editor = {Leen, T. K. and Dietterich, T. G.},
4587 title = {Advances in Neural Information Processing Systems 13 (NIPS'00)},
4588 booktitle = {Advances in Neural Information Processing Systems 13 (NIPS'00)},
4589 year = {2001},
4590 publisher = {MIT Press}
4591 }
4592
4593 @INPROCEEDINGS{ICML05,
4594 editor = {De Raedt, Luc and Wrobel, Stefan},
4595 title = {Proceedings of the Twenty-second International Conference on Machine Learning (ICML'05)},
4596 booktitle = {Proceedings of the Twenty-second International Conference on Machine Learning (ICML'05)},
4597 year = {2005},
4598 publisher = {ACM}
4599 }
4600
4601 @INPROCEEDINGS{ICML06,
4602 editor = {Cohen, William W. and Moore, Andrew},
4603 title = {Proceedings of the Twenty-third International Conference on Machine Learning (ICML'06)},
4604 booktitle = {Proceedings of the Twenty-third International Conference on Machine Learning (ICML'06)},
4605 year = {2006},
4606 publisher = {ACM}
4607 }
4608
4609 @INPROCEEDINGS{NIPS15,
4610 editor = {Becker, S. and Thrun, Sebastian},
4611 title = {Advances in Neural Information Processing Systems 15 (NIPS'02)},
4612 booktitle = {Advances in Neural Information Processing Systems 15 (NIPS'02)},
4613 year = {2003},
4614 publisher = {MIT Press}
4615 }
4616
4617 @INPROCEEDINGS{ICML01-shorter,
4618 title = {ICML'01},
4619 booktitle = {ICML'01},
4620 year = {2001},
4621 publisher = {Morgan Kaufmann}
4622 }
4623 @INPROCEEDINGS{ICML02-shorter,
4624 title = {ICML'02},
4625 booktitle = {ICML'02},
4626 year = {2002},
4627 publisher = {Morgan Kaufmann}
4628 }
4629 @INPROCEEDINGS{ICML03-shorter,
4630 title = {ICML'03},
4631 booktitle = {ICML'03},
4632 year = {2003},
4633 publisher = {AAAI Press}
4634 }
4635 @INPROCEEDINGS{ICML04-shorter,
4636 title = {ICML'04},
4637 booktitle = {ICML'04},
4638 year = {2004},
4639 publisher = {ACM}
4640 }
4641 @INPROCEEDINGS{ICML05-shorter,
4642 title = {ICML'05},
4643 booktitle = {ICML'05},
4644 year = {2005},
4645 publisher = {ACM}
4646 }
4647 @INPROCEEDINGS{ICML06-shorter,
4648 title = {ICML'06},
4649 booktitle = {ICML'06},
4650 year = {2006},
4651 publisher = {ACM}
4652 }
4653 @INPROCEEDINGS{ICML07-shorter,
4654 title = {ICML'07},
4655 booktitle = {ICML'07},
4656 year = {2007},
4657 publisher = {ACM}
4658 }
4659 @INPROCEEDINGS{ICML08-shorter,
4660 title = {ICML'08},
4661 booktitle = {ICML'08},
4662 year = {2008},
4663 publisher = {ACM}
4664 }
4665 @INPROCEEDINGS{ICML09-shorter,
4666 title = {ICML'09},
4667 booktitle = {ICML'09},
4668 year = {2009},
4669 publisher = {ACM}
4670 }
4671 @INPROCEEDINGS{ICML96-shorter,
4672 title = {ICML'96},
4673 booktitle = {ICML'96},
4674 year = {1996},
4675 publisher = {Morgan Kaufmann}
4676 }
4677 @INPROCEEDINGS{ICML97-shorter,
4678 title = {ICML'97},
4679 booktitle = {ICML'97},
4680 year = {1997},
4681 publisher = {Morgan Kaufmann}
4682 }
4683 @INPROCEEDINGS{ICML98-shorter,
4684 title = {ICML'98},
4685 booktitle = {ICML'98},
4686 year = {1998},
4687 publisher = {Morgan Kaufmann}
4688 }
4689 @INPROCEEDINGS{ICML99-shorter,
4690 title = {ICML'99},
4691 booktitle = {ICML'99},
4692 year = {1999},
4693 publisher = {Morgan Kaufmann}
4694 }
4695 @INPROCEEDINGS{NIPS1-shorter,
4696 title = {NIPS'88},
4697 booktitle = {NIPS 1},
4698 year = {1989},
4699 publisher = {Morgan Kaufmann}
4700 }
4701 @INPROCEEDINGS{NIPS10-shorter,
4702 title = {NIPS'97},
4703 booktitle = {NIPS 10},
4704 year = {1998},
4705 publisher = {MIT Press}
4706 }
4707 @INPROCEEDINGS{NIPS11-shorter,
4708 title = {NIPS'98},
4709 booktitle = {NIPS 11},
4710 year = {1999},
4711 publisher = {MIT Press}
4712 }
4713 @INPROCEEDINGS{NIPS12-shorter,
4714 title = {NIPS'99},
4715 booktitle = {NIPS 12},
4716 year = {2000},
4717 publisher = {MIT Press}
4718 }
4719 @INPROCEEDINGS{NIPS13-shorter,
4720 title = {NIPS'00},
4721 booktitle = {NIPS 13},
4722 year = {2001},
4723 publisher = {MIT Press}
4724 }
4725 @INPROCEEDINGS{NIPS14-shorter,
4726 title = {NIPS'01},
4727 booktitle = {NIPS 14},
4728 year = {2002},
4729 publisher = {MIT Press}
4730 }
4731 @INPROCEEDINGS{NIPS15-shorter,
4732 title = {NIPS'02},
4733 booktitle = {NIPS 15},
4734 year = {2003},
4735 publisher = {MIT Press}
4736 }
4737 @INPROCEEDINGS{NIPS16-shorter,
4738 title = {NIPS'03},
4739 booktitle = {NIPS 16},
4740 year = {2004}
4741 }
4742 @INPROCEEDINGS{NIPS17-shorter,
4743 title = {NIPS'04},
4744 booktitle = {NIPS 17},
4745 year = {2005}
4746 }
4747 @INPROCEEDINGS{NIPS18-shorter,
4748 title = {NIPS'05},
4749 booktitle = {NIPS 18},
4750 year = {2006},
4751 publisher = {MIT Press}
4752 }
4753 @INPROCEEDINGS{NIPS19-shorter,
4754 title = {NIPS'06},
4755 booktitle = {NIPS 19},
4756 year = {2007},
4757 publisher = {MIT Press}
4758 }
4759 @INPROCEEDINGS{NIPS2-shorter,
4760 title = {NIPS'89},
4761 booktitle = {NIPS 2},
4762 year = {1990},
4763 publisher = {Morgan Kaufmann}
4764 }
4765 @INPROCEEDINGS{NIPS20-shorter,
4766 title = {NIPS'07},
4767 booktitle = {NIPS 20},
4768 year = {2008},
4769 publisher = {MIT Press}
4770 }
4771 @INPROCEEDINGS{NIPS21-shorter,
4772 title = {NIPS'08},
4773 booktitle = {NIPS 21},
4774 year = {2009},
4775 publisher = {NIPS Foundation (http://books.nips.cc)}
4776 }
4777 @INPROCEEDINGS{NIPS22-shorter,
4778 title = {NIPS'09},
4779 booktitle = {NIPS 22},
4780 year = {2009}
4781 }
4782 @INPROCEEDINGS{NIPS3-shorter,
4783 title = {NIPS'90},
4784 booktitle = {NIPS 3},
4785 year = {1991},
4786 publisher = {Morgan Kaufmann}
4787 }
4788 @INPROCEEDINGS{NIPS4-shorter,
4789 title = {NIPS'91},
4790 booktitle = {NIPS 4},
4791 year = {1992},
4792 publisher = {Morgan Kaufmann}
4793 }
4794 @INPROCEEDINGS{NIPS5-shorter,
4795 title = {NIPS'92},
4796 booktitle = {NIPS 5},
4797 year = {1993},
4798 publisher = {Morgan Kaufmann}
4799 }
4800 @INPROCEEDINGS{NIPS6-shorter,
4801 title = {NIPS'93},
4802 booktitle = {NIPS 6},
4803 year = {1994},
4804 publisher = {MIT Press}
4805 }
4806 @INPROCEEDINGS{NIPS7-shorter,
4807 title = {NIPS'94},
4808 booktitle = {NIPS 7},
4809 year = {1995},
4810 publisher = {MIT Press}
4811 }
4812 @INPROCEEDINGS{NIPS8-shorter,
4813 title = {NIPS'95},
4814 booktitle = {NIPS 8},
4815 year = {1996},
4816 publisher = {MIT Press}
4817 }
4818 @INPROCEEDINGS{NIPS9-shorter,
4819 title = {NIPS'96},
4820 booktitle = {NIPS 9},
4821 year = {1997},
4822 publisher = {MIT Press}
4823 }
4824 @INPROCEEDINGS{xAISTATS2009-shorter,
4825 title = {AISTATS'2009},
4826 booktitle = {AISTATS'2009},
4827 year = {2009}
4828 }
4829
4830