view writeup/aigaion-shorter.bib @ 644:e63d23c7c9fb

reviews aistats finales
author Yoshua Bengio <bengioy@iro.umontreal.ca>
date Thu, 24 Mar 2011 17:05:05 -0400
parents 51213beaed8b
children
line wrap: on
line source

%Aigaion2 BibTeX export from LISA - Publications
%Tuesday 02 November 2010 04:10:50 PM
@MASTERSTHESIS{,
    author = {Breuleux, Olivier},
     title = {{\'{E}}chantillonnage dynamique de champs markoviens},
      year = {2010},
    school = {Universit{\'{e}} de Montr{\'{e}}al}
}

@PHDTHESIS{,
    author = {Rivest, Fran{\c c}ois},
     title = {Mod{\`{e}}le informatique du coapprentissage des ganglions de la base et du cortex : L’apprentissage par renforcement et le d{\'{e}}veloppement de repr{\'{e}}sentations},
      year = {2009},
    school = {Universit{\'{e}} de Montr{\'{e}}al, D{\'{e}}partement d’informatique et de recherche op{\'{e}}rationnelle},
  abstract = {English follow:

Tout au long de la vie, le cerveau d{\'{e}}veloppe des repr{\'{e}}sentations de son
environnement permettant {\`{a}} l’individu d’en tirer meilleur profit. Comment ces
repr{\'{e}}sentations se d{\'{e}}veloppent-elles pendant la qu{\^{e}}te de r{\'{e}}compenses demeure un
myst{\`{e}}re. Il est raisonnable de penser que le cortex est le si{\`{e}}ge de ces repr{\'{e}}sentations
et que les ganglions de la base jouent un r{\^{o}}le important dans la maximisation des
r{\'{e}}compenses. En particulier, les neurones dopaminergiques semblent coder un signal
d’erreur de pr{\'{e}}diction de r{\'{e}}compense. Cette th{\`{e}}se {\'{e}}tudie le probl{\`{e}}me en construisant,
{\`{a}} l’aide de l’apprentissage machine, un mod{\`{e}}le informatique int{\'{e}}grant de nombreuses
{\'{e}}vidences neurologiques.
        Apr{\`{e}}s une introduction au cadre math{\'{e}}matique et {\`{a}} quelques algorithmes de
l’apprentissage machine, un survol de l’apprentissage en psychologie et en
neuroscience et une revue des mod{\`{e}}les de l’apprentissage dans les ganglions de la
base, la th{\`{e}}se comporte trois articles. Le premier montre qu’il est possible
d’apprendre {\`{a}} maximiser ses r{\'{e}}compenses tout en d{\'{e}}veloppant de meilleures
repr{\'{e}}sentations des entr{\'{e}}es. Le second article porte sur l'important probl{\`{e}}me toujours
non r{\'{e}}solu de la repr{\'{e}}sentation du temps. Il d{\'{e}}montre qu’une repr{\'{e}}sentation du temps
peut {\^{e}}tre acquise automatiquement dans un r{\'{e}}seau de neurones artificiels faisant
office de m{\'{e}}moire de travail. La repr{\'{e}}sentation d{\'{e}}velopp{\'{e}}e par le mod{\`{e}}le ressemble
beaucoup {\`{a}} l’activit{\'{e}} de neurones corticaux dans des t{\^{a}}ches similaires. De plus, le
mod{\`{e}}le montre que l’utilisation du signal d’erreur de r{\'{e}}compense peut acc{\'{e}}l{\'{e}}rer la
construction de ces repr{\'{e}}sentations temporelles. Finalement, il montre qu’une telle
repr{\'{e}}sentation acquise automatiquement dans le cortex peut fournir l’information
n{\'{e}}cessaire aux ganglions de la base pour expliquer le signal dopaminergique. Enfin,
le troisi{\`{e}}me article {\'{e}}value le pouvoir explicatif et pr{\'{e}}dictif du mod{\`{e}}le sur diff{\'{e}}rentes
situations comme la pr{\'{e}}sence ou l’absence d’un stimulus (conditionnement classique
ou de trace) pendant l’attente de la r{\'{e}}compense. En plus de faire des pr{\'{e}}dictions tr{\`{e}}s
int{\'{e}}ressantes en lien avec la litt{\'{e}}rature sur les intervalles de temps, l’article r{\'{e}}v{\`{e}}le
certaines lacunes du mod{\`{e}}le qui devront {\^{e}}tre am{\'{e}}lior{\'{e}}es.
       Bref, cette th{\`{e}}se {\'{e}}tend les mod{\`{e}}les actuels de l’apprentissage des ganglions de
la base et du syst{\`{e}}me dopaminergique au d{\'{e}}veloppement concurrent de
repr{\'{e}}sentations temporelles dans le cortex et aux interactions de ces deux structures.

        Throughout lifetime, the brain develops abstract representations of its
environment that allow the individual to maximize his benefits. How these
representations are developed while trying to acquire rewards remains a mystery. It is
reasonable to assume that these representations arise in the cortex and that the basal
ganglia are playing an important role in reward maximization. In particular,
dopaminergic neurons appear to code a reward prediction error signal. This thesis
studies the problem by constructing, using machine learning tools, a computational
model that incorporates a number of relevant neurophysiological findings.
        After an introduction to the machine learning framework and to some of its
algorithms, an overview of learning in psychology and neuroscience, and a review of
models of learning in the basal ganglia, the thesis comprises three papers. The first
article shows that it is possible to learn a better representation of the inputs while
learning to maximize reward. The second paper addresses the important and still
unresolved problem of the representation of time in the brain. The paper shows that a
time representation can be acquired automatically in an artificial neural network
acting like a working memory. The representation learned by the model closely
resembles the activity of cortical neurons in similar tasks. Moreover, the model shows
that the reward prediction error signal could accelerate the development of the
temporal representation. Finally, it shows that if such a learned representation exists
in the cortex, it could provide the necessary information to the basal ganglia to
explain the dopaminergic signal. The third article evaluates the explanatory and
predictive power of the model on the effects of differences in task conditions such as
the presence or absence of a stimulus (classical versus trace conditioning) while
waiting for the reward. Beyond making interesting predictions relevant to the timing
literature, the paper reveals some shortcomings of the model that will need to be
resolved.
       In summary, this thesis extends current models of reinforcement learning of
the basal ganglia and the dopaminergic system to the concurrent development of
representation in the cortex and to the interactions between these two regions.}
}

@MASTERSTHESIS{,
    author = {Wood, Sean},
     title = {Non-negative matrix decomposition approaches to frequency domain analysis of music audio signals},
      year = {2010},
    school = {Universit{\'{e}} de Montr{\'{e}}al}
}

@TECHREPORT{ARXIV-2010,
       author = {Bastien, Fr{\'{e}}d{\'{e}}ric and Bengio, Yoshua and Bergeron, Arnaud and Boulanger-Lewandowski, Nicolas and Breuel, Thomas and Chherawala, Youssouf and Cisse, Moustapha and C{\^{o}}t{\'{e}}, Myriam and Erhan, Dumitru and Eustache, Jeremy and Glorot, Xavier and Muller, Xavier and Pannetier Lebeuf, Sylvain and Pascanu, Razvan and Rifai, Salah and Savard, Fran{\c c}ois and Sicard, Guillaume},
     keywords = {Computer Vision and Pattern Recognition, Learning, Neural and Evolutionary Computing},
        title = {Deep Self-Taught Learning for Handwritten Character Recognition},
       number = {1353},
         year = {2010},
  institution = {University of Montr{\'{e}}al},
     abstract = {Recent theoretical and empirical work in statistical machine learning has demonstrated the importance of learning algorithms for deep architectures, i.e., function classes obtained by composing multiple non-linear transformations. Self-taught learning (exploiting unlabeled examples or examples from other distributions) has already been applied to deep learners, but mostly to show the advantage of unlabeled examples. Here we explore the advantage brought by {\em out-of-distribution examples}. For this purpose we developed a powerful generator of stochastic variations and noise processes for character images, including not only affine transformations but also slant, local elastic deformations, changes in thickness, background images, grey level changes, contrast, occlusion, and various types of noise. The out-of-distribution examples are obtained from these highly distorted images or by including examples of object classes different from those in the target test set. We show that {\em deep learners benefit more from out-of-distribution examples than a corresponding shallow learner}, at least in the area of handwritten character recognition. In fact, we show that they beat previously published results and reach human-level performance on both handwritten digit classification and 62-class handwritten character recognition.}
}

@INPROCEEDINGS{Attardi+al-2009,
     author = {Attardi, Giuseppe and Dell'Orletta, Felice and Simi, Maria and Turian, Joseph},
   keywords = {classifier, dependency parsing, natural language, parser, perceptron},
      title = {Accurate Dependency Parsing with a Stacked Multilayer Perceptron},
  booktitle = {Proceeding of Evalita 2009},
     series = {LNCS},
       year = {2009},
  publisher = {Springer},
   abstract = {Abstract. DeSR is a statistical transition-based dependency parser which learns from annotated corpora which actions to perform for building parse trees while scanning a sentence. We describe recent improvements to the parser, in particular stacked parsing, exploiting a beam search strategy and using a Multilayer Perceptron classifier. For the Evalita 2009 Dependency Parsing task DesR was configured to use a combination of stacked parsers. The stacked combination achieved the best accuracy scores in both the main and pilot subtasks. The contribution to the result of various choices is analyzed, in particular for taking advantage of the peculiar features of the TUT Treebank.}
}

@INPROCEEDINGS{Bengio+al-2009,
    author = {Bengio, Yoshua and Louradour, Jerome and Collobert, Ronan and Weston, Jason},
     title = {Curriculum Learning},
      year = {2009},
  crossref = {ICML09-shorter},
  abstract = {Humans and animals learn much better when the examples are not randomly presented but organized in a meaningful order which illustrates gradually more concepts, and more complex ones. Here, we formalize such training strategies in the context of machine learning, and call them 'curriculum learning'. In the context of recent research studying the difficulty of training in the presence of non-convex training criteria (for deep deterministic and stochastic neural networks), we explore curriculum learning in various set-ups. The experiments show that significant improvements in generalization can be achieved by using a particular curriculum, i.e., the selection and order of training examples. We hypothesize that curriculum learning has both an effect on the speed of convergence of the training process to a minimum and, in the case of non-convex criteria, on the quality of the local minima obtained: curriculum learning can be seen as a particular form of continuation method (a general strategy for global optimization of non-convex functions).}
}

@TECHREPORT{Bengio+al-2009-TR,
       author = {Bengio, Yoshua and Louradour, Jerome and Collobert, Ronan and Weston, Jason},
        title = {Curriculum Learning},
       number = {1330},
         year = {2009},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
     abstract = {Humans and animals learn much better when the examples are not randomly presented but organized in a meaningful order which illustrates gradually more concepts, and gradually more complex ones. Here, we formalize such training strategies in the context of machine learning, and call them 'curriculum learning'. In the context of recent research studying the difficulty of training in the presence of non-convex training criteria (for deep deterministic and stochastic neural networks), we explore curriculum learning in various set-ups. The experiments show that significant improvements in generalization can be achieved. We hypothesize that curriculum learning has both an effect on the speed of convergence of the training process to a minimum and, in the case of non-convex criteria, on the quality of the local minima obtained: curriculum learning can be seen as a particular form of continuation method (a general strategy for global optimization of non-convex functions).}
}

@MISC{Bengio+al-patent-2000,
        author = {Bengio, Yoshua and Bottou, {L{\'{e}}on} and {LeCun}, Yann},
         title = {Module for constructing trainable modular  network in which each module outputs and inputs data structured as a graph},
          year = {2000},
  howpublished = {U.S.  Patent 6,128,606, October 3}
}

@MISC{Bengio+al-patent-2001,
        author = {Bengio, Yoshua and Bottou, {L{\'{e}}on} and G. Howard, Paul},
         title = {Z-Coder : a fast adaptive binary arithmetic  coder},
          year = {2001},
  howpublished = {U.S. Patent 6,188,334, February 13, 2001, along with patents 6,225,925,  6,281,817, and 6,476,740}
}

@MISC{Bengio+al-patent-94,
        author = {Bengio, Yoshua and {LeCun}, Yann and Nohl, Craig and Burges, Chris},
         title = {Visitor Registration System Using Automatic Handwriting Recognition},
          year = {1994},
  howpublished = {Patent submitted in the U.S.A. in October 1994, submission number 1-16-18-1}
}

@INCOLLECTION{Bengio+al-spectral-2006,
     author = {Bengio, Yoshua and Delalleau, Olivier and Le Roux, Nicolas and Paiement, Jean-Fran{\c c}ois and Vincent, Pascal and Ouimet, Marie},
     editor = {Guyon, Isabelle and Gunn, Steve and Nikravesh, Masoud and Zadeh, Lofti},
      title = {Spectral Dimensionality Reduction},
  booktitle = {Feature Extraction, Foundations and Applications},
       year = {2006},
  publisher = {Springer},
        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/eigenfn_chapter.pdf},
   abstract = {In this chapter, we study and put under a common framework a number
of non-linear dimensionality reduction methods, such as Locally Linear Embedding,
Isomap, Laplacian eigenmaps and kernel {PCA}, which are based
on performing an eigen-decomposition (hence the name "spectral"). That
framework also includes classical methods such as {PCA} and metric multidimensional
scaling ({MDS}). It also includes the data transformation step used
in spectral clustering. We show that in all of these cases the learning algorithm
estimates the principal eigenfunctions of an operator that depends on
the unknown data density and on a kernel that is not necessarily positive
semi-definite. This helps to generalize some of these algorithms so as to predict
an embedding for out-of-sample examples without having to retrain the
model. It also makes it more transparent what these algorithm are minimizing
on the empirical data and gives a corresponding notion of generalization
error.},
cat={B},topics={HighDimensional,Kernel,Unsupervised},
}

@INCOLLECTION{Bengio+al-ssl-2006,
     author = {Bengio, Yoshua and Delalleau, Olivier and Le Roux, Nicolas},
     editor = {Chapelle, Olivier and {Sch{\"{o}}lkopf}, Bernhard and Zien, Alexander},
      title = {Label Propagation and Quadratic Criterion},
  booktitle = {Semi-Supervised Learning},
       year = {2006},
      pages = {193--216},
  publisher = {{MIT} Press},
        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio_ssl.pdf},
   abstract = {Various graph-based algorithms for semi-supervised learning have been proposed in
the recent literature. They rely on the idea of building a graph whose nodes are
data points (labeled and unlabeled) and edges represent similarities between points.
Known labels are used to propagate information through the graph in order to label
all nodes. In this chapter, we show how these different algorithms can be cast into
a common framework where one minimizes a quadratic cost criterion whose closed-form solution is found by solving a linear system of size n (total number of data
points). The cost criterion naturally leads to an extension of such algorithms to
the inductive setting, where one obtains test samples one at a time: the derived
induction formula can be evaluated in O(n) time, which is much more efficient
than solving again exactly the linear system (which in general costs O(kn2) time
for a sparse graph where each data point has k neighbors). We also use this inductive
formula to show that when the similarity between points satisfies a locality property,
then the algorithms are plagued by the curse of dimensionality, with respect to the
dimensionality of an underlying manifold.},
cat={B},topics={Unsupervised},
}

@TECHREPORT{Bengio+al-treecurse-2007,
       author = {Bengio, Yoshua and Delalleau, Olivier and Simard, Clarence},
        title = {Decision Trees do not Generalize to New Variations},
       number = {1304},
         year = {2007},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio+al-tr1304.pdf}
}

@INPROCEEDINGS{Bengio+Bengio96,
     author = {Bengio, Samy and Bengio, Yoshua},
     editor = {Xu, L.},
      title = {An {EM} Algorithm for Asynchronous Input/Output Hidden {M}arkov Models},
  booktitle = {International Conference On Neural Information Processing},
       year = {1996},
      pages = {328--334},
        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/iconip96.pdf},
   abstract = {In learning tasks in which input sequences are mapped to output sequences, it is often the case that the input and output sequences are not synchronous. For example, in speech recognition, acoustic sequences are longer than phoneme sequences. Input/Output Hidden {Markov} Models have already been proposed to represent the distribution of an output sequence given an input sequence of the same length. We extend here this model to the case of asynchronous sequences_ and show an Expectation-Maximization algorithm for training such models.},
topics={Markov},cat={C},
}

@INCOLLECTION{Bengio+chapter2007,
     author = {Bengio, Yoshua and {LeCun}, Yann},
     editor = {Bottou, {L{\'{e}}on} and Chapelle, Olivier and DeCoste, D. and Weston, J.},
      title = {Scaling Learning Algorithms towards {AI}},
  booktitle = {Large Scale Kernel Machines},
       year = {2007},
  publisher = {MIT Press},
        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio+lecun_chapter2007.pdf},
   abstract = {One long-term goal of machine learning research is to produce methods that
are applicable to highly complex tasks, such as perception (vision, audition), reasoning,
intelligent control, and other artificially intelligent behaviors. We argue
that in order to progress toward this goal, the Machine Learning community must
endeavor to discover algorithms that can learn highly complex functions, with minimal
need for prior knowledge, and with minimal human intervention. We present
mathematical and empirical evidence suggesting that many popular approaches
to non-parametric learning, particularly kernel methods, are fundamentally limited
in their ability to learn complex high-dimensional functions. Our analysis
focuses on two problems. First, kernel machines are shallow architectures, in
which one large layer of simple template matchers is followed by a single layer
of trainable coefficients. We argue that shallow architectures can be very inefficient
in terms of required number of computational elements and examples. Second,
we analyze a limitation of kernel machines with a local kernel, linked to the
curse of dimensionality, that applies to supervised, unsupervised (manifold learning)
and semi-supervised kernel machines. Using empirical results on invariant
image recognition tasks, kernel methods are compared with deep architectures, in
which lower-level features or concepts are progressively combined into more abstract
and higher-level representations. We argue that deep architectures have the
potential to generalize in non-local ways, i.e., beyond immediate neighbors, and
that this is crucial in order to make progress on the kind of complex tasks required
for artificial intelligence.},
cat={B},topics={HighDimensional},
}

@ARTICLE{Bengio+Delalleau-2009,
    author = {Bengio, Yoshua and Delalleau, Olivier},
     title = {Justifying and Generalizing Contrastive Divergence},
   journal = {Neural Computation},
    volume = {21},
    number = {6},
      year = {2009},
     pages = {1601--1621},
  abstract = {We study an expansion of the log-likelihood in undirected graphical models such as the Restricted {Boltzmann} Machine (RBM), where each term in the expansion is associated with a sample in a Gibbs chain alternating between two random variables (the visible vector and the hidden vector, in RBMs). We are particularly interested in estimators of the gradient of the log-likelihood obtained through this expansion. We show that its residual term converges to zero, justifying the use of a truncation, i.e. running only a short Gibbs chain, which is the main idea behind the Contrastive Divergence (CD) estimator of the log-likelihood gradient. By truncating even more, we obtain a stochastic reconstruction error, related through a mean-field approximation to the reconstruction error often used to train autoassociators and stacked auto-associators.  The derivation is not specific to the particular parametric forms used in RBMs, and only requires convergence of the Gibbs chain. We present theoretical and empirical evidence linking the number of Gibbs steps $k$ and the magnitude of the RBM parameters to the bias in the CD estimator. These experiments also suggest that the sign of the CD estimator is correct most of the time, even when the bias is large, so that CD-$k$ is a good descent direction even for small $k$.}
}

@TECHREPORT{Bengio+Delalleau-TR2007,
       author = {Bengio, Yoshua and Delalleau, Olivier},
     keywords = {Contrastive Divergence, Restricted {Boltzmann} Machine},
        title = {Justifying and Generalizing Contrastive Divergence},
       number = {1311},
         year = {2007},
  institution = {D{\'{e}}partement d'Informatique et Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
     abstract = {We study an expansion of the log-likelihood in undirected graphical models such as the Restricted {Boltzmann} Machine (RBM), where each term in the expansion is associated with a sample in a Gibbs chain alternating between two random variables (the visible vector and the hidden vector, in RBMs). We are particularly interested in estimators of the gradient of the log-likelihood obtained through this expansion. We show that its terms converge to zero, justifying the use of a truncation, i.e. running only a short Gibbs chain, which is the main idea behind the Contrastive Divergence approximation of the log-likelihood gradient. By truncating even more, we obtain a stochastic reconstruction error, related through a mean-field approximation to the reconstruction error often used to train autoassociators and stacked auto-associators. The derivation is not specific to the particular parametric forms used in RBMs, and only requires convergence of the Gibbs chain.}
}

@INPROCEEDINGS{Bengio+DeMori88,
     author = {Bengio, Yoshua and De Mori, Renato},
      title = {Use of neural networks for the recognition of place of articulation},
  booktitle = {International Conference on Acoustics, Speech and Signal Processing},
       year = {1988},
      pages = {103--106},
topics={Speech},cat={C},
}

@INPROCEEDINGS{Bengio+DeMori89,
     author = {Bengio, Yoshua and Cardin, Regis and Cosi, Piero and De Mori, Renato},
      title = {Speech coding with multi-layer networks},
  booktitle = {International Conference on Acoustics, Speech and Signal Processing},
       year = {1989},
      pages = {164--167},
topics={Speech},cat={C},
}

@INCOLLECTION{Bengio+DeMori90a,
     author = {Bengio, Yoshua and De Mori, Renato},
     editor = {Sethi, I. K. and Jain, A. K.},
      title = {Connectionist models and their application to automatic speech recognition},
  booktitle = {Artificial Neural Networks and Statistical Pattern Recognition: Old and New Connections},
       year = {1990},
      pages = {175--192},
  publisher = {Elsevier, Machine Intelligence and Pattern Recognition Series},
topics={Speech},cat={B},
}

@ARTICLE{Bengio+Frasconi-jair95,
    author = {Bengio, Yoshua and Frasconi, Paolo},
     title = {Diffusion of Context and Credit Information in {M}arkovian Models},
   journal = {Journal of Artificial Intelligence Research},
    volume = {3},
      year = {1995},
     pages = {249--270},
  abstract = {This paper studies the problem of ergodicity of transition probability matrices in {Markovian} models, such as hidden {Markov} models ({HMM}s), and how it makes very difficult the task of learning to represent long-term context for sequential data. This phenomenon hurts the forward propagation of long-term context information, as well as learning a hidden state representation to represent long-term context, which depends on propagating credit information backwards in time. Using results from {Markov} chain theory, we show that this problem of diffusion of context and credit is reduced when the transition probabilities approach 0 or 1, i.e., the transition probability matrices are sparse and the model essentially deterministic. The results found in this paper apply to learning approaches based on continuous optimization, such as gradient descent and the Baum-Welch algorithm.},
topics={Markov,LongTerm},cat={J},
}

@INPROCEEDINGS{Bengio+Frasconi-nips7-diffuse,
    author = {Bengio, Yoshua and Frasconi, Paolo},
     title = {Diffusion of Credit in {M}arkovian Models},
      year = {1995},
     pages = {553--560},
  crossref = {NIPS7-shorter},
  abstract = {This paper studies the problem of diffusion in {Markovian} models, such as hidden {Markov} models ({HMM}s) and how it makes very difficult the task of learning of long-term dependencies in sequences. Using results from {Markov} chain theory, we show that the problem of diffusion is reduced if the transition probabilities approach 0 or 1. Under this condition, standard {HMM}s have very limited modeling capabilities, but input/output {HMM}s can still perform interesting computations.},
topics={Markov},cat={C},
}

@INPROCEEDINGS{Bengio+Frasconi-nips7-iohmms,
    author = {Bengio, Yoshua and Frasconi, Paolo},
     title = {An Input/Output {HMM} Architecture},
      year = {1995},
     pages = {427--434},
  crossref = {NIPS7-shorter},
  abstract = {We introduce a recurrent architecture having a modular structure and we formulate a training procedure based on the {EM} algorithm. The resulting model has similarities to hidden {Markov} models, but supports recurrent networks processing style and allows to exploit the supervised learning paradigm while using maximum likelihood estimation.},
topics={Markov},cat={C},
}

@INPROCEEDINGS{Bengio+Frasconi-nips94,
    author = {Bengio, Yoshua and Frasconi, Paolo},
     title = {Credit Assignment through Time: Alternatives to Backpropagation},
      year = {1994},
     pages = {75--82},
  crossref = {NIPS6-shorter},
  abstract = {Learning to recognize or predict sequences using long-term context has many applications. However, practical and theoretical problems are found in training recurrent neural networks to perform tasks in which input/output dependencies span long intervals. Starting from a mathematical analysis of the problem, we consider and compare alternative algorithms and architectures on tasks for which the span of the input/output dependencies can be controlled. Results on the new algorithms show performance qualitatively superior to that obtained with backpropagation.},
topics={LongTerm},cat={C},
}

@ARTICLE{Bengio+Pouliot90,
    author = {Bengio, Yoshua and Pouliot, Yannick},
     title = {Efficient recognition of immunoglobulin domains from amino-acid sequences using a neural network},
   journal = {Computer Applications in the Biosciences},
    volume = {6},
    number = {2},
      year = {1990},
     pages = {319--324},
topics={Bioinformatic,PriorKnowledge},cat={J},
}

@INPROCEEDINGS{Bengio+Senecal-2003,
     author = {Bengio, Yoshua and S{\'{e}}n{\'{e}}cal, Jean-S{\'{e}}bastien},
      title = {Quick Training of Probabilistic Neural Nets by Importance Sampling},
  booktitle = {Proceedings of the conference on Artificial Intelligence and Statistics (AISTATS)},
       year = {2003},
   abstract = {Our previous work on statistical language modeling introduced the use of probabilistic feedforward neural networks to help dealing with the curse of dimensionality. Training this model by maximum likelihood however requires for each example to perform as many network passes as there are words in the vocabulary.  Inspired by the contrastive divergence model, we propose and evaluate sampling-based methods which require network passes only for the observed "positive example'' and a few sampled negative example words. A very significant speed-up is obtained with an adaptive importance sampling.}
}

@ARTICLE{Bengio+Senecal-2008,
    author = {Bengio, Yoshua and S{\'{e}}n{\'{e}}cal, Jean-S{\'{e}}bastien},
  keywords = {Energy-based models, fast training, importance sampling, language modeling, Monte Carlo methods, probabilistic neural networks},
     title = {Adaptive Importance Sampling to Accelerate Training of a Neural Probabilistic Language Model},
   journal = {IEEE Transactions on Neural Networks},
    volume = {19},
    number = {4},
      year = {2008},
     pages = {713--722},
  abstract = {Previous work on statistical language modeling has shown that it is possible to train a feedforward neural network to approximate probabilities over sequences of words, resulting in significant error reduction when compared to standard baseline models based on -grams. However, training the neural network model with the maximum-likelihood criterion requires computations proportional to the number of words in the vocabulary. In this paper, we introduce adaptive importance sampling as a way to accelerate training of the model. The idea is to use an adaptive n-gram model to track the conditional distributions produced by the neural network. We show that a very significant speedup can be obtained on standard problems.}
}

@INCOLLECTION{Bengio-2007,
     author = {Bengio, Yoshua},
     editor = {Cisek, Paul and Kalaska, John and Drew, Trevor},
      title = {On the Challenge of Learning Complex Functions},
  booktitle = {Computational Neuroscience: Theoretical Insights into Brain Function},
     series = {Progress in Brain Research},
       year = {2007},
  publisher = {Elsevier},
        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/PBR_chapter.pdf},
   abstract = {A common goal of computational neuroscience and of artificial intelligence
research based on statistical learning algorithms is the discovery and
understanding of computational principles that could explain what we
consider adaptive intelligence, in animals as well as in machines. This
chapter focuses on what is required for the learning of complex behaviors. We
believe it involves the learning of highly varying functions, in a
mathematical sense. We bring forward two types of arguments which convey
the message that many currently popular machine learning approaches to
learning flexible functions have fundamental limitations that render them 
inappropriate for learning highly varying functions. The first issue
concerns the representation of such functions with what we call shallow model
architectures.  We discuss limitations of shallow architectures, such as
so-called kernel machines, boosting algorithms, and one-hidden-layer artificial neural
networks.  The second issue is more focused and concerns kernel machines
with a local kernel (the type used most often in practice),
that act like a collection of template matching units. We present
mathematical results on such computational architectures showing that they
have a limitation similar to those already proved for older non-parametric
methods, and connected to the so-called curse of dimensionality. Though it has long
been believed that efficient learning in deep architectures is difficult,
recently proposed computational principles for learning in deep architectures
may offer a breakthrough.}
}

@ARTICLE{Bengio-2009,
    author = {Bengio, Yoshua},
     title = {Learning deep architectures for {AI}},
   journal = {Foundations and Trends in Machine Learning},
    volume = {2},
    number = {1},
      year = {2009},
     pages = {1--127},
      note = {Also published as a book. Now Publishers, 2009.},
  abstract = {Theoretical results suggest that in order to learn the kind of
complicated functions that can represent high-level abstractions (e.g. in
vision, language, and other AI-level tasks), one may need {\insist deep
architectures}. Deep architectures are composed of multiple levels of non-linear
operations, such as in neural nets with many hidden layers or in complicated
propositional formulae re-using many sub-formulae. Searching the
parameter space of deep architectures is a difficult task, but
learning algorithms such as those for Deep Belief Networks have recently been proposed
to tackle this problem with notable success, beating the state-of-the-art
in certain areas. This paper discusses the motivations and principles regarding 
learning algorithms for deep architectures,  in particular those exploiting as
building blocks unsupervised learning of single-layer models such as Restricted {Boltzmann} Machines,
used to construct deeper models such as Deep Belief Networks.}
}

@TECHREPORT{Bengio-96-TR,
       author = {Bengio, Yoshua},
        title = {Using a Financial Training Criterion Rather than a Prediction Criterion},
       number = {\#1019},
         year = {1996},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengioy_TR1019.pdf},
     abstract = {The application of this work is to decision taking with financial time-series, using learning algorithms. The traditional approach is to train a model using a rediction criterion, such as minimizing the squared error between predictions and actual values of a dependent variable, or maximizing the likelihood of a conditional model of the dependent variable. We find here with noisy time-series that better results can be obtained when the model is directly trained in order to optimize the financial criterion of interest. Experiments were performed on portfolio selection with 35 Canadian stocks.},
topics={Finance,Discriminant},cat={T},
}

@BOOK{bengio-book96,
     author = {Bengio, Yoshua},
      title = {Neural Networks for Speech and Sequence Recognition},
       year = {1996},
  publisher = {International Thompson Computer Press},
topics={Speech},cat={B},
}

@TECHREPORT{Bengio-convex-05,
       author = {Bengio, Yoshua and Le Roux, Nicolas and Vincent, Pascal and Delalleau, Olivier and Marcotte, Patrice},
        title = {Convex neural networks},
       number = {1263},
         year = {2005},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TR1263.pdf},
     abstract = {Convexity has recently received a lot of attention in the machine learning community, and the lack of convexity has been seen as a major disadvantage of many learning algorithms, such as multi-layer artificial neural networks. We how that training multi-layer neural networks in which the number of hidden units is learned can be viewed as a convex optimization problem. This problem involves an infinite number of variables, but can be solved by incrementally inserting a hidden unit at a time, each time finding a linear classifiers that minimizes a weighted sum of errors.},
topics={Boosting},cat={T},
}

@ARTICLE{Bengio-decision-trees10,
    author = {Bengio, Yoshua and Delalleau, Olivier and Simard, Clarence},
     title = {Decision Trees do not Generalize to New Variations},
   journal = {Computational Intelligence},
      year = {2010},
      note = {To appear}
}

@ARTICLE{bengio-demori89,
    author = {Bengio, Yoshua and De Mori, Renato},
     title = {Use of multilayer networks for the recognition of phonetic features and phonemes},
   journal = {Computational Intelligence},
    volume = {5},
      year = {1989},
     pages = {134--141},
topics={Speech},cat={J},
}

@ARTICLE{Bengio-eigen-NC2004,
    author = {Bengio, Yoshua and Delalleau, Olivier and Le Roux, Nicolas and Paiement, Jean-Fran{\c c}ois and Vincent, Pascal and Ouimet, Marie},
     title = {Learning eigenfunctions links spectral embedding and kernel {PCA}},
   journal = {Neural Computation},
    volume = {16},
    number = {10},
      year = {2004},
     pages = {2197--2219},
  abstract = {In this paper, we show a direct relation between spectral embedding methods and kernel {PCA}, and how both are special cases of a more general learning problem, that of learning the principal eigenfunctions of an operator defined from a kernel and the unknown data generating density. Whereas spectral embedding methods only provided coordinates for the training points, the analysis justifies a simple extension to out-of-sample examples (the Nystr{\"{o}}m formula) for Multi-Dimensional Scaling, spectral clustering, Laplacian eigenmaps, Locally Linear Embedding ({LLE}) and Isomap. The analysis provides, for all such spectral embedding methods, the definition of a loss function, whose empirical average is minimized by the traditional algorithms. The asymptotic expected value of that loss defines a generalization performance and clarifies what these algorithms are trying to learn. Experiments with {LLE}, Isomap, spectral clustering and {MDS} show that this out-of-sample embedding formula generalizes well, with a level of error comparable to the effect of small perturbations of the training set on the embedding.},
topics={HighDimensional,Kernel,Unsupervised},cat={J},
}

@INPROCEEDINGS{Bengio-Gingras-nips8,
    author = {Bengio, Yoshua and Gingras, Fran{\c c}ois},
     title = {Recurrent Neural Networks for Missing or Asynchronous Data},
      year = {1996},
     pages = {395--401},
  crossref = {NIPS8-shorter},
  abstract = {In this paper we propose recurrent neural networks with feedback into the input units for handling two types of data analysis problems. On the one hand, this scheme can be used for static data when some of the input variables are missing. On the other hand, it can also be used for sequential data, when some of the input variables are missing or are available at different frequencies. Unlike in the case of probabilistic models (e.g. Gaussian) of the missing variables, the network does not attempt to model the distribution of the missing variables given the observed variables. Instead it is a more discriminant approach that fills in the missing variables for the sole purpose of minimizing a learning criterion (e.g., to minimize an output error).},
topics={Finance,Missing},cat={C},
}

@ARTICLE{Bengio-Grandvalet-JMLR-04,
    author = {Bengio, Yoshua and Grandvalet, Yves},
     title = {No Unbiased Estimator of the Variance of K-Fold Cross-Validation},
    volume = {5},
      year = {2004},
     pages = {1089--1105},
  crossref = {JMLR-shorter},
  abstract = {Most machine learning researchers perform quantitative experiments to estimate generalization error and compare the performance of different algorithms (in particular, their proposed algorithm). In order to be able to draw statistically convincing conclusions, it is important to estimate the uncertainty of such estimates. This paper studies the very commonly used K-fold cross-validation estimator of generalization performance. The main theorem shows that there exists no universal (valid under all distributions) unbiased estimator of the variance of K-fold cross-validation. The analysis that accompanies this result is based on the eigen-decomposition of the covariance matrix of errors, which has only three different eigenvalues corresponding to three degrees of freedom of the matrix and three components of the total variance. This analysis helps to better understand the nature of the problem and how it can make naive estimators (that don’t take into account the error correlations due to the overlap between training and test sets) grossly underestimate variance. This is confirmed by numerical experiments in which the three components of the variance are compared when the difficulty of the learning problem and the number of folds are varied.},
topics={Comparative},cat={J},
}

@TECHREPORT{bengio-hyper-TR99,
       author = {Bengio, Yoshua},
        title = {Continuous Optimization of Hyper-Parameters},
       number = {1144},
         year = {1999},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/hyperTR.pdf},
     abstract = {Many machine learning algorithms can be formulated as the minimization of a training criterion which involves (1) “training errors” on each training example and (2) some hyper-parameters, which are kept fixed during this minimization.	 When there is only a single hyper-parameter one can easily explore how its value aects a model selection criterion (that is not the same as the training criterion, and is used to select hyper-parameters).	In this paper we present a methodology to select many hyper-parameters that is based on the computation of the gradient of a model selection criterion with respect to the hyper-parameters. We first consider the case of a training criterion that is quadratic in the parameters. In that case, the gradient of the selection criterion with respect to the hyper-parameters is efficiently computed by back-propagating through a Cholesky decomposition. In the more general case, we show that the implicit function theorem can be used to derive a formula for the hyper-parameter gradient, but this formula requires the computation of second derivatives of the training criterion},
topics={ModelSelection},cat={T},
}

@INPROCEEDINGS{Bengio-icnn93,
     author = {Bengio, Yoshua and Frasconi, Paolo and Simard, Patrice},
      title = {The problem of learning long-term dependencies in recurrent networks},
  booktitle = {IEEE International Conference on Neural Networks},
       year = {1993},
      pages = {1183--1195},
  publisher = {IEEE Press},
       note = {(invited paper)},
topics={LongTerm},cat={C},
}

@ARTICLE{Bengio-ijprai93,
    author = {Bengio, Yoshua},
     title = {A Connectionist Approach to Speech Recognition},
   journal = {International Journal on Pattern Recognition and Artificial Intelligence},
    volume = {7},
    number = {4},
      year = {1993},
     pages = {647--668},
  abstract = {The task discussed in this paper is that of learning to map input sequences to output sequences. In particular, problems of phoneme recognition in continuous speech are considered, but most of the discussed techniques could be applied to other tasks, such as the recognition of sequences of handwritten characters. The systems considered in this paper are based on connectionist models, or artificial neural networks, sometimes combined with statistical techniques for recognition of sequences of patterns, stressing the integration of prior knowledge and learning. Different architectures for sequence and speech recognition are reviewed, including recurrent networks as well as hybrid systems involving hidden {Markov} models.},
topics={PriorKnowledge,Speech},cat={J},
}

@TECHREPORT{Bengio-iohmms-TR99,
       author = {Bengio, Yoshua and Lauzon, Vincent-Philippe and Ducharme, R{\'{e}}jean},
        title = {Experiments on the Application of {IOHMM}s to Model Financial Returns Series},
       number = {1146},
         year = {1999},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/iohmms-returnsTR.pdf},
     abstract = {Input/Output Hidden {Markov} Models ({IOHMM}s) are conditional hidden {Markov} models in which the emission (and possibly the transition) probabilities can be conditionned on an input sequence. For example, these conditional distributions can be linear, logistic, or non-linear (using for example multi-layer neural networks). We compare the generalization performance of several models which are special cases of Input/Output Hidden {Markov} Models on financial time-series prediction tasks: an unconditional Gaussian, a conditional linear Gaussian, a mixture of Gaussians, a mixture of conditional linear Gaussians, a hidden {Markov} model, and various {IOHMM}s. The experiments are performed on modeling the returns of market and sector indices. Note that the unconditional Gaussian estimates the first moment with the historical average. The results show that, although for the first moment the historical average gives the best results, for the higher moments, the {IOHMM}s yielded significantly better performance, as measured by the out-of-sample likelihood.},
topics={Markov},cat={T},
}

@ARTICLE{bengio-lauzon-ducharme:2000,
    author = {Bengio, Yoshua and Lauzon, Vincent-Philippe and Ducharme, R{\'{e}}jean},
     title = {Experiments on the Application of {IOHMM}s to Model Financial Returns Series},
   journal = {IEEE Transaction on Neural Networks},
    volume = {12},
    number = {1},
      year = {2001},
     pages = {113--123},
  abstract = {Input/Output Hidden {Markov} Models ({IOHMM}s) are conditional hidden {Markov} models in which the emission (and possibly the transition) probabilities can be conditioned on an input sequence. For example, these conditional distributions can be logistic, or non-linear (using for example multi-layer neural networks).  We compare generalization performance of several models which are special cases of Input/Output Hidden {Markov} Models on financial time-series prediction tasks: an unconditional Gaussian, a conditional linear Gaussian, a mixture of Gaussians, a mixture of conditional linear Gaussians, a hidden {Markov} model, and various {IOHMM}s. The experiments compare these models on predicting the conditional density of returns of market sector indices. Note that the unconditional Gaussian estimates the first moment the historical average. The results show that_ although for the first moment the historical average gives the best results, for the higher moments, the {IOHMM}s significantly better performance, as estimated by the out-of-sample likelihood.},
topics={Markov,Finance},cat={J},
}

@INPROCEEDINGS{bengio-lecun-94,
     author = {Bengio, Yoshua and {LeCun}, Yann},
      title = {Word normalization for on-line handwritten word recognition},
  booktitle = {Proc. of the International Conference on Pattern Recognition},
     volume = {II},
       year = {1994},
      pages = {409--413},
  publisher = {IEEE},
        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/icpr-norm.ps},
   abstract = {We introduce a new approach to normalizing words written with an electronic stylus that applies to all styles of handwriting (upper case, lower case, printed, cursive, or mixed). A geometrical model of the word spatial structure is fitted to the pen trajectory using the {EM} algorithm. The fitting process maximizes the likelihood of the trajectory given the model and a set a priors on its parameters. The method was evaluated and integrated to a recognition system that combines neural networks and hidden {Markov} models.},
topics={PriorKnowledge,Speech},cat={C},
}

@TECHREPORT{Bengio-localfailure-TR-2005,
       author = {Bengio, Yoshua and Delalleau, Olivier and Le Roux, Nicolas},
        title = {The Curse of Dimensionality for Local Kernel Machines},
       number = {1258},
         year = {2005},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/tr1258.pdf},
     abstract = {We present a series of theoretical arguments supporting the claim that a large class of modern learning algorithms based on local kernels are sensitive to the curse of dimensionality. These include local manifold learning algorithms such as Isomap and {LLE}, support vector classifiers with Gaussian or other local kernels, and graph-based semisupervised learning algorithms using a local similarity function. These algorithms are shown to be local in the sense that crucial properties of the learned function at x depend mostly on the neighbors of x in the training set. This makes them sensitive to the curse of dimensionality, well studied for classical non-parametric statistical learning. There
is a large class of data distributions for which non-local solutions could be expressed compactly and potentially be learned with few examples, but which will require a large number of local bases and therefore a large number of training examples when using a local learning algorithm.},
topics={HighDimensional,Kernel,Unsupervised},cat={T},
}

@INPROCEEDINGS{Bengio-nips-2006,
    author = {Bengio, Yoshua and Lamblin, Pascal and Popovici, Dan and Larochelle, Hugo},
     title = {Greedy Layer-Wise Training of Deep Networks},
      year = {2007},
     pages = {153--160},
  crossref = {NIPS19-shorter},
  abstract = {Complexity theory of circuits strongly suggests that deep architectures can be
much more efficient (sometimes exponentially) than shallow architectures,
in terms of computational elements required to represent some functions.
Deep multi-layer neural networks have many levels of non-linearities
allowing them to compactly represent highly non-linear and
highly-varying functions. However, until recently it was not clear how
to train such deep networks, since gradient-based
optimization starting from random initialization appears to often get stuck
in poor solutions. Hinton et al. recently introduced
a greedy layer-wise unsupervised learning algorithm for Deep Belief
Networks (DBN), a generative model with many layers of hidden causal
variables. In the context of the above optimization problem,
we study this algorithm empirically and explore variants to
better understand its success and extend it to cases where the inputs are
continuous or where the structure of the input distribution is not
revealing enough about the variable to be predicted in a supervised task.
Our experiments also confirm the hypothesis that the greedy
layer-wise unsupervised training strategy mostly helps the
optimization, by initializing weights in a region near a
good local minimum, giving rise to internal distributed representations
that are high-level abstractions of the input, bringing better generalization.}
}

@INPROCEEDINGS{Bengio-nips10,
    author = {Bengio, Yoshua and Bengio, Samy and Isabelle, Jean-Fran{\c c}ois and Singer, Yoram},
     title = {Shared Context Probabilistic Transducers},
      year = {1998},
  crossref = {NIPS10-shorter},
  abstract = {Recently, a model for supervised learning of probabilistic transducers represented by suffix trees was introduced. However, this algorithm tends to build very large trees, requiring very large amounts of computer memory. In this paper, we propose a new, more compact, transducer model in which one shares the parameters of distributions associated to contexts yielding similar conditional output distributions. We illustrate the advantages of the proposed algorithm with comparative experiments on inducing a noun phrase recognizer.},
topics={HighDimensional},cat={C},
}

@TECHREPORT{Bengio-NLMP-TR-2005,
       author = {Bengio, Yoshua and Larochelle, Hugo},
        title = {Non-Local Manifold Parzen Windows},
       number = {1264},
         year = {2005},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/NLMP-techreport.pdf},
     abstract = {In order to escape from the curse of dimensionality, we claim that one can learn non-local functions, in the sense that the value and shape of the learned function at x must be inferred using examples that may be far from x. With this objective, we present a non-local non-parametric density estimator. It builds upon previously proposed Gaussian mixture models with regularized covariance matrices to take into account the local shape of the manifold. It also builds upon recent work on non-local estimators of the tangent plane of a manifold, which are able to generalize in places with little training data, unlike traditional, local, non-parametric models.},
topics={HighDimensional,Kernel,Unsupervised},cat={T},
}

@INPROCEEDINGS{Bengio-nncm96,
     author = {Bengio, Yoshua},
     editor = {Weigend, A.S. and Abu-Mostafa, Y.S. and Refenes, A. -P. N.},
      title = {Training A Neural Network with a Financial Criterion Rather than a Prediction Criterion},
  booktitle = {Proceedings of the Fourth International Conference on Neural Networks in the Capital Markets ({NNCM}-96)},
       year = {1997},
      pages = {433--443},
  publisher = {World Scientific},
        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/nncm.pdf},
   abstract = {A common approach to quantitative decision taking with financial time-series is to train a model using a prediction criterion (e.g., squared error). We find on a portfolio selection problem that better results can be obtained when the model is directly trained in order to optimize the financial criterion of interest, with a differentiable decision module.},
topics={Finance,PriorKnowledge,Discriminant},cat={C},
}

@TECHREPORT{Bengio-NonStat-Hyper-TR,
       author = {Bengio, Yoshua and Dugas, Charles},
        title = {Learning Simple Non-Stationarities with Hyper-Parameters},
       number = {1145},
         year = {1999},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/nonstatTR.pdf},
     abstract = {We consider sequential data that is sampled from an unknown process, so that the data are not necessarily i.i.d.. Most approaches to machine learning assume that data points are i.i.d.. Instead we consider a measure of generalization that does not make this assumption, and we consider in this context a recently proposed approach to optimizing hyper-parameters, based on the computation of the gradient of a model selection criterion with respect to hyper-parameters. Here we use hyper-parameters that control a function that gives different weights to different time steps in the historical data sequence. The approach is successfully applied to modeling thev olatility of stock returns one month ahead. Comparative experiments with more traditional methods are presented.},
topics={ModelSelection,Finance},cat={T},
}

@ARTICLE{Bengio-scholarpedia-2007,
    author = {Bengio, Yoshua},
     title = {Neural net language models},
   journal = {Scholarpedia},
    volume = {3},
    number = {1},
      year = {2008},
     pages = {3881},
  abstract = {A language model is a function, or an algorithm for learning such a function, that captures the salient statistical characteristics of the distribution of sequences of words in a natural language, typically allowing one to make probabilistic predictions of the next word given preceding ones.

A neural network language model is a language model based on Neural Networks , exploiting their ability to learn distributed representations to reduce the impact of the curse of dimensionality.

In the context of learning algorithms, the curse of dimensionality refers to the need for huge numbers of training examples when learning highly complex functions. When the number of input variables increases, the number of required examples can grow exponentially. The curse of dimensionality arises when a huge number of different combinations of values of the input variables must be discriminated from each other, and the learning algorithm needs at least one example per relevant combination of values. In the context of language models, the problem comes from the huge number of possible sequences of words, e.g., with a sequence of 10 words taken from a vocabulary of 100,000 there are 10^{50} possible sequences...

A distributed representation of a symbol is a tuple (or vector) of features which characterize the meaning of the symbol, and are not mutually exclusive. If a human were to choose the features of a word, he might pick grammatical features like gender or plurality, as well as semantic features like animate" or invisible. With a neural network language model, one relies on the learning algorithm to discover these features, and the features are continuous-valued (making the optimization problem involved in learning much simpler).

The basic idea is to learn to associate each word in the dictionary with a continuous-valued vector representation. Each word corresponds to a point in a feature space. One can imagine that each dimension of that space corresponds to a semantic or grammatical characteristic of words. The hope is that functionally similar words get to be closer to each other in that space, at least along some directions. A sequence of words can thus be transformed into a sequence of these learned feature vectors. The neural network learns to map that sequence of feature vectors to a prediction of interest, such as the probability distribution over the next word in the sequence. What pushes the learned word features to correspond to a form of semantic and grammatical similarity is that when two words are functionally similar, they can be replaced by one another in the same context, helping the neural network to compactly represent a function that makes good predictions on the training set, the set of word sequences used to train the model.

The advantage of this distributed representation approach is that it allows the model to generalize well to sequences that are not in the set of training word sequences, but that are similar in terms of their features, i.e., their distributed representation. Because neural networks tend to map nearby inputs to nearby outputs, the predictions corresponding to word sequences with similar features are mapped to similar predictions. Because many different combinations of feature values are possible, a very large set of possible meanings can be represented compactly, allowing a model with a comparatively small number of parameters to fit a large training set.}
}

@TECHREPORT{Bengio-TR1312,
       author = {Bengio, Yoshua},
        title = {Learning deep architectures for AI},
       number = {1312},
         year = {2007},
  institution = {Dept. IRO, Universite de Montreal},
         note = {Preliminary version of journal article with the same title appearing in Foundations and Trends in Machine Learning (2009)},
          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TR1312.pdf},
     abstract = {Theoretical results strongly suggest that in order to learn the kind of
complicated functions that can represent high-level abstractions (e.g. in
vision, language, and other AI-level tasks), one may need deep
architectures. Deep architectures are composed of multiple levels of non-linear
operations, such as in neural nets with many hidden layers. Searching the
parameter space of deep architectures is a difficult optimization task, but
learning algorithms such as those for Deep Belief Networks have recently been proposed
to tackle this problem with notable success, beating the state-of-the-art
in certain areas. This paper discusses the motivations and principles regarding 
learning algorithms for deep architectures and in particular for those based
on unsupervised learning such as Deep Belief Networks, using as building
blocks single-layer models such as Restricted {Boltzmann} Machines.}
}

@ARTICLE{Bengio-trnn94,
    author = {Bengio, Yoshua and Simard, Patrice and Frasconi, Paolo},
     title = {Learning Long-Term Dependencies with Gradient Descent is Difficult},
   journal = {IEEE Transactions on Neural Networks},
    volume = {5},
    number = {2},
      year = {1994},
     pages = {157--166},
  abstract = {Recurrent neural networks can be used to map input sequences to output sequences, such as for recognition, production or prediction problems. However, practical difficulties have been reported in training recurrent neural networks to perform tasks in which the temporal contingencies present in the input/output sequences span long intervals. We show why gradient based learning algorithms face an increasingly difficult problem as the duration of the dependencies to be captures increases. These results expose a trade-off between efficient learning by gradient descent and latching on information for long periods. Based on an understanding of this problem, alternatives to standard gradient descent are considered.},
optnote={(Special Issue on Recurrent Neural Networks)},topics={LongTerm},cat={J},
}

@INPROCEEDINGS{Bengio-wirn93,
     author = {Bengio, Yoshua and Frasconi, Paolo and Gori, Marco and Soda, G.},
     editor = {Caianello, E.},
      title = {Recurrent Neural Networks for Adaptive Temporal Processing},
  booktitle = {Proc. of the 6th Italian Workshop on Neural Networks, WIRN-93},
       year = {1993},
      pages = {1183--1195},
  publisher = {World Scientific Publ.},
        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/rnn_review93.ps},
topics={LongTerm},cat={C},
}

@ARTICLE{Bengio2000c,
    author = {Bengio, Yoshua},
     title = {Gradient-Based Optimization of Hyperparameters},
   journal = {Neural Computation},
    volume = {12},
    number = {8},
      year = {2000},
     pages = {1889--1900},
  abstract = {Many machine learning algorithms can be formulated as the minimization of a training criterion which involves a hyper-parameter. This hyper-parameter is usually chosen by trial and error with a model selection criterion. In this paper we present a methodology to optimize several hyper-parameters, based on the computation of the gradient of a model selection criterion with respect to the hyper-parameters. In the case of a quadratic training criterion, the gradient of the selection criterion with respect to the hyper-parameters is efficiently computed by back-propagating through a Cholesky decomposition. In the more general case, we show that the implicit function theorem can be used to derive a formula for the hyper-parameter gradient involving second derivatives of the training criterion.},
topics={ModelSelection},cat={J},
}

@ARTICLE{Bengio89a,
    author = {Bengio, Yoshua and Cardin, Regis and De Mori, Renato and Merlo, Ettore},
     title = {Programmable execution of multi-layered networks for automatic speech recognition},
   journal = {Communications of the Association for Computing Machinery},
    volume = {32},
    number = {2},
      year = {1989},
     pages = {195--199},
topics={Speech},cat={J},
}

@INPROCEEDINGS{Bengio89c,
    author = {Bengio, Yoshua and Cosi, Piero and Cardin, Regis and De Mori, Renato},
     title = {Use of multi-layered networks for coding speech with phonetic features},
      year = {1989},
     pages = {224--231},
  crossref = {NIPS1-shorter},
  abstract = {Preliminary results on speaker-independant speech recognition are reported. A method that combines expertise on neural networks with expertise on speech recognition is used to build the recognition systems. For transient sounds, event-driven property extractors with variable resolution in the time and frequency domains are used. For sonorant speech, a model of the human auditory system is preferred to FFT as a front-end module.},
topics={Speech},cat={C},
}

@INPROCEEDINGS{Bengio89d,
     author = {De Mori, Renato and Bengio, Yoshua and Cosi, Piero},
      title = {On the generalization capability of multilayered networks in the extraction of speech properties},
  booktitle = {Proceedings of the International Joint Conference on Artificial Intelligence},
       year = {1989},
      pages = {1531--1536},
  publisher = {IEEE},
topics={Speech},cat={C},
}

@INPROCEEDINGS{Bengio90,
    author = {Bengio, Yoshua and Cardin, Regis and De Mori, Renato},
     title = {Speaker Independent Speech Recognition with Neural Networks and Speech Knowledge},
      year = {1990},
     pages = {218--225},
  crossref = {NIPS2-shorter},
  abstract = {We attempt to combine neural networks with knowledge from speech science to build a speaker independent speech recognition system. This knowledge is utilized in designing the preprocessing, input coding, output coding, output supervision and architectural constraints. To handle the temporal aspect of speech we combine delays, copies of activations of hidden and output units at the input level, and Back-Propagation for Sequences (BPS), a learning algorithm for networks with local self-loops. This strategy is demonstrated in several experiments, in particular a nasal discrimination task for which the application of a speech theory hypothesis dramatically improved generalization.},
topics={PriorKnowledge,Speech},cat={C},
}

@INCOLLECTION{Bengio90b,
     author = {Bengio, Yoshua},
      title = {Radial Basis Functions for speech recognition},
  booktitle = {Speech Recognition and Understanding: Recent Advances, Trends and Applications},
       year = {1990},
      pages = {293--298},
  publisher = {NATO Advanced Study Institute Series F: Computer and Systems Sciences},
topics={Kernel,Speech},cat={B},
}

@INCOLLECTION{Bengio90c,
     author = {Bengio, Yoshua and De Mori, Renato},
     editor = {{Fogelman Soulie}, F. and Herault, J.},
      title = {Speech coding with multilayer networks},
  booktitle = {Neurocomputing: Algorithms, Architectures and Applications},
       year = {1990},
      pages = {207--216},
  publisher = {NATO Advanced Study Institute Series F: Computer and Systems Sciences},
topics={Speech},cat={B},
}

@INPROCEEDINGS{Bengio90e,
    author = {Bengio, Yoshua and Pouliot, Yannick and Bengio, Samy and Agin, Patrick},
     title = {A neural network to detect homologies in proteins},
      year = {1990},
     pages = {423--430},
  crossref = {NIPS2-shorter},
  abstract = {In order to detect the presence and location of immunoglobulin (Ig) domains from amino acid sequences we built a system based on a neural network with one hidden layer trained with back propagation. The program was designed to efficiently identify proteins exhibiting such domains, characterized by a few localized conserved regions and a low overall homology. When the National Biomedical Research Foundation (NBRF) NEW protein sequence database was scanned to evaluate the program's performance, we obtained very low rates of false negatives coupled with a moderate rate of false positives.},
topics={Bioinformatic,PriorKnowledge},cat={C},
}

@INPROCEEDINGS{Bengio90z,
     author = {Bengio, Yoshua and De Mori, Renato and Gori, Marco},
     editor = {Caianello, E.},
      title = {Experiments on automatic speech recognition using BPS},
  booktitle = {Parallel Architectures and Neural Networks},
       year = {1990},
      pages = {223--232},
  publisher = {World Scientific Publ.},
topics={Speech},cat={C},
}

@INPROCEEDINGS{Bengio91a,
     author = {Bengio, Yoshua and De Mori, Renato and Flammia, Giovanni and Kompe, Ralf},
      title = {A comparative study of hybrid acoustic phonetic decoders based on artificial neural networks},
  booktitle = {Proceedings of EuroSpeech'91},
       year = {1991},
topics={PriorKnowledge,Speech},cat={C},
}

@INPROCEEDINGS{Bengio91b,
     author = {Bengio, Yoshua and De Mori, Renato and Flammia, Giovanni and Kompe, Ralf},
      title = {Global Optimization of a Neural Network - Hidden {M}arkov Model Hybrid},
  booktitle = {Proceedings of EuroSpeech'91},
       year = {1991},
topics={Markov},cat={C},
}

@INPROCEEDINGS{Bengio91z,
     author = {Bengio, Yoshua and De Mori, Renato and Flammia, Giovanni and Kompe, Ralf},
      title = {Phonetically motivated acoustic parameters for continuous speech recognition using artificial neural networks},
  booktitle = {Proceedings of EuroSpeech'91},
       year = {1991},
cat={C},
}

@ARTICLE{Bengio92b,
    author = {Bengio, Yoshua and De Mori, Renato and Flammia, Giovanni and Kompe, Ralf},
     title = {Phonetically motivated acoustic parameters for continuous speech recognition using artificial neural networks},
   journal = {Speech Communication},
    volume = {11},
    number = {2--3},
      year = {1992},
     pages = {261--271},
      note = {Special issue on neurospeech},
topics={PriorKnowledge,Speech},cat={J},
}

@INPROCEEDINGS{Bengio92c,
    author = {Bengio, Yoshua and De Mori, Renato and Flammia, Giovanni and Kompe, Ralf},
     title = {Neural Network - Gaussian Mixture Hybrid for Speech Recognition or Density Estimation},
      year = {1992},
     pages = {175--182},
  crossref = {NIPS4-shorter},
  abstract = {The subject of this paper is the integration of multi-layered Artificial Neural Networks ({ANN}) with probability density functions such as Gaussian mixtures found in continuous density hlidden {Markov} Models ({HMM}). In the first part of this paper we present an {ANN}/HMM hybrid in which all the parameters or the the system are simultaneously optimized with respect to a single criterion. In the second part of this paper, we study the relationship between the density of the inputs of the network and the density of the outputs of the networks. A rew experiments are presented to explore how to perform density estimation with {ANN}s.},
topics={Speech},cat={C},
}

@INPROCEEDINGS{Bengio94d,
     author = {Frasconi, Paolo and Bengio, Yoshua},
      title = {An {EM} Approach to Grammatical Inference: Input/Output {HMMs}},
  booktitle = {International Conference on Pattern Recognition (ICPR'94)},
       year = {1994},
      pages = {289--294},
topics={Markov,LongTerm},cat={C},
}

@ARTICLE{Bengio96,
    author = {Bengio, Yoshua and Frasconi, Paolo},
     title = {Input/{O}utput {HMM}s for Sequence Processing},
   journal = {IEEE Transactions on Neural Networks},
    volume = {7},
    number = {5},
      year = {1996},
     pages = {1231--1249},
  abstract = {We consider problems of sequence processing and propose a solution based on a discrete state model in order to represent past context. We introduce a recurrent connectionist architecture having a modular structure that associates a subnetwork to each state. The model has a statistical interpretation we call Input/Output Hidden {Markov} Model ({IOHMM}). It can be trained by the {EM} or {GEM} algorithms, considering state trajectories as missing data, which decouples temporal credit assignment and actual parameter estimation.
The model presents similarities to hidden {Markov} models ({HMM}s), but allows us to map input sequences to output sequences, using the same processing style as recurrent neural networks. {IOHMM}s are trained using a more discriminant learning paradigm than {HMM}s, while potentially taking advantage of the {EM} algorithm.
We demonstrate that {IOHMM}s are well suited for solving grammatical inference problems on a benchmark problem. Experimental results are presented for the seven Tomita grammars, showing that these adaptive models can attain excellent generalization.},
topics={Markov},cat={J},
}

@TECHREPORT{Bengio96-hmmsTR,
       author = {Bengio, Yoshua},
        title = {Markovian Models for Sequential Data},
       number = {1049},
         year = {1996},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/hmmsTR.pdf},
     abstract = {Hidden {Markov} Models ({HMM}s) are statistical models of sequential data that have been used successfully in many applications, especially for speech recognition. We first summarize the basics of {HMM}s, and then review several recent related learning algorithms and extensions of {HMM}s, including hybrids of {HMM}s with artificial neural networks, Input-Output {HMM}s, weighted transducers, variable-length {Markov} models and {Markov} switching state-space models. Finally, we discuss some of the challenges of future research in this area.},
topics={Markov},cat={T},
}

@ARTICLE{Bengio97,
    author = {Bengio, Yoshua},
     title = {Using a Financial Training Criterion Rather than a Prediction Criterion},
   journal = {International Journal of Neural Systems},
    volume = {8},
    number = {4},
      year = {1997},
     pages = {433--443},
      note = {Special issue on noisy time-series},
  abstract = {The application of this work is to decision taking with financial time-series, using learning algorithms. The traditional approach is to train a model using a prediction criterion, such as minimizing the squared error between predictions and actual values of a dependent variable, or maximizing the likelihood of a conditional model of the dependent variable. We find here with noisy time-series that better results can be obtained when the model is directly trained in order to maximize the financial criterion of interest, here gains and losses (including those due to transactions) incurred during trading. Experiments were performed on portfolio selection with 35 Canadian stocks.},
topics={Finance,PriorKnowledge,Discriminant},cat={J},
}

@ARTICLE{Bengio99a,
    author = {Bengio, Yoshua},
     title = {Markovian Models for Sequential Data},
   journal = {Neural Computing Surveys},
    volume = {2},
      year = {1999},
     pages = {129--162},
  abstract = {Hidden {Markov} Models ({HMM}s) are statistical models of sequential data that have been used successfully in many machine learning applications, especially for speech recognition. Furthermore? in the last few years, many new and promising probabilistic models related to {HMM}s have been proposed. We first summarize the basics of {HMM}s, arid then review several recent related learning algorithms and extensions of {HMM}s, including in particular hybrids of {HMM}s with artificial neural networks, Input-Output {HMM}s (which are conditional {HMM}s using neural networks to compute probabilities), weighted transducers, variable-length {Markov} models and {Markov} switching state-space models. Finally, we discuss some of the challenges of future research in this very active area.},
topics={Markov},cat={J},
}

@ARTICLE{Bengio99b,
    author = {Bengio, Samy and Bengio, Yoshua and Robert, Jacques and B{\'{e}}langer, Gilles},
     title = {Stochastic Learning of Strategic Equilibria for Auctions},
   journal = {Neural Computation},
    volume = {11},
    number = {5},
      year = {1999},
     pages = {1199--1209},
  abstract = {This paper presents a new application of stochastic adaptive learning algorithms to the computation of strategic equilibria in auctions. The proposed approach addresses the problems of tracking a moving target and balancing exploration (of action space) versus exploitation (of better modeled regions of action space). Neural networks are used to represent a stochastic decision model for each bidder. Experiments confirm the correctness and usefulness of the approach.},
topics={Auction},cat={J},
}

@TECHREPORT{bengio:1990,
       author = {Bengio, Yoshua},
        title = {Learning a Synaptic Learning Rule},
       number = {751},
         year = {1990},
  institution = {D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
topics={BioRules},cat={T},
}

@INPROCEEDINGS{bengio:1990:snowbird,
     author = {Bengio, Yoshua and R., De Mori},
      title = {Recurrent networks with Radial Basis Functions for speech recognition},
  booktitle = {1990 Neural Networks for Computing Conference},
       year = {1990},
topics={Speech},cat={C},
}

@INPROCEEDINGS{bengio:1991:ijcnn,
     author = {Bengio, Yoshua and Bengio, Samy and Cloutier, Jocelyn},
      title = {Learning a Synaptic Learning Rule},
  booktitle = {Proceedings of the International Joint Conference on Neural Networks},
       year = {1991},
      pages = {II--A969},
        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio_1991_ijcnn.ps},
   abstract = {This paper presents an original approach to neural modeling based on the idea of searching, with learning methods, for a synaptic learning rule which is biologically plausible, and yields networks that are able to learn to perform difficult tasks. The proposed method of automatically finding the learning rule relies on the idea of considering the synaptic modification rule as a parametric function. This function has local inputs and is the same in many neurons. The parameters that define this function can be estimated with known learning methods. For this optimization, we give particular attention to gradient descent and genetic algorithms. In both cases, estimation of this function consists of a joint global optimization of (a) the synaptic modification function, and (b) the networks that are learning to perform some tasks. The proposed methodology can be used as a tool to explore the missing pieces of the puzzle of neural networks learning. Both network architecture, and the learning function can be designed within constraints derived from biological knowledge.},
addressfr={Seattle, USA},topics={BioRules},cat={C},
}

@INPROCEEDINGS{bengio:1991:nnc,
     author = {Bengio, Yoshua and Bengio, Samy and Cloutier, Jocelyn},
      title = {Learning Synaptic Learning Rules},
  booktitle = {Neural Networks for Computing},
       year = {1991},
addressfr={Snowbird, Utah, USA},topics={BioRules},cat={C},
}

@INPROCEEDINGS{bengio:1991:snowbird,
     author = {Bengio, Yoshua and Bengio, Samy and Cloutier, Jocelyn},
      title = {Learning a Synaptic Learning Rule},
  booktitle = {1991 Neural Networks for Computing Conference},
       year = {1991},
topics={BioRules},cat={C},
}

@INPROCEEDINGS{bengio:1992:nn,
     author = {Bengio, Samy and Bengio, Yoshua and Cloutier, Jocelyn and Gecsei, Jan},
      title = {Aspects th{\'{e}}oriques de l'optimisation d'une r{\`{e}}gle d'apprentissage},
  booktitle = {Actes de la conf{\'{e}}rence Neuro-N{\^{\i}}mes 1992},
       year = {1992},
        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio_1992_nn.ps},
   abstract = {Ayant expos{\'{e}} dans de pr{\'{e}}c{\'{e}}dentes publications (voir [Beng90, Beng92] notamment) l’id{\'{e}}e que l’on pouvait optimiser des r{\`{e}}gles d’apprentissage param{\'{e}}triques pour r{\'{e}}seaux de neurones, nous montrons dans cet article comment d{\'{e}}velopper, par la m{\'{e}}thode du Lagrangien, le gradient n{\'{e}}cessaire {\`{a}} l’optimisation d’une r{\`{e}}gle d’apprentissage par descente du gradient. Nous pr{\'{e}}sentons aussi les bases th{\'{e}}oriques qui permettent d’{\'{e}}tudier la g{\'{e}}n{\'{e}}ralisation {\`{a}} de nouvelles t{\^{a}}ches d’une r{\`{e}}gle d’apprentissage dont les param{\`{e}}tres ont {\'{e}}t{\'{e}} estim{\'{e}}s {\`{a}} partir d’un certain ensemble de t{\^{a}}ches. Enfin, nous exposons bri{\`{e}}vement les r{\'{e}}sultats d’une exp{\'{e}}rience consistant {\`{a}} trouver, par descente du gradient, une r{\`{e}}gle d’apprentissage pouvant r{\'{e}}soudre plusieurs t{\^{a}}ches bool{\'{e}}ennes lin{\'{e}}airement et non lin{\'{e}}airement s{\'{e}}parables.},
addressfr={N{\^i}es, France},topics={BioRules},cat={C},
}

@INPROCEEDINGS{bengio:1992:oban,
     author = {Bengio, Samy and Bengio, Yoshua and Cloutier, Jocelyn and Gecsei, Jan},
      title = {On the Optimization of a Synaptic Learning rule},
  booktitle = {Conference on Optimality in Biological and Artificial Networks},
       year = {1992},
        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio_1992_oban.ps},
   abstract = {This paper presents a new approach to neural modeling based on the idea of using an automated method to optimize the parameters of a synaptic learning rule. The synaptic modification rule is considered as a parametric function. This function has local inputs and is the same in many neurons. We can use standard optimization methods to select appropriate parameters for a given type of task. We also present a theoretical analysis permitting to study the generalization property of such parametric learning rules. By generalization, we mean the possibility for the learning rule to learn to solve new tasks. Experiments were performed on three types of problems: a biologically inspired circuit (for conditioning in Aplysia). Boolean functions (linearly separable as well as non linearly separable) and classification tasks. The neural network architecture as well as the form and initial parameter values of the synaptic learning function can be designed using a priori knowledge.},
addressfr={Dallas, USA},topics={BioRules},cat={C},
}

@INPROCEEDINGS{bengio:1992:snowbird,
     author = {Bengio, Yoshua},
      title = {Representations Based on Articulatory Dynamics for Speech Recognition},
  booktitle = {1992 Neural Networks for Computing Conference},
       year = {1992},
topics={PriorKnowledge,Speech},cat={C},
}

@INPROCEEDINGS{bengio:1993:icann,
     author = {Bengio, Samy and Bengio, Yoshua and Cloutier, Jocelyn and Gecsei, Jan},
     editor = {Gielen, S. and Kappen, B.},
      title = {Generalization of a Parametric Learning Rule},
  booktitle = {{ICANN} '93: Proceedings of the International Conference on Artificial Neural Networks},
       year = {1993},
      pages = {502},
  publisher = {Springer-Verlag},
        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio_1993_icann.ps},
   abstract = {In previous work ([4,2,1])	 we discussed the subject of parametric learning rules for neural networks. In this article, we present a theoretical basis permitting to study the generalization property of a learning rule whose parameters are estimated from a set of learning tasks. By generalization, we mean the possibility of using the learning rule to learn solve new tasks. Finally, we describe simple experiments on two-dimensional categorization tasks and show how they corroborate the theoretical results.},
addressfr={Amsterdam, Pays-Bas},topics={BioRules},cat={C},
}

@INPROCEEDINGS{bengio:1993:snowbird,
     author = {Bengio, Yoshua and Simard, Patrice and Frasconi, Paolo},
      title = {The Problem of Learning Long-Term Dependencies in Recurrent Networks},
  booktitle = {1993 Neural Networks for Computing Conference},
       year = {1993},
topics={LongTerm},cat={C},
}

@TECHREPORT{bengio:1994,
       author = {Bengio, Yoshua and Frasconi, Paolo},
        title = {An {EM} Approach to Learning Sequential Behavior},
       number = {DSI 11-94},
         year = {1994},
  institution = {Universita di Firenze, Dipartimento di Sistemi e Informatica},
topics={LongTerm},cat={T},
}

@INPROCEEDINGS{bengio:1994:acfas,
     author = {Bengio, Samy and Bengio, Yoshua and Cloutier, Jocelyn and Gecsei, Jan},
      title = {Optimisation d'une r{\`{e}}gle d'apprentissage pour r{\'{e}}seaux de neurones artificiels},
  booktitle = {Actes du soixante-deuxi{\`{e}}me congr{\`{e}}s de l'Association Canadienne Fran{\c c}aise pour l'Avancement des Sciences, colloque sur l'apprentissage et les r{\'{e}}seaux de neurones artificiels},
       year = {1994},
topics={BioRules},cat={C},
}

@INPROCEEDINGS{bengio:1994:snowbird,
     author = {Bengio, Yoshua and Frasconi, Paolo},
      title = {An {EM} Algorithm for Target Propagation},
  booktitle = {1994 Neural Networks for Computing Conference},
       year = {1994},
topics={LongTerm},cat={C},
}

@INPROCEEDINGS{bengio:1994:wcci,
     author = {Bengio, Samy and Bengio, Yoshua and Cloutier, Jocelyn},
      title = {Use of Genetic Programming for the Search of a New Learning Rule for Neural Networks},
  booktitle = {Proceedings of the First Conference on Evolutionary Computation, {IEEE} World Congress on Computational Intelligence},
       year = {1994},
      pages = {324--327},
        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio_1994_wcci.ps},
   abstract = {In previous work ([1,2,3]), we explained how to use standard optimization methods such as simulated annealing, gradient descent and genetic algorithms to optimize a parametric function which could be used as a learning rule for neural networks. To use these methods, we had to choose a fixed number of parameters and a rigid form for the learning rule. In this article, we propose to use genetic programming to find not only the values of rule parameters but also the optimal number of parameters and the form of the rule. Experiments on classification tasks suggest genetic programming finds better learning rules than other optimization methods. Furthermore, the best rule found with genetic programming outperformed the well-known backpropagation algorithm for a given set of tasks.},
topics={BioRules},cat={C},
}

@INPROCEEDINGS{bengio:1994b:acfas,
     author = {Bengio, Yoshua and Frasconi, Paolo},
      title = {R{\'{e}}seaux de neurones {M}arkoviens pour l'inf{\'{e}}rence grammaticale},
  booktitle = {Actes du soixante-deuxi{\`{e}}me congr{\`{e}}s de l'Association Canadienne Fran{\c c}aise pour l'Avancement des Sciences, colloque sur l'apprentissage et les r{\'{e}}seaux de neurones artificiels},
       year = {1994},
topics={Markov,Language},cat={C},
}

@ARTICLE{bengio:1995:npl,
    author = {Bengio, Samy and Bengio, Yoshua and Cloutier, Jocelyn},
     title = {On the Search for New Learning Rules for {ANN}s},
   journal = {Neural Processing Letters},
    volume = {2},
    number = {4},
      year = {1995},
     pages = {26--30},
  abstract = {In this paper, we present a framework where a learning rule can be optimized within a parametric learning rule space. We define what we call parametric learning rules and present a theoretical study of their generalization properties when estimated from a set of learning tasks and tested over another set of tasks. We corroborate the results of this study with practical experiments.},
topics={BioRules},cat={J},
}

@INCOLLECTION{bengio:1995:oban,
     author = {Bengio, Samy and Bengio, Yoshua and Cloutier, Jocelyn and Gecsei, Jan},
     editor = {Levine, D. S. and Elsberry, W. R.},
      title = {{O}n the Optimization of a Synaptic Learning Rule},
  booktitle = {Optimality in Biological and Artificial Networks},
       year = {1995},
  publisher = {Lawrence Erlbaum},
        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio_1995_oban.pdf},
   abstract = {This paper presents a new approach to neural modeling based on the idea of using an automated method to optimize the parameters of a synaptic learning rule. The synaptic modification rule is considered as a parametric function. This function has local inputs and is the same in many neurons. We can use standard optimization methods to select appropriate parameters for a given type of task. We also present a theoretical analysis permitting to study the generalization property of such parametric learning rules. By generalization, we mean the possibility for the learning rule to learn to solve new tasks. Experiments were performed on three types of problems: a biologically inspired circuit (for conditioning in Aplysia), Boolean functions (linearly separable as well as non linearly separable) and classification tasks. The neural network architecture as well as the form and initial parameter values of the synaptic learning function can be designed using a priori knowledge.},
topics={BioRules},cat={B},
}

@TECHREPORT{bengio:1996:udem,
       author = {Bengio, Yoshua and Bengio, Samy},
        title = {Training Asynchronous Input/Output Hidden {M}arkov Models},
       number = {1013},
         year = {1996},
  institution = {D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle, Universit{\'{e}}de Montr{\'{e}}al},
          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio_1996_udem.ps},
topics={Markov},cat={T},
}

@INPROCEEDINGS{bengio:1997:snowbird,
     author = {Bengio, Yoshua and Bengio, Samy and Singer, Yoram and Isabelle, Jean-Fran{\c c}ois},
      title = {On the Clusterization of Probabilistic Transducers},
  booktitle = {1997 Neural Networks for Computing Conference},
       year = {1997},
topics={HighDimensional},cat={C},
}

@INPROCEEDINGS{bengio:1998:snowbird,
     author = {Bengio, Samy and Bengio, Yoshua and Robert, Jacques and B{\'{e}}langer, Gilles},
      title = {Stochastic Learning of Strategic Equilibria for Auctions},
  booktitle = {Learning Conference},
       year = {1998},
topics={Auction},cat={C},
}

@TECHREPORT{bengio:1998:udem,
       author = {Bengio, Samy and Bengio, Yoshua and Robert, Jacques and B{\'{e}}langer, Gilles},
        title = {Stochastic Learning of Strategic Equilibria for Auctions},
       number = {1119},
         year = {1998},
  institution = {D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle, Universit{\'{e}}de Montr{\'{e}}al},
          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio_1998_udem.pdf},
     abstract = {This paper presents a new application of stochastic adaptive learning algorithms to the computation of strategic equilibria in auctions. The proposed approach addresses the problems of tracking a moving target and balancing exploration (of action space) versus exploitation (of better modeled regions of action space). Neural networks are used to represent a stochastic decision model for each bidder. Experiments confirm the correctness and usefulness of the approach.},
topics={Auction},cat={T},
}

@INPROCEEDINGS{bengio:1999:snowbird,
     author = {Bengio, Yoshua and Latendresse, Simon and Dugas, Charles},
      title = {Gradient-Based Learning of Hyper-Parameters},
  booktitle = {Learning Conference},
       year = {1999},
topics={ModelSelection},cat={C},
}

@INPROCEEDINGS{bengio:1999:titration,
     author = {Bengio, Yoshua and Brault, J-J. and Major, Fran{\c c}ois and Neal, R. and Pigeon, Steven},
      title = {Learning Algorithms for Sorting Compounds from Titration Curves},
  booktitle = {Symposium on New Perspectives for Computer-Aided Drug Design},
       year = {1999},
topics={Speech},cat={C},
}

@ARTICLE{bengio:2000:ieeetrnn,
    author = {Bengio, Samy and Bengio, Yoshua},
     title = {Taking on the Curse of Dimensionality in Joint Distributions Using Neural Networks},
   journal = {IEEE Transaction on Neural Networks special issue on data mining and knowledge discovery},
    volume = {11},
    number = {3},
      year = {2000},
     pages = {550--557},
  abstract = {The curse of dimensionality is severe when modeling high-dimensional discrete data: the number of possible combinations of the variables explodes exponentially. In this paper we propose a new architecture for modeling high-dimensional data that requires resources (parameters and computations) that grow at most as the square of the number of variables, using a multi_layer neural network to represent the joint distribution of the variables as the product of conditional distributions. The neural network can be interpreted as a graphical model without hidden random variables, but in which the conditional distributions are tied through the hidden units. The connectivity of the neural network can be pruned by using dependency tests between the variables (thus reducing significantly the number of parameters). Experiments on modeling the distribution of several discrete data sets show statistically significant improvements over other methods such as naive Bayes and comparable Bayesian networks, and show that significant improvements can be obtained by pruning the network.},
topics={HighDimensional,Unsupervised,Mining},cat={J},
}

@INPROCEEDINGS{bengio:2000:nips,
    author = {Bengio, Yoshua and Bengio, Samy},
     title = {Modeling High-Dimensional Discrete Data with Multi-Layer Neural Networks},
      year = {2000},
     pages = {400--406},
  crossref = {NIPS12-shorter},
  abstract = {The curse of dimensionality is severe when modeling high-dimensional discrete data: the number of possible combinations of the variables explodes exponentially. In this paper we propose a new architecture for modeling high-dimensional data that requires resources (parameters and computations) that grow only at most as the square of the number of variables, using a multi-layer neural network to represent the joint distribution of the variables as the product of conditional distributions. The neural network can be interpreted as a graphical model without hidden random variables, but in which the conditional distributions are tied through the hidden units. The connectivity of the neural network can be pruned by using dependency tests between the variables. Experiments on modeling the distribution of several discrete data sets show statistically significant improvements over other methods such as naive Bayes and comparable Bayesian networks, and show that significant improvements can be obtained by pruning the network.},
topics={HighDimensional,Unsupervised},cat={C},
}

@ARTICLE{bengio:2003,
    author = {Bengio, Yoshua and Ducharme, R{\'{e}}jean and Vincent, Pascal and Jauvin, Christian},
     title = {A Neural Probabilistic Language Model},
    volume = {3},
      year = {2003},
     pages = {1137--1155},
  crossref = {JMLR-shorter},
  abstract = {A goal of statistical language modeling is to learn the joint probability function of sequences of words in a language. This is intrinsically difficult because of the curse of dimensionality: a word sequence on which the model will be tested is likely to be different from all the word sequences seen during training. Traditional but very successful approaches based on n-grams obtain generalization by concatenating very short overlapping sequences seen in the training set. We propose to fight the curse of dimensionality by learning a distributed representation for words which allows each training sentence to inform the model about an exponential number of semantically neighboring sentences. The model learns simultaneously (1) a distributed representation for each word along with (2) the probability function for word sequences, expressed in terms of these representations. Generalization is obtained because a sequence of words that has never been seen before gets high probability if it is made of words that are similar (in the sense of having a nearby representation) to words forming an already seen sentence. Training such large models (with millions of parameters) within a reasonable time is itself a significant challenge. We report on experiments using neural networks for the probability function, showing on two text corpora that the proposed approach significantly improves on state-of-the-art n-gram models, and that the proposed approach allows to take advantage of longer contexts.},
topics={Markov,Unsupervised,Language},cat={J},
}

@TECHREPORT{bengio:socs-1990,
       author = {Bengio, Yoshua and De Mori, Renato and Flammia, Giovanni and Kompe, Ralf},
        title = {Global Optimization of a Neural Network - Hidden {M}arkov Model Hybrid},
       number = {TR-SOCS-90.22},
         year = {1990},
  institution = {School of Computer Science, McGill University},
topics={Markov},cat={T},
}

@INPROCEEDINGS{bengioc:1994:acfas,
     author = {Bengio, Yoshua and {LeCun}, Yann},
      title = {Reconnaissance de mots manuscrits avec r{\'{e}}seaux de neurones et mod{\`{e}}les de {M}arkov},
  booktitle = {Actes du soixante-deuxi{\`{e}}me congr{\`{e}}s de l'Association Canadienne Fran{\c c}aise pour l'Avancement des Sciences, colloque sur l'apprentissage et les r{\'{e}}seaux de neurones artificiels},
       year = {1994},
topics={Markov,Speech},cat={C},
}

@TECHREPORT{Bengio_Bottou92,
       author = {Bengio, Yoshua and Bottou, {L{\'{e}}on}},
        title = {A New Approach to Estimating Probability Density Functions with Artificial Neural Networks},
       number = {TR-92.02},
         year = {1992},
  institution = {Massachusetts Institute of Technology, Dept. Brain and Cognitive Sciences},
topics={HighDimensional},cat={T},
}

@INCOLLECTION{bengio_extension_nips_2003,
    author = {Bengio, Yoshua and Paiement, Jean-Fran{\c c}ois and Vincent, Pascal and Delalleau, Olivier and Le Roux, Nicolas and Ouimet, Marie},
  keywords = {dimensionality reduction, eigenfunctions learning, Isomap, kernel {PCA}, locally linear embedding, Nystrom formula, spectral methods},
     title = {Out-of-Sample Extensions for {LLE}, Isomap, {MDS}, Eigenmaps, and Spectral Clustering},
      year = {2004},
  crossref = {NIPS16-shorter},
  abstract = {Several unsupervised learning algorithms based on an eigendecomposition provide either an embedding or a clustering only for given training points, with no straightforward extension for out-of-sample examples short of recomputing eigenvectors. This paper provides a unified framework for extending Local Linear Embedding ({LLE}), Isomap, Laplacian Eigenmaps, Multi-Dimensional Scaling (for dimensionality reduction) as well as for Spectral Clustering. This framework is based on seeing these algorithms as learning eigenfunctions of a data-dependent kernel. Numerical experiments show that the generalizations performed have a level of error comparable to the variability of the embedding algorithms due to the choice of training data.},
topics={HighDimensional,Kernel,Unsupervised},cat={C},
}

@ARTICLE{Bengio_Gingras98a,
    author = {Bengio, Yoshua and Gingras, Fran{\c c}ois and Goulard, Bernard and Lina, Jean-Marc},
     title = {Gaussian Mixture Densities for Classification of Nuclear Power Plant Data},
   journal = {Computers and Artificial Intelligence},
    volume = {17},
    number = {2-3},
      year = {1998},
     pages = {189--209},
  abstract = {In this paper we are concerned with the application of learning algorithms to the classification of reactor states in nuclear plants. Two aspects must be considered, (1) some types of events (e.g., abnormal or rare) will not appear in the data set, but the system should be able to detect them, (2) not only classification of signals but also their interpretation are important for nuclear plant monitoring. We address both issues with a mixture of mixtures of Gaussians in which some parameters are shared to reflect the similar signals observed in different states of the reactor. An {EM} algorithm for these shared Gaussian mixtures is presented. Experimental results on nuclear plant data demonstrate the advantages of the proposed approach with respect to the above two points.},
topics={Mining},cat={J},
}

@ARTICLE{Bengio_Gingras98b,
    author = {Gingras, Fran{\c c}ois and Bengio, Yoshua},
     title = {Handling Asynchronous or Missing Financial Data with Recurrent Networks},
   journal = {International Journal of Computational Intelligence and Organizations},
    volume = {1},
    number = {3},
      year = {1998},
     pages = {154--163},
  abstract = {An important issue with many sequential data analysis problems, such as those encountered in financial data sets, is that different variables are known at different frequencies, at different times (asynchronicity), or are sometimes missing. To address this issue we propose to use recurrent networks with feedback into the input units, based on two fundamental ideas. The first motivation is that the “filled-in” value of the missing variable may not only depend in complicated ways on the value of this variable in the past of the sequence but also on the current and past values of other variables. The second motivation is that, for the purpose of making predictions or taking decisions, it is not always necessary to fill in the best possible value of the missing variables. In fact, it is sufficient to fill in a value which helps the system make better predictions or decisions. The advantages of this approach are demonstrated through experiments on several tasks.},
topics={Finance,Missing},cat={J},
}

@INPROCEEDINGS{Bengio_icassp90,
     author = {Bengio, Yoshua and Cardin, Regis and De Mori, Renato and Normandin, Yves},
      title = {A Hybrid Coder for Hidden {M}arkov Models Using a Recurrent Neural Network},
  booktitle = {International Conference on Acoustics, Speech and Signal Processing},
       year = {1990},
      pages = {537--540},
topics={Markov,Speech},cat={C},
}

@INPROCEEDINGS{Bengio_LeCun94,
    author = {Bengio, Yoshua and {LeCun}, Yann and Henderson, Donnie},
     title = {Globally Trained Handwritten Word Recognizer using Spatial Representation, Space Displacement Neural Networks and Hidden {M}arkov Models},
      year = {1994},
     pages = {937--944},
  crossref = {NIPS6-shorter},
  abstract = {We introduce a new approach for on-line recognition of handwritten words written in unconstrained mixed style. The preprocessor performs a word-level normalization by fitting a model of the word structure using the {EM} algorithm. Words are then coded into low resolution “annotated images” where each pixel contains information about trajectory direction and curvature. The recognizer is a convolution network which can be spatially replicated. From the network output, a hidden {Markov} model produces word scores. The entire system is globally trained to minimize word-level errors.},
topics={Speech},cat={C},
}

@ARTICLE{Bengio_LeCun95,
    author = {Bengio, Yoshua and {LeCun}, Yann and Nohl, Craig and Burges, Chris},
     title = {LeRec: A {NN}/{HMM} Hybrid for On-Line Handwriting Recognition},
   journal = {Neural Computation},
    volume = {7},
    number = {6},
      year = {1995},
     pages = {1289--1303},
  abstract = {We introduce a new approach for on-line recognition of handwritten words written in unconstrained mixed style. The preprocessor performs a word-level normalization by fitting a model of the word structure using the {EM} algorithm.  Words are then coded into low resolution “annotated images” where each pixel contains information about trajectory direction and curvature. The recognizer is a convolution network which can be spatially replicated. From the network output, a hidden {Markov} model produces word scores. The entire system is globally trained to minimize word-level errors.},
topics={PriorKnowledge,Speech},cat={J},
}

@ARTICLE{Bengio_prel92,
    author = {Bengio, Yoshua and Gori, Marco and De Mori, Renato},
     title = {Learning the Dynamic Nature of Speech with Back-propagation for Sequences},
   journal = {Pattern Recognition Letters},
    volume = {13},
    number = {5},
      year = {1992},
     pages = {375--385},
      note = {(Special issue on Artificial Neural Networks)},
topics={Speech},cat={J},
}

@ARTICLE{Bengio_trnn92,
    author = {Bengio, Yoshua and De Mori, Renato and Flammia, Giovanni and Kompe, Ralf},
     title = {Global Optimization of a Neural Network-Hidden {M}arkov Model Hybrid},
   journal = {IEEE Transactions on Neural Networks},
    volume = {3},
    number = {2},
      year = {1992},
     pages = {252--259},
topics={Markov},cat={J},
}

@TECHREPORT{Bergstra+2009,
       author = {Bergstra, James and Desjardins, Guillaume and Lamblin, Pascal and Bengio, Yoshua},
        title = {Quadratic Polynomials Learn Better Image Features},
       number = {1337},
         year = {2009},
  institution = {D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
     abstract = {The affine-sigmoidal hidden unit (of the form $\sigma(ax+b)$) 
    is a crude predictor of neuron response in visual area V1.
    More descriptive models of V1 have been advanced that are no more computationally expensive,
    yet artificial neural network research continues to focus on networks of affine-sigmoidal models.
    This paper identifies two qualitative differences between the affine-sigmoidal hidden unit
    and a particular recent model of V1 response:
    a) the presence of a low-rank quadratic term in the argument to $\sigma$, 
    and b) the use of a gentler non-linearity than the $\tanh$ or logistic sigmoid.
    We evaluate these model ingredients by training single-layer 
    neural networks to solve three image classification tasks.
    We experimented with fully-connected hidden units,
    as well as locally-connected units and convolutional units
    that more closely mimic the function and connectivity of the visual system.
    On all three tasks, both the quadratic interactions and the gentler non-linearity
    lead to significantly better generalization.
    The advantage of quadratic units was strongest in conjunction with sparse and convolutional hidden units.}
}

@ARTICLE{Bergstra+al-2010,
    author = {Bergstra, James and Bengio, Yoshua and Louradour, Jerome},
     title = {Suitability of V1 Energy Models for Object Classification},
   journal = {Neural Computation},
      year = {2010},
      note = {to appear}
}

@INPROCEEDINGS{bergstra+al:2010-scipy,
     author = {Bergstra, James and Breuleux, Olivier and Bastien, Fr{\'{e}}d{\'{e}}ric and Lamblin, Pascal and Pascanu, Razvan and Desjardins, Guillaume and Turian, Joseph and Bengio, Yoshua},
      title = {Theano: a {CPU} and {GPU} Math Expression Compiler},
  booktitle = {Proceedings of the Python for Scientific Computing Conference ({SciPy})},
       year = {2010},
       note = {Oral}
}

@MISC{bergstra+al:2010-sharcnet,
        author = {Bergstra, James and Bengio, Yoshua},
         title = {{GPU} Programming with Theano},
          year = {2010},
  howpublished = {{SHARCNET} Research Day},
          note = {Oral}
}

@MISC{bergstra+al:2010snowbird,
     author = {Bergstra, James and Breuleux, Olivier and Bastien, Fr{\'{e}}d{\'{e}}ric and Lamblin, Pascal and Turian, Joseph and Desjardins, Guillaume and Pascanu, Razvan and Erhan, Dumitru and Delalleau, Olivier and Bengio, Yoshua},
      title = {Deep Learning on {GPU}s with Theano},
  booktitle = {The Learning Workshop},
       year = {2010},
       note = {Oral}
}

@INPROCEEDINGS{Bergstra+Bengio-2009,
     author = {Bergstra, James and Bengio, Yoshua},
      title = {Slow, Decorrelated Features for Pretraining Complex Cell-like Networks},
       year = {2009},
      pages = {99--107},
  publisher = {MIT Press},
        url = {http://books.nips.cc/papers/files/nips22/NIPS2009_0933.pdf},
   crossref = {NIPS22}
}

@ARTICLE{bergstra+casagrande+erhan+eck+kegl:2006,
    author = {Bergstra, James and Casagrande, Norman and Erhan, Dumitru and Eck, Douglas and K{\'{e}}gl, Bal{\'{a}}zs},
     title = {Aggregate Features and AdaBoost for Music Classification},
   journal = {Machine Learning},
    volume = {65},
      year = {2006},
     pages = {473--484},
      issn = {0885-6125},
  abstract = {We present an algorithm that predicts musical genre and artist from an audio waveform. Our method uses the ensemble learner ADABOOST to select from a set of audio features that have been extracted from segmented audio and then aggregated. Our classifier proved to be the most effective method for genre classification at the recent MIREX 2005 international contests in music information extraction, and the second-best method for recognizing artists. This paper describes our method in detail, from feature extraction to song classification, and presents an evaluation of our method on three genre databases and two artist-recognition databases. Furthermore, we present evidence collected from a variety of popular features and classifiers that the technique of classifying features aggregated over segments of audio is better than classifying either entire songs or individual short-timescale features.},
PDF = {papers/2006_ml_draft.pdf},
  SOURCE = {OwnPublication},
}

@INPROCEEDINGS{bergstra+lacoste+eck:2006,
     author = {Bergstra, James and Lacoste, Alexandre and Eck, Douglas},
      title = {Predicting Genre Labels for Artists using FreeDB},
  booktitle = {Proc. 7th International Conference on Music Information Retrieval ({ISMIR})},
       year = {2006},
      pages = {85--88},
  publisher = {University of Victoria},
SOURCE = {OwnPublication},
  PDF = {papers/2006_ismir_freedb.pdf},
}

@INPROCEEDINGS{bergstra+mandel+eck:2010,
     author = {Bergstra, James and Mandel, Michael and Eck, Douglas},
      title = {Scalable Genre and Tag Prediction with Spectral Covariance},
  booktitle = {{ISMIR}},
       year = {2010},
      pages = {507--512},
}

@MASTERSTHESIS{Bergstra-Msc-2006,
    author = {Bergstra, James},
  keywords = {apprentissage statistique, classification de musique par genre, extraction de caract{\'{e}}ristiques sonores, recherche d'information musicale},
     title = {Algorithms for Classifying Recorded Music by Genre},
      year = {2006},
    school = {Universit{\'{e}} de Montreal},
  abstract = {Ce m{\'{e}}moire traite le probl{\`{e}}me de la classification automatique de signaux musicaux par genre. Dans un premier temps, je pr{\'{e}}sente une technique utilisant l'apprentissage machine pour classifier des statistiques extraites sur des segments du signal sonore. Malgr{\'{e}} le fait que cette technique a d{\'{e}}j{\`{a}} {\'{e}}t{\'{e}} explor{\'{e}}e, mon m{\'{e}}moire est le premier {\`{a}} investiguer l'influence de la longueur et de la quantit{\'{e}} de ces segments sur le taux de classification. J'explore {\'{e}}galement l'importance d'avoir des segments contigus dans le temps. Les segments d'une {\`{a}} trois secondes apportent une meilleure performance, mais pour ce faire, ils doivent {\^{e}}tre suffisamment nombreux. Il peut m{\^{e}}me {\^{e}}tre utile d'augmenter la quantit{\'{e}} de segments jusqu'{\`{a}} ce qu'ils se chevauchent. Dans les m{\^{e}}mes exp{\'{e}}riences, je pr{\'{e}}sente une formulation alternative des descripteurs d'audio nomm{\'{e}}e Melfrequency Cepstral Coefficient (MFCC) qui am{\`{e}}ne un taux de classification de 81 \% sur un jeux de donn{\'{e}}es pour lequel la meilleure performance publi{\'{e}}e est de 71 \%. Cette m{\'{e}}thode de segmentation des chansons, ainsi que cette formulation alternative, ont pour but d'am{\'{e}}liorer l'algorithme gagnant du concours de classification de genre de MIREX 2005, d{\'{e}}velopp{\'{e}} par Norman Casagrande et moi. Ces exp{\'{e}}riences sont un approfondissement du travail entam{\'{e}} par Bergstra et al. [2006a], qui d{\'{e}}crit l'algorithme gagnant de ce concours.
Dans un deuxi{\`{e}}me temps, je pr{\'{e}}sent une m{\'{e}}thode qui utilise FreeDB, une base de donn{\'{e}}es d'information sur les albums, pour attribuer {\`{a}} un artiste une distribution de probabilit{\'{e}} sur son genre. Avec une petite base de donn{\'{e}}es, faite {\`{a}} la main, je montre qu'il y a une haute corr{\'{e}}lation entre cette distribution et l'{\'{e}}tiquette de genre traditionnel. Bien qu'il reste {\`{a}} d{\'{e}}montrer que cette m{\'{e}}thode est utile pour organiser une collection de musique, ce r{\'{e}}sultat sugg{\`{e}}re qu'on peut maintenant {\'{e}}tiqueter de grandes bases de musique automatiquement {\`{a}} un faible co{\^{u}}t, et par cons{\'{e}}quent de poursuivre plus facilement la recherche en classification {\`{a}} grande {\'{e}}chelle. Ce travail sera publi{\'{e}} comme Bergstra et al. [2006b] {\`{a}} ISMIR 2006.}
}

@INPROCEEDINGS{bergstra:2010cosyne,
     author = {Bergstra, James and Bengio, Yoshua and Lamblin, Pascal and Desjardins, Guillaume and Louradour, Jerome},
      title = {Image classification with complex cell neural networks},
  booktitle = {Computational and systems neuroscience (COSYNE)},
       year = {2010},
       note = {Poster},
        url = {http://www.frontiersin.org/conferences/individual_abstract_listing.php?conferid=770&pap=3626&ind_abs=1&pg=335},
        doi = {10.3389/conf.fnins.2010.03.00334}
}

@INPROCEEDINGS{biaslearn:2000:ijcnn,
     author = {Ghosn, Joumana and Bengio, Yoshua},
      title = {Bias Learning, Knowledge Sharing},
  booktitle = {International Joint Conference on Neural Networks 2000},
     volume = {I},
       year = {2000},
      pages = {9--14},
        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/ijcnn_manifold.pdf},
   abstract = {Biasing the hypothesis space of a learner has been shown to improve generalisation performances. Methods for achieving this goal have been proposed, that range from deriving and introducing a bias into a learner to automatically learning the bias. In the latter case, most methods learn the bias by simultaneously training several related tasks derived from the same domain and imposing constraints on their parameters. We extend some of the ideas presented in this field and describe a new model that parameterizes the parameters of each task as a function of an affine manifold defined in parameter space and a point lying on the manifold. An analysis of variance on a class of learning tasks is performed that shows some significantly improved performances when using the model.},
topics={MultiTask},cat={C},
}

@ARTICLE{biaslearn:2003:tnn,
    author = {Ghosn, Joumana and Bengio, Yoshua},
     title = {Bias Learning, Knowledge Sharing},
   journal = {IEEE Transaction on Neural Networks},
    volume = {14},
    number = {4},
      year = {2003},
     pages = {748--765},
  abstract = {Biasing properly the hypothesis space of a learner has been shown to improve generalization performance. Methods for achieving this goal have been proposed, that range from designing and introducing a bias into a learner to automatically learning the bias. Multitask learning methods fall into the latter category. When several related tasks derived from the same domain are available, these methods use the domain-related knowledge coded in the training examples of all the tasks as a source of bias. We extend some of the ideas presented in this field and describe a new approach that identifies a family of hypotheses, represented by a manifold in hypothesis space, that embodies domain-related knowledge. This family is learned using training examples sampled from a group of related tasks. Learning models trained on these tasks are only allowed to select hypotheses that belong to the family. We show that the new approach encompasses a large variety of families which can be learned. A statistical analysis on a class of related tasks is performed that shows significantly improved performances when using this approach.},
topics={MultiTask},cat={J},
}

@MASTERSTHESIS{Boisvert-Mcs-2005,
    author = {Boisvert, Maryse},
  keywords = {Algorithme {EM} , D{\'{e}}composition en valeurs singuli{\`{e}}res , D{\'{e}}sambigu{\"{\i}}sation s{\'{e}}mantique , Mod{\`{e}}les graphiques, WordNet },
     title = {R{\'{e}}duction de dimension pour mod{\`{e}}les graphiques probabilistes appliqu{\'{e}}s {\`{a}} la d{\'{e}}sambiguisation s{\'{e}}mantique},
      year = {2005},
    school = {Universit{\'{e}} de Montr{\'{e}}al}
}

@INPROCEEDINGS{bonneville98,
     author = {Bonneville, Martin and Meunier, Jean and Bengio, Yoshua and Soucy, Jean-Paul},
      title = {Support Vector Machines for Improving the classification of Brain Pet Images},
  booktitle = {SPIE Medical Imaging},
       year = {1998},
topics={Kernel},cat={C},
}

@INPROCEEDINGS{Bottou+Bengio95,
    author = {Bottou, {L{\'{e}}on} and Bengio, Yoshua},
     title = {Convergence Properties of the {K}-Means Algorithm},
      year = {1995},
     pages = {585--592},
  crossref = {NIPS7-shorter},
  abstract = {This paper studies the convergence properties of the well known K-Means clustering algorithm. The K-Means algorithm can be described either as a gradient descent algorithm or by slightly extending the mathematics of the {EM} algorithm to this hard threshold case. We show that the K-Means algorithm actually minimizes the quantization error using the very fast Newton algorithm.},
topics={Unsupervised},cat={C},
}

@ARTICLE{bottou-98,
    author = {Bottou, {L{\'{e}}on} and Haffner, Patrick and G. Howard, Paul and Simard, Patrice and Bengio, Yoshua and {LeCun}, Yann},
     title = {High Quality Document Image Compression with {DjVu}},
   journal = {Journal of Electronic Imaging},
    volume = {7},
    number = {3},
      year = {1998},
     pages = {410--425},
topics={Compression},cat={J},
}

@INPROCEEDINGS{Bottou-dcc98,
     author = {Bottou, {L{\'{e}}on} and G. Howard, Paul and Bengio, Yoshua},
     editor = {Society, {IEEE} Computer},
      title = {The Z-Coder Adaptive Binary Coder},
  booktitle = {Data Compression Conference},
       year = {1998},
        url = {http://leon.bottou.org/papers/bottou-howard-bengio-98},
topics={Compression},cat={C},
}

@INPROCEEDINGS{bottou-lecun-bengio-97,
     author = {Bottou, {L{\'{e}}on} and {LeCun}, Yann and Bengio, Yoshua},
      title = {Global Training of Document Processing Systems using Graph Transformer Networks},
  booktitle = {Proc. of Computer Vision and Pattern Recognition},
       year = {1997},
      pages = {490--494},
  publisher = {IEEE},
        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bottou-lecun-bengio-97.pdf},
topics={PriorKnowledge,Speech},cat={C},
}

@TECHREPORT{bottou96TR,
       author = {Bottou, {L{\'{e}}on} and Bengio, Yoshua and {LeCun}, Yann},
        title = {Document analysis with transducers},
       number = {Technical Memorandum HA615600-960701-01TM},
         year = {1996},
  institution = {AT\&T Labs},
          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/transducer-tm.ps.gz},
topics={HighDimensional},cat={T},
}

@TECHREPORT{bottou97TR,
       author = {Bottou, {L{\'{e}}on} and Bengio, Yoshua and G. Howard, Paul},
        title = {Z-Coder: A Fast Adaptive Binary Arithmetic Coder},
       number = {Technical Memorandum HA615600-970721-02TM},
         year = {1997},
  institution = {AT\&T Labs},
          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/zcoder-tm.ps.gz},
topics={Compression},cat={T},
}

@MASTERSTHESIS{Bouchard-Msc-2007,
    author = {Bouchard, Lysiane},
  keywords = {auditory cortex, fMRI, linear classifier, logistic regression, na{\"{\i}}ve bayesian gaussian model, neuroimaging, spectro-temporal modulation, support vectors machine},
     title = {Analyse par apprentissage automatique des r{\'{e}}ponses fMRI du cortex auditif {\`{a}} des modulations spectro-temporelles.},
      year = {2009},
    school = {Universit{\'{e}} de Montr{\'{e}}al},
  abstract = {The application of linear machine learning classifiers to the analysis of brain imaging data (fMRI) has led to several interesting breakthroughs in recent years. These classifiers combine the responses of the voxels to detect and categorize different brain states. They allow a more agnostic analysis than conventional fMRI analysis that systematically treats weak and distributed patterns as unwanted noise. In this project, we use such classifiers to validate an hypothesis concerning the encoding of sounds in the human brain. More precisely, we attempt to locate neurons tuned to spectral and temporal modulations in sound. We use fMRI recordings of brain responses of subjects listening to 49 different spectro-temporal modulations. The analysis of fMRI data through linear classifiers is not yet a standard procedure in this field. Thus, an important objective of this project, in the long term, is the development of new machine learning algorithms specialized for neuroimaging data. For these reasons, an important part of the experiments is dedicated to studying the behaviour of the classifiers. We are mainly interested in 3 standard linear classifiers, namely the support vectors machine algorithm (linear), the logistic regression algorithm (regularized) and the na{\"{\i}}ve bayesian gaussian model (shared variances).}
}

@PHDTHESIS{Boufaden-Phd-2005,
    author = {Boufaden, Narj{\`{e}}s},
     title = {Extraction d’information {\`{a}} partir de transcriptions de conversations t{\'{e}}l{\'{e}}phoniques sp{\'{e}}cialis{\'{e}}es},
      year = {2005},
    school = {Universit{\'{e}} de Montr{\'{e}}al, D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnel}
}

@TECHREPORT{Breuleux+al-TR-2010,
       author = {Breuleux, Olivier and Bengio, Yoshua and Vincent, Pascal},
        title = {Unlearning for Better Mixing},
       number = {1349},
         year = {2010},
  institution = {Universit{\'{e}} de Montr{\'{e}}al/DIRO},
     abstract = {Two learning algorithms were recently proposed – Herding and Fast Persistent Contrastive Divergence (FPCD) – which share the following interesting characteristic: they exploit changes in the model parameters while sampling in order to escape modes and mix better, during the sampling process that is part of the learning algorithm. We first justify such approaches as ways to escape modes while approximately keeping the same asymptotic distribution of the {Markov} chain. We then extend FPCD using an idea borrowed from Herding in order to obtain a pure sampling algorithm and show empirically that this FPCD-sampler yields substantially better samples than Gibbs sampling. Because these algorithms entangle the model and the sampling algorithm and we want to evaluate both (but particularly how well the sampling schemes mix), it is not always easy to evaluate them, so we propose a “black-box” approach based on how well and how quickly the samples generated by a model “cover” the test set examples. We empirically study these algorithms and variations with this perspective and these new evaluation tools in order to better understand their strengths and limitations.}
}

@INPROCEEDINGS{Carreau+Bengio-2007,
     author = {Carreau, Julie and Bengio, Yoshua},
      title = {A Hybrid {Pareto} Model for Conditional Density Estimation of Asymmetric Fat-Tail Data},
  booktitle = {Proceedings of the Eleventh International Conference on Artificial Intelligence and Statistics (AISTATS'07)},
       year = {2007},
  publisher = {Omnipress},
   abstract = {We propose an estimator for the conditional density p(Y|X) that can adapt for asymmetric heavy tails which might depend on X. Such estimators have important applications in finance and insurance. We draw from Extreme Value Theory the tools to build a hybrid unimodal density having a parameter controlling the heaviness of the upper tail. This hybrid is a Gaussian whose upper tail has been replaced by a generalized {Pareto} tail.  We use this hybrid in a multi-modal mixture in order to obtain a nonparametric density estimator that can easily adapt for heavy tailed data. To obtain a conditional density estimator, the parameters of the mixture estimator can be seen as
functions of X and these functions learned. We show experimentally that this approach better models the conditional density in terms of likelihood than compared competing algorithms : conditional mixture models with other types of components and multivariate nonparametric models.},
date={21-24}
}

@ARTICLE{Carreau+Bengio-2009,
    author = {Carreau, Julie and Bengio, Yoshua},
     title = {A Hybrid {Pareto} Mixture for Conditional Asymmetric Fat-Tailed Distribution},
   journal = {IEEE Transactions on Neural Networks},
    volume = {20},
    number = {7},
      year = {2009},
     pages = {1087--1101},
      issn = {1045-9227},
  abstract = {In many cases, we observe some variables X that contain predictive information over a scalar variable of interest Y, with (X,Y) pairs observed in a training set. We can take advantage of this information to estimate the conditional density P(Y\X = x). In this paper, we propose a conditional mixture model with hybrid {Pareto} components to estimate P(Y\X = x).The hybrid {Pareto} is a Gaussian whose upper tail has been replaced by a generalized {Pareto} tail. A third parameter, in addition to the location and spread parameters of the Gaussian, controls the heaviness of the upper tail. Using the hybrid {Pareto} in a mixture model results in a nonparametric estimator that can adapt to multimodality, asymmetry, and heavy tails. A conditional density estimator is built by modeling the parameters of the mixture estimator as functions of X. We use a neural network to implement these functions. Such conditional density estimators have important applications in many domains such as finance and insurance. We show experimentally that this novel approach better models the conditional density in terms of likelihood, compared to competing algorithms: conditional mixture models with other types of components and a classical kernel-based nonparametric model.}
}

@ARTICLE{Carreau+Bengio-extreme-2009,
    author = {Carreau, Julie and Bengio, Yoshua},
     title = {A Hybrid {Pareto} Model for Asymmetric Fat-Tailed Data: the univariate case},
   journal = {Extremes},
    volume = {12},
    number = {1},
      year = {2009},
     pages = {53--76},
  abstract = {Density estimators that can adapt to asymmetric heavy tails are required in many applications such as finance and insurance. Extreme Value Theory (EVT) has developped principled methods based on asymptotic results to estimate the tails of most distributions. However, the finite sample approximation might introduce a severe bias in many cases. Moreover, the full range of the distribution is often needed, not only the tail area. On the other hand, non-parametric methods, while being powerful where data are abundant, fail to extrapolate properly in the tail area. We put forward a non-parametric density estimator that brings together the strengths of non-parametric density estimation and of EVT.  A hybrid {Pareto} distribution that can be used in a mixture model is proposed to extend the generalized {Pareto} (GP) to the whole real axis. Experiments on simulated data show the following. On one hand, the mixture of hybrid {Pareto}s converges faster in terms of log-likelihood and provides good estimates of the tail of the distributions when compared with other density estimators including the GP distribution. On the other hand, the mixture of hybrid {Pareto}s offers an alternate way to estimate the tail index which is comparable to the one estimated with the standard GP methodology. The mixture of hybrids is also evaluated on the Danish fire insurance data set.}
}

@PHDTHESIS{Carreau-PhD-2007,
    author = {Carreau, Julie},
  keywords = {density estimation, extreme values, generalized {Pareto} distribution, heavy-tailed distribution, mixture of distributions, neural networks},
     title = {Mod{\`{e}}les {Pareto} hybrides pour distributions asym{\'{e}}triques et {\`{a}} queues lourdes},
      year = {2007},
    school = {UdeM},
  abstract = {We put forward a class of density estimators that can adapt to asymmetric, multi-modal and heavy-tailed distributions. Such distributions occur in many application domains such as finance and insurance. Mixture of gaussians are flexible non-parametric density estimators that have good approximation properties when the number of components is well chosen with respect to the training set size. However, those models are performing poorly on heavy-tailed data because few observations occur in the tail area. To solve this problem, we resort to extreme value theory where methods based on sound parametric assumptions have been developped to enable extrapolation beyond the range of the observations. More precisely, we build on the PoT method that was developped in hydrology where PoT stands for  "Peaks-over-Threshold". The observations exceeding a given threshold are modeled by the generalized {Pareto} distribution. This distribution can approximate arbitrarily well the tail of most distributions. We build a new distribution, the hybrid {Pareto}, by stitching together a truncated Normal and a generalized {Pareto} distribution. We impose continuity constraints at the junction point. The hybrid {Pareto} is thus a smooth distribution that can be used in a mixture model.  The behavior of the upper tail of the hybrid is similar to the behavior of the generalized {Pareto} tail. Moreover, the threshold inherent in the the PoT methodology can now be defined implicitly as the junction point of the component with the heaviest tail. This component also determines the tail index of the mixture. Hence, the hybrid {Pareto} mixture offers an alternate way to estimate the tail index associated with heavy-tailed data. In several applications, information that has predictive power on the variable of interest is available. In that case, we want to model the conditional density of Y given X, the vector containing predictive information. When the distribution of Y given X is asymmetric, multi-modal and heavy-tailed, we propose to use a mixure of hybrid {Pareto}s whose parameters are functions of X. Those functions are implemented by means of a neural network with one hidden layer. Neural neworks are non-parametric models that can, in principle, approximate any continuous function.  Experiments on artificial and real data sets show that the hybrid {Pareto} mixture, unconditional and conditional, outperforms other density estimators in terms of log-likelihood.}
}

@INPROCEEDINGS{casagrande+eck+kegl:icmc2005,
     author = {Casagrande, Norman and Eck, Douglas and K{\'{e}}gl, Bal{\'{a}}zs},
      title = {Geometry in Sound: A Speech/Music Audio Classifier Inspired by an Image Classifier},
  booktitle = {{Proceedings of the International Computer Music Conference (ICMC)}},
       year = {2005},
      pages = {207--210},
        url = {http://www.iro.umontreal.ca/~eckdoug/papers/2005_icmc_casagrande.pdf},
source={OwnPublication},
sourcetype={Conference},
}

@INPROCEEDINGS{casagrande+eck+kegl:ismir2005,
     author = {Casagrande, Norman and Eck, Douglas and K{\'{e}}gl, Bal{\'{a}}zs},
      title = {Frame-Level Audio Feature Extraction using {A}da{B}oost},
  booktitle = {{Proceedings of the 6th International Conference on Music Information Retrieval ({ISMIR} 2005)}},
       year = {2005},
      pages = {345--350},
        url = {http://www.iro.umontreal.ca/~eckdoug/papers/2005_ismir_casagrande.pdf},
source={OwnPublication},
sourcetype={Conference},
}

@PROCEEDINGS{ccai2006,
     editor = {Lamontagne, Luc and Marchand, Mario},
      title = {Advances in Artificial Intelligence, 19th Conference of the Canadian Society for Computational Studies of Intelligence, Canadian AI 2006, Qu{\'{e}}bec City, Qu{\'{e}}bec, Canada, June 7-9, 2006, Proceedings},
  booktitle = {Canadian Conference on AI},
     series = {Lecture Notes in Computer Science},
     volume = {4013},
       year = {2006},
  publisher = {Springer}
}

@INPROCEEDINGS{Chapados+Bengio-2006,
     author = {Chapados, Nicolas and Bengio, Yoshua},
      title = {The K Best-Paths Approach to Approximate Dynamic Programming with Application to Portfolio Optimization},
  booktitle = {AI06},
       year = {2006},
      pages = {491-502}
}

@INPROCEEDINGS{Chapados+Bengio-2007,
     author = {Chapados, Nicolas and Bengio, Yoshua},
      title = {Forecasting Commodity Contract Spreads with Gaussian Process},
  booktitle = {13th Intarnational Conference on Computing in Economics and Finance},
       year = {2007},
   abstract = {We introduce a functional representation of time series which allows forecasts to be performed over an unspecified horizon with progressively-revealed information sets. By virtue of using Gaussian processes, a complete covariance matrix between forecasts at several time-steps is available. This information is put to use in an application to actively trade price spreads between commodity futures contracts. The approach delivers impressive out-of-sample risk-adjusted returns after transaction costs on a portfolio of 30 spreads.}
}

@ARTICLE{Chapados+Bengio-2008-JOC,
    author = {Chapados, Nicolas and Bengio, Yoshua},
     title = {Noisy K Best-Paths for Approximate Dynamic Programming with Application to Portfolio Optimization},
   journal = {Journal of Computers},
    volume = {2},
    number = {1},
      year = {2007},
     pages = {12--19},
  abstract = {We describe a general method to transform a non-Markovian sequential decision problem into a supervised learning problem using a K-bestpaths algorithm. We consider an application in financial portfolio management where we can train a controller to directly optimize a Sharpe Ratio (or other risk-averse non-additive) utility function. We illustrate the approach by demonstrating experimental results using a kernel-based controller architecture that would not normally be considered in traditional
reinforcement learning or approximate dynamic programming.We further show that using a non-additive criterion (incremental Sharpe Ratio) yields a noisy K-best-paths extraction problem, that can give substantially improved performance.}
}

@MASTERSTHESIS{Chapados-Msc-2000,
    author = {Chapados, Nicolas},
     title = {Crit{\`{e}}res d'optimisation d'algorithmes d'apprentissage en gestion de portefeuille},
      year = {2000},
    school = {Universit{\'{e}} de Montr{\'{e}}al}
}

@INPROCEEDINGS{chapados2000,
     author = {Chapados, Nicolas and Bengio, Yoshua},
      title = {Cost Functions and Model Combination for {VaR}-Based Asset Allocation Using Neural Networks},
  booktitle = {Computational Finance 2000},
       year = {2000},
        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/compfin2000_final.pdf},
   abstract = {We introduce an asset-allocation framework based on the active control of the value-at-risk of the portfolio. Within this framework, we compare two paradigms for making the allocation using neural networks. The first one uses the network to make a forecast of asset behavior, in conjunction with a traditional mean-variance allocator for constructing the portfolio. The second paradigm uses the network to directly make the portfolio allocation decisions. We consider a method for performing soft input variable selection, and show its considerable utility. We use model combination (committee) methods to systematize the choice of hyperparemeters during training. We show that committees using both paradigms are significantly outperforming the benchmark market performance.},
topics={Finance},cat={C},
}

@ARTICLE{chapados:2001,
    author = {Chapados, Nicolas and Bengio, Yoshua},
     title = {Cost Functions and Model Combination for VaR--based Asset Allocation using Neural Networks},
   journal = {IEEE Transactions on Neural Networks},
    volume = {12},
    number = {4},
      year = {2001},
     pages = {890--906},
  abstract = {We introduce an asset-allocation framework based on the active control of the value-at-risk of the portfolio. Within this framework, we
compare two paradigms for making the allocation using neural networks. The first one uses the network to make a forecast of asset behavior, in conjunction with a traditional mean-variance allocator for constructing the portfolio. The second paradigm uses the network to directly make the portfolio allocation decisions. We consider a method for performing soft input variable selection, and show its considerable utility. We use model combination (committee) methods to systematize the choice of hyperparemeters during training. We show that committees
using both paradigms are significantly outperforming the benchmark market performance.},
topics={Finance},cat={J},
}

@ARTICLE{chapados:2003,
    author = {Bengio, Yoshua and Chapados, Nicolas},
     title = {Extensions to Metric-Based Model Selection},
      year = {2003},
  crossref = {JMLR-shorter},
  abstract = {Metric-based methods have recently been introduced for model selection and regularization, often yielding very significant improvements over the alternatives tried (including cross-validation). All these methods require unlabeled data over which to compare functions and detect gross differences in behavior away from the training points. We introduce three new extensions of the metric model selection methods and apply them to feature selection. The first extension takes advantage of the particular case of time-series data in which the task involves prediction with a horizon h. The idea is to use at t the h unlabeled examples that precede t for model selection. The second extension takes advantage of the different error distributions of cross-validation and the metric methods: cross-validation tends to have a larger variance and is unbiased. A hybrid combining the two model selection methods is rarely beaten by any of the two methods. The third extension deals with the case when unlabeled data is not available at all, using an estimated input density. Experiments are described to study these extensions in the context of capacity control and feature subset selection.},
topics={ModelSelection,Finance},cat={J},
}

@ARTICLE{chapelle:2001,
    author = {Chapelle, Olivier and Vapnik, Vladimir and Bengio, Yoshua},
     title = {Model Selection for Small Sample Regression},
   journal = {Machine Learning},
      year = {2001},
  abstract = {Model selection is an important ingredient of many machine learning algorithms, in particular when the sample size in small, in order to strike the right trade-off between overfitting and underfitting. Previous classical results for linear regression are based on an asymptotic analysis. We present a new penalization method for performing model selection for regression that is appropriate even for small samples. Our penalization is based on an accurate estimator of the ratio of the expected training error and the expected generalization error, in terms of the expected eigenvalues of the input covariance matrix.},
topics={ModelSelection},cat={J},
}

@INCOLLECTION{chapter-eval-longterm-2001,
     author = {Schmidhuber, Juergen and Hochreiter, Sepp and Bengio, Yoshua},
     editor = {Kolen, J. and Kremer, S.},
      title = {Evaluating Benchmark Problems by Random Guessing},
  booktitle = {Field Guide to Dynamical Recurrent Networks},
       year = {2001},
  publisher = {IEEE Press},
topics={LongTerm},cat={B},
}

@INCOLLECTION{chapter-gradient-document-2001,
     author = {{LeCun}, Yann and Bottou, {L{\'{e}}on} and Bengio, Yoshua and Haffner, Patrick},
     editor = {Haykin, S. and Kosko, B.},
      title = {Gradient-Based Learning Applied to Document Recognition},
  booktitle = {Intelligent Signal Processing},
       year = {2001},
      pages = {306--351},
  publisher = {IEEE Press},
        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/lecun-01a.pdf},
   abstract = {Multilayer Neural Networks trained with a backprppagation algorithm constitute the best example of a successful Gradient-Based Learning technique. Given an appropriate network architecture, Gradient-Based Learning algorithms can be used to synthesize a complex decision surface that can classify high-dimensional patterns such as handwritten characters, with minimal preprocessing. This paper reviews various methods applied to handwritten character recognition and compares them on a standard handwritten digit recognition task. Convolutional Neural Networks, that are specifically designed to deal with the variability of 2D shapes, are shown to outperform all other techniques.
Real-life document recognition systems are composed of multiple modules including field extraction, segmentation, recognition, and language modeling. A new learning paradigm, called Graph Transformer Networks (GTN), allows such multi-module systems to be trained globally using Gradient-Based methods so as to monimize an overall peformance measure.
Two systems for on-line handwriting recognition are described. Experiments demonstrate the advantage of global training, and the flexibility of Graph Transformer Networks.
A Graph Transformer Network for reading bank check is also described. It uses Convolutional Neural Network character recognizers combined with a global training technique to provides record accuracy on business and personal checks. It is deployed commercially and reads several million checks per day.},
topics={PriorKnowledge,Speech},cat={B},
}

@INCOLLECTION{chapter-gradient-flow-2001,
     author = {Hochreiter, Sepp and Bengio, Yoshua and Frasconi, Paolo},
     editor = {Kolen, J. and Kremer, S.},
      title = {Gradient Flow in Recurrent Nets: the Difficulty of Learning Long-Term Dependencies},
  booktitle = {Field Guide to Dynamical Recurrent Networks},
       year = {2001},
  publisher = {IEEE Press},
topics={LongTerm},cat={B},
}

@INPROCEEDINGS{chemero+eck:1999,
     author = {Chemero, T. and Eck, Douglas},
      title = {An Exploration of Representational Complexity via Coupled Oscillators},
  booktitle = {{Proceedings of the Tenth Midwest Artificial Intelligence and Cognitive Science Society}},
       year = {1999},
  publisher = {MIT Press},
        url = {http://www.iro.umontreal.ca/~eckdoug/papers/1999_chemero.pdf},
   abstract = {We note some inconsistencies in a view of representation which takes {\it decoupling} to be of key importance. We explore these inconsistencies using examples of representational vehicles taken from coupled oscillator theory and suggest a new way to reconcile {\it coupling} with {\it absence}. Finally, we tie these views to a teleological definition of representation.},
source={OwnPublication},
sourcetype={Conference},
}

@ARTICLE{ChemInfModel2006,
    author = {Erhan, Dumitru and {L'Heureux}, Pierre-Jean and Yue, Shi Yi and Bengio, Yoshua},
     title = {Collaborative Filtering on a Family of Biological Targets},
   journal = {J. Chem. Inf. Model.},
    volume = {46},
    number = {2},
      year = {2006},
     pages = {626--635},
  abstract = {Building a QSAR model of a new biological target for which few screening data are available is a statistical
challenge. However, the new target may be part of a bigger family, for which we have more screening data.
Collaborative filtering or, more generally, multi-task learning, is a machine learning approach that improves
the generalization performance of an algorithm by using information from related tasks as an inductive
bias. We use collaborative filtering techniques for building predictive models that link multiple targets to
multiple examples. The more commonalities between the targets, the better the multi-target model that can
be built. We show an example of a multi-target neural network that can use family information to produce
a predictive model of an undersampled target. We evaluate JRank, a kernel-based method designed for
collaborative filtering. We show their performance on compound prioritization for an HTS campaign and
the underlying shared representation between targets. JRank outperformed the neural network both in the
single- and multi-target models.},
topics={Bioinformatic,MultiTask},cat={J},
}

@TECHREPORT{collobert:2001:rr01-12,
       author = {Collobert, Ronan and Bengio, Samy and Bengio, Yoshua},
        title = {A Parallel Mixture of {SVM}s for Very Large Scale Problems},
       number = {12},
         year = {2001},
  institution = {IDIAP},
          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/IDIAP-RR-01-12.ps},
     abstract = {Support Vector Machines ({SVM}s) are currently the state-of-the-art models for many classification problems but they suffer from the complexity of their training algorithm which is at least quadratic with respect to the number of examples. Hence, it is hopeless to try to solve real-life problems having more than a few hundreds of thousands examples with {SVM}s. The present paper proposes a new mixture of {SVM}s that can be easily implemented in parallel and where each {SVM} is trained on a small subset of the whole dataset. Experiments on a large benchmark dataset (Forest) yielded significant time improvement (time complexity appears empirically to locally grow linearly with the number of examples). In addition, and that is a surprise, a significant improvement in generalization was observed.},
topics={Kernel},cat={T},
}

@ARTICLE{collobert:2002,
    author = {Collobert, Ronan and Bengio, Samy and Bengio, Yoshua},
     title = {Parallel Mixture of {SVM}s for Very Large Scale Problem},
   journal = {Neural Computation},
      year = {2002},
  abstract = {Support Vector Machines ({SVM}s) are currently the state-of-the-art models for many classification problems but they suffer from the complexity of their training algorithm which is at least quadratic with respect to the number of examples. Hence, it is hopeless to try to solve real-life problems having more than a few hundreds of thousands examples with {SVM}s. The present paper proposes a new mixture of {SVM}s that can be easily implemented in parallel and where each {SVM} is trained on a small subset of the whole dataset. Experiments on a large benchmark dataset (Forest) yielded significant time improvement (time complexity appears empirically to locally grow linearly with the number of examples). In addition, and that is a surprise, a significant improvement in generalization was observed.},
topics={HighDimensional,Kernel},cat={J},
}

@BOOK{collobert:2002:book,
     author = {Collobert, Ronan and Bengio, Yoshua and Bengio, Samy},
     editor = {Lee, S. W. and Verri, A.},
      title = {Scaling Large Learning Problems with Hard Parallel Mixtures},
  booktitle = {Pattern Recognition with Support Vector Machines},
     series = {Lecture Notes in Computer Science},
     volume = {2388},
       year = {2002},
  publisher = {Springer-Verlag},
        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/2002_mixtures_svm.pdf},
   abstract = {A challenge for statistical learning is to deal with large data sets, e.g. in data mining. Popular learning algorithms such as Support Vector Machines have training time at least quadratic in the number of examples: they are hopeless to solve prolems with a million examples. We propose a "hard parallelizable mixture" methodology which yields significantly reduced training time through modularization and parallelization: the training data is iteratively partitioned by a "gater" model in such a way that it becoms easy to learn an "expert" model separately in each region of the parition. A probabilistic extension and the use of a set of generative models allows representing a gater so that all pieces of the model are locally trained. For {SVM}s, time complexity appears empirically to locally grow linearly with the number of examples, while generalization performance can be enhanced. For the probabilistic version of the algorithm, the iterative algorithm provably goes down in a cost function that is an upper bound on the negative log-likelihood.},
topics={Kernel},cat={B},
}

@MISC{copyright-CTAI,
        author = {Bengio, Yoshua and Ducharme, R{\'{e}}jean and Dorion, Christian},
         title = {Commodity Trading Advisor Index},
          year = {2004-2009},
  howpublished = {copyright, and commercialized software license.}
}

@MISC{copyright-PLearn,
        author = {Vincent, Pascal and Bengio, Yoshua},
         title = {{PLearn}, a {C++} Machine Learning Library},
          year = {1998-2009},
  howpublished = {copyright, public domain license.},
           url = {www.plearn.org}
}

@ARTICLE{Cosi90,
    author = {Cosi, Piero and Bengio, Yoshua and De Mori, Renato},
     title = {Phonetically-based multi-layered networks for acoustic property extraction and automatic speech recognition},
   journal = {Speech Communication},
    volume = {9},
    number = {1},
      year = {1990},
     pages = {15--30},
topics={PriorKnowledge,Speech},cat={J},
}

@INCOLLECTION{courville+eck+bengio:nips2009,
     author = {Courville, Aaron and Eck, Douglas and Bengio, Yoshua},
     editor = {},
      title = {An Infinite Factor Model Hierarchy Via a Noisy-Or Mechanism},
  booktitle = {Neural Information Processing Systems Conference (NIPS) 22},
       year = {2009},
      pages = {405--413},
  publisher = {},
        url = {http://books.nips.cc/papers/files/nips22/NIPS2009_1100.pdf},
source={OwnPublication},
sourcetype={Conference},
pdf={""},
}

@INPROCEEDINGS{davies+plumbley+eck:waspaa2009,
        author = {Davies, M. and Plumbley, M. and Eck, Douglas},
         title = {Towards a musical beat emphasis function},
     booktitle = {Proceedings of IEEE WASPAA},
          year = {2009},
  organization = {IEEE Workshop on Applications of Signal Processing to Audio and Acoustics},
source={OwnPublication},
sourcetype={Conference},
}

@INPROCEEDINGS{Delalleau+al-2005,
     author = {Delalleau, Olivier and Bengio, Yoshua and Le Roux, Nicolas},
     editor = {Cowell, Robert G. and Ghahramani, Zoubin},
      title = {Efficient Non-Parametric Function Induction in Semi-Supervised Learning},
  booktitle = {Proceedings of the Tenth International Workshop on Artificial Intelligence and Statistics (AISTATS'05)},
       year = {2005},
      pages = {96--103},
  publisher = {Society for Artificial Intelligence and Statistics},
        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/semisup_aistats2005.pdf},
   abstract = {There has been an increase of interest for semi-supervised learning recently, because of the many datasets with large amounts of unlabeled examples and only a few labeled ones. This paper follows up on proposed nonparametric algorithms which provide an estimated continuous label for the given unlabeled examples. First, it extends them to function induction algorithms that minimize a regularization criterion applied to an out-of-sample example, and happen to have the form of Parzen windows regressors. This allows to predict test labels without solving again a linear system of dimension n (the number of unlabeled and labeled training examples), which can cost O(n^3). Second, this function induction procedure gives rise to an efficient approximation of the training process, reducing the linear system to be solved to m << n unknowns, using only a subset of m examples. An improvement of O(n^2/m^2) in time can thus be obtained. Comparative experiments are presented, showing the good performance of the induction formula and approximation algorithm.},
topics={Unsupervised},cat={C},
}

@INCOLLECTION{Delalleau+al-ssl-2006,
     author = {Delalleau, Olivier and Bengio, Yoshua and Le Roux, Nicolas},
     editor = {Chapelle, Olivier and {Sch{\"{o}}lkopf}, Bernhard and Zien, Alexander},
      title = {Large-Scale Algorithms},
  booktitle = {Semi-Supervised Learning},
       year = {2006},
      pages = {333--341},
  publisher = {{MIT} Press},
        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/delalleau_ssl.pdf},
   abstract = {In Chapter 11, it is shown how a number of graph-based semi-supervised learning
algorithms can be seen as the minimization of a specific cost function, leading to a
linear system with n equations and unknowns (with n the total number of labeled
and unlabeled examples). Solving such a linear system will in general require on the
order of O(kn2) time and O(kn) memory (for a sparse graph where each data point
has k neighbors), which can be prohibitive on large datasets (especially if k = n,
i.e. the graph is dense). We present in this chapter a subset selection method that
can be used to reduce the original system to one of size m << n. The idea is to solve
for the labels of a subset S of X of only m points, while still retaining information
from the rest of the data by approximating their label with a linear combination of
the labels in S (using the induction formula presented in Chapter 11). This leads
to an algorithm whose computational requirements scale as O(m2n) and memory
requirements as O(m2), thus allowing one to take advantage of significantly bigger
unlabeled datasets than with the original algorithms.},
cat={B},topics={Unsupervised},
}

@INCOLLECTION{DeMori90a,
     author = {De Mori, Renato and Bengio, Yoshua and Cosi, Piero},
     editor = {Mohr, R. and Pavlidis, T. and Sanfelin, A.},
      title = {On the use of an ear model and multi-layer networks for automatic speech recognition},
  booktitle = {Structural Pattern Analysis},
       year = {1990},
  publisher = {World Scientific},
topics={PriorKnowledge,Speech},cat={B},
}

@INPROCEEDINGS{Desjardins+al-2010,
     author = {Desjardins, Guillaume and Courville, Aaron and Bengio, Yoshua},
      title = {Tempered {Markov} Chain Monte Carlo for training of Restricted {Boltzmann} Machine},
  booktitle = {JMLR W\&CP: Proceedings of the Thirteenth International Conference on Artificial Intelligence and Statistics (AISTATS 2010)},
     volume = {9},
       year = {2010},
      pages = {145--152},
   abstract = {Alternating Gibbs sampling is the most common scheme used for sampling from Restricted {Boltzmann} Machines (RBM), a crucial component in deep architectures such as Deep Belief Networks. However, we find that it often does a very poor job of rendering the diversity of modes captured by the trained model. We suspect that this hinders the advantage that could in principle be brought by training algorithms relying on Gibbs sampling for uncovering spurious modes, such as the Persistent Contrastive Divergence algorithm. To alleviate this problem, we explore the use of tempered {Markov} Chain Monte-Carlo for sampling in RBMs. We find both through visualization of samples and measures of likelihood on a toy dataset that it helps both sampling and learning.}
}

@TECHREPORT{Desjardins-2008,
       author = {Desjardins, Guillaume and Bengio, Yoshua},
     keywords = {Convolutional Architectures, Deep Networks, RBM, Vision},
        title = {Empirical Evaluation of Convolutional RBMs for Vision},
       number = {1327},
         year = {2008},
  institution = {D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
     abstract = {Convolutional Neural Networks ({CNN}) have had great success in machine learning tasks involving vision and represent one of the early successes of deep networks. Local receptive fields and weight
sharing make their architecture ideally suited for vision tasks by helping to enforce a prior based on our knowledge of natural images. This same prior could also be applied to recent developments in the field of deep networks, in order to tailor these new architectures for artificial vision. In this context, we show how the Restricted {Boltzmann} Machine (RBM), the building block of Deep Belief Networks (DBN), can be adapted to operate in a convolutional manner. We compare their performance to standard fully-connected RBMs on a simple visual learning task and show that the convolutional RBMs (CRBMs) converge to smaller values of the negative likelihood function. Our experiments also indicate that CRBMs are more efficient than standard RBMs trained on small image patches, with the CRBMs having faster convergence.}
}

@TECHREPORT{Desjardins-tech-2009,
       author = {Desjardins, Guillaume and Courville, Aaron and Bengio, Yoshua and Vincent, Pascal and Delalleau, Olivier},
     keywords = {CD, PCD, RBM, simulated tempering, tempered MCMC, unsupervised learning},
        title = {Tempered {Markov} Chain Monte Carlo for training of Restricted {Boltzmann} Machines},
       number = {1345},
         year = {2009},
  institution = {D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
     abstract = {Alternating Gibbs sampling is the most common scheme used for sampling from Restricted {Boltzmann} Machines (RBM), a crucial component in deep architectures such as Deep Belief Networks. However, we find that it often does a very poor job of rendering the diversity of modes captured by the trained model. We suspect that this hinders the advantage that could in principle be brought by training algorithms relying on Gibbs sampling for uncovering spurious modes, such as the Persistent Contrastive Divergence algorithm. To alleviate this problem, we
explore the use of tempered {Markov} Chain Monte-Carlo for sampling in RBMs. We find both through visualization of samples and measures of likelihood that it helps both sampling and learning.}
}

@ARTICLE{Dugas+Bengio-2009,
    author = {Dugas, Charles and Bengio, Yoshua and Belisle, Francois and Nadeau, Claude and Garcia, Rene},
     title = {Incorporating Functional Knowledge in Neural Networks},
   journal = {The Journal of Machine Learning Research},
    volume = {10},
      year = {2009},
     pages = {1239--1262},
  abstract = {Incorporating prior knowledge of a particular task into the architecture of a learning algorithm can greatly improve generalization performance. We study here a case where we know that the function to be learned is non-decreasing in its two arguments and convex in one of them. For this purpose we propose a class of functions similar to multi-layer neural networks but (1) that has those properties, (2) is a universal approximator of Lipschitz functions with these and other properties. We apply this new class of functions to the task of modelling the price of call options. Experiments show improvements on regressing the price of call options using the new types of function classes that incorporate the a priori constraints.}
}

@PHDTHESIS{Dugas-Phd-2003,
    author = {Dugas, Charles},
     title = {Les algorithmes d'apprentissage appliqu{\'{e}}s aux risques financiers},
      year = {2003},
    school = {Universit{\'{e}} de Montr{\'{e}}al}
}

@ARTICLE{dugas:2003,
    author = {Dugas, Charles and Bengio, Yoshua and Chapados, Nicolas and Vincent, Pascal and Denoncourt, Germain and Fournier, Christian},
     title = {Statistical Learning Algorithms Applied to Automobile Insurance Ratemaking},
   journal = {CAS Forum},
    volume = {1},
    number = {1},
      year = {2003},
     pages = {179--214},
  abstract = {We recently conducted a research project for a large North American automobile insurer. This study was the most exhaustive ever undertaken by this particular insurer and lasted over an entire year. We analyzed the discriminating power of each variable used for ratemaking. We analyzed the performance of several models within five broad categories: linear regressions, generalized linear models, decision trees, neural networks and support vector machines. In this paper, we present the main results of this study. We qualitatively compare models and show how neural networks can represent high-order nonlinear dependencies with a small number of parameters, each of which is estimated on a large proportion of the data, thus yielding low variance. We thoroughly explain the purpose of the nonlinear sigmoidal transforms which are at the very heart of neural networks' performances. The main numerical result is a statistically significant reduction in the out-of-sample mean-squared error using the neural network model and our ability to substantially reduce the median premium by charging more to the highest risks. This in turn can translate into substantial savings and financial benefits for an insurer. We hope this paper goes a long way towards convincing actuaries to include neural networks within their set of modeling tools for ratemaking.},
topics={Finance,Mining},cat={J},
}

@INPROCEEDINGS{eck+bertinmahieux+lamere+green:nips2007,
    author = {Eck, Douglas and Lamere, Paul and Bertin-Mahieux, Thierry and Green, Stephen},
    editor = {Platt, John and Kolen, J. and Singer, Yoram and Roweis, S.},
     title = {Automatic Generation of Social Tags for Music Recommendation},
      year = {2008},
  crossref = {NIPS20-shorter},
source =       "OwnPublication"
}

@INPROCEEDINGS{eck+bertinmahieux+lamere:ismir2007,
     author = {Eck, Douglas and Bertin-Mahieux, Thierry and Lamere, Paul},
      title = {Autotagging music using supervised machine learning},
  booktitle = {{Proceedings of the 8th International Conference on Music Information Retrieval ({ISMIR} 2007)}},
       year = {2007},
source={OwnPublication},
}

@INPROCEEDINGS{eck+casagrande:ismir2005,
     author = {Eck, Douglas and Casagrande, Norman},
      title = {Finding Meter in Music Using an Autocorrelation Phase Matrix and Shannon Entropy},
  booktitle = {{Proceedings of the 6th International Conference on Music Information Retrieval ({ISMIR} 2005)}},
       year = {2005},
      pages = {504--509},
        url = {http://www.iro.umontreal.ca/~eckdoug/papers/2005_ismir.pdf},
source={OwnPublication},
sourcetype={Conference},
}

@INCOLLECTION{eck+gasser+port:2000,
     author = {Eck, Douglas and Gasser, M. and Port, Robert},
     editor = {Desain, P. and Windsor, L.},
      title = {Dynamics and Embodiment in Beat Induction},
  booktitle = {{Rhythm Perception and Production}},
       year = {2000},
      pages = {157--170},
  publisher = {Swets and Zeitlinger},
        url = {http://www.iro.umontreal.ca/~eckdoug/papers/2000_rppw.pdf},
   abstract = {We provide an argument for using dynamical systems theory in the domain of beat induction. We motivate the study of beat induction and to relate beat induction to the more general study of human rhythm cognition. In doing so we compare a dynamical, embodied approach to a symbolic (traditional AI) one, paying particular attention to how the modeling approach brings with it tacit assumptions about what is being modeled. Please note that this is a philosophy paper about research that was, at the time of writing, very much in progress.},
source={OwnPublication},
sourcetype={Chapter},
}

@INPROCEEDINGS{eck+gasser:1996,
     author = {Eck, Douglas and Gasser, M.},
     editor = {},
      title = {Perception of Simple Rhythmic Patterns in a Network of Oscillators},
  booktitle = {{The Proceedings of the Eighteenth Annual Conference of the Cognitive Science Society}},
       year = {1996},
  publisher = {Lawrence Erlbaum Associates},
   abstract = {This paper is concerned with the complex capacity to recognize and reproduce rhythmic patterns. While this capacity has not been well investigated, in broad qualitative terms it is clear that people can learn to identify and produce recurring patterns defined in terms of sequences of beats of varying intensity and rests: the rhythms behind waltzes, reels, sambas, etc. Our short term goal is a model which is "hard-wired" with knowledge of a set of such patterns. Presented with a portion of one of the patterns or a label for a pattern, the model should reproduce the pattern and continue to do so when the input is turned off. Our long-term goal is a model which can learn to adjust the connection strengths which implement particular patterns as it is exposed to input patterns.},
source={OwnPublication},
sourcetype={Conference},
}

@TECHREPORT{eck+graves+schmidhuber:tr-speech2003,
       author = {Eck, Douglas and Graves, A. and Schmidhuber, Juergen},
        title = {A New Approach to Continuous Speech Recognition Using {LSTM} Recurrent Neural Networks},
       number = {IDSIA-14-03},
         year = {2003},
  institution = {IDSIA},
     abstract = {This paper presents an algorithm for continuous speech recognition built from two Long Short-Term Memory ({LSTM}) recurrent neural networks. A first {LSTM} network performs frame-level phone probability estimation. A second network maps these phone predictions onto words. In contrast to {HMM}s, this allows greater exploitation of long-timescale correlations. Simulation results are presented for a hand-segmented subset of the "Numbers-95" database. These results include isolated phone prediction, continuous frame-level phone prediction and continuous word prediction. We conclude that despite its early stage of development, our new model is already competitive with existing approaches on certain aspects of speech recognition and promising on others, warranting further research.},
source={OwnPublication},
sourcetype={TechReport},
}

@TECHREPORT{eck+lapalme:2008,
       author = {Eck, Douglas and Lapalme, J.},
        title = {Learning Musical Structure Directly from Sequences of Music},
       number = {1300},
         year = {2008},
  institution = {Universit{\'{e}} de Montr{\'{e}}al DIRO},
          url = {http://www.iro.umontreal.ca/~eckdoug/papers/tr1300.pdf},
source={OwnPublication},
sourcetype={TechReport},
}

@INPROCEEDINGS{eck+schmidhuber:icann2002,
     author = {Eck, Douglas and Schmidhuber, Juergen},
     editor = {Dorronsoro, J.},
      title = {Learning The Long-Term Structure of the Blues},
  booktitle = {{Artificial Neural Networks -- ICANN 2002 (Proceedings)}},
     volume = {},
       year = {2002},
      pages = {284--289},
  publisher = {Springer},
        url = {http://www.iro.umontreal.ca/~eckdoug/papers/2002_icannMusic.pdf},
   abstract = {In general music composed by recurrent neural networks ({RNN}s) suffers from a lack of global structure. Though networks can learn note-by-note transition probabilities and even reproduce phrases, they have been unable to learn an entire musical form and use that knowledge to guide composition. In this study, we describe model details and present experimental results showing that {LSTM} successfully learns a form of blues music and is able to compose novel (and some listeners believe pleasing) melodies in that style. Remarkably, once the network has found the relevant structure it does not drift from it: {LSTM} is able to play the blues with good timing and proper structure as long as one is willing to listen.},
source={OwnPublication},
sourcetype={Conference},
}

@INPROCEEDINGS{eck+schmidhuber:ieee2002,
     author = {Eck, Douglas and Schmidhuber, Juergen},
     editor = {Bourlard, H.},
      title = {Finding Temporal Structure in Music: Blues Improvisation with {LSTM} Recurrent Networks},
  booktitle = {Neural Networks for Signal Processing XII, Proceedings of the 2002 IEEE Workshop},
       year = {2002},
      pages = {747--756},
  publisher = {IEEE},
        url = {http://www.iro.umontreal.ca/~eckdoug/papers/2002_ieee.pdf},
   abstract = {Few types of signal streams are as ubiquitous as music. Here we consider the problem of extracting essential ingredients of music signals, such as well-defined global temporal structure in the form of nested periodicities (or {\em meter}). Can we construct an adaptive signal processing device that learns by example how to generate new instances of a given musical style? Because recurrent neural networks can in principle learn the temporal structure of a signal, they are good candidates for such a task. Unfortunately, music composed by standard recurrent neural networks ({RNN}s) often lacks global coherence. The reason for this failure seems to be that {RNN}s cannot keep track of temporally distant events that indicate global music structure. Long Short-Term Memory ({LSTM}) has succeeded in similar domains where other {RNN}s have failed, such as timing \& counting and learning of context sensitive languages. In the current study we show that {LSTM} is also a good mechanism for learning to compose music. We present experimental results showing that {LSTM} successfully learns a form of blues music and is able to compose novel (and we believe pleasing) melodies in that style. Remarkably, once the network has found the relevant structure it does not drift from it: {LSTM} is able to play the blues with good timing and proper structure as long as one is willing to listen.},
source={OwnPublication},
sourcetype={Conference},
}

@ARTICLE{eck+scott:2005,
    author = {Eck, Douglas and Scott, S. K.},
     title = {Editorial: New Research in Rhythm Perception and Production},
   journal = {Music Perception},
    volume = {22},
    number = {3},
      year = {2005},
     pages = {371-388},
source={OwnPublication},
sourcetype={Other},
}

@MISC{eck+scott:editor2005,
    author = {Eck, Douglas and Scott, S. K.},
     title = {Music Perception},
      year = {2005},
      note = {Guest Editor, Special Issue on Rhythm Perception and Production, 22(3)},
source={OwnPublication},
sourcetype={Other},
}

@INPROCEEDINGS{eck:1999,
     author = {Eck, Douglas},
     editor = {},
      title = {Learning Simple Metrical Preferences in a Network of {F}itzhugh-{N}agumo Oscillators},
  booktitle = {{The Proceedings of the Twenty-First Annual Conference of the Cognitive Science Society}},
       year = {1999},
  publisher = {Lawrence Erlbaum Associates},
   abstract = {Hebbian learning is used to train a network of oscillators to prefer periodic signals of pulses over aperiodic signals. Target signals consisted of metronome-like voltage pulses with varying amounts of inter-onset noise injected. (with 0\% noise yielding a periodic signal and more noise yielding more and more aperiodic signals.) The oscillators---piecewise-linear approximations (Abbott, 1990) to Fitzhugh-Nagumo oscillators---are trained using mean phase coherence as an objective function. Before training a network is shown to readily synchronize with signals having wide range of noise. After training on a series of noise-free signals, a network is shown to only synchronize with signals having little or no noise. This represents a bias towards periodicity and is explained by strong positive coupling connections between oscillators having harmonically-related periods.},
source={OwnPublication},
sourcetype={Conference},
}

@UNPUBLISHED{eck:bramsworkshop2004,
    author = {Eck, Douglas},
     title = {Challenges for Machine Learning in the Domain of Music},
      year = {2004},
      note = {BRAMS Workshop on Brain and Music, Montreal Neurological Institute},
  abstract = {Slides and musical examples available on request.},
source={OwnPublication},
sourcetype={Workshop},
optkey={""},
optmonth={""},
optannote={""},
}

@PHDTHESIS{eck:diss,
    author = {Eck, Douglas},
     title = {{Meter Through Synchrony: Processing Rhythmical Patterns with Relaxation Oscillators}},
      year = {2000},
    school = {Indiana University, Bloomington, IN, www.idsia.ch/\-\~{}doug/\-publications.html},
  abstract = {This dissertation uses a network of relaxation oscillators to beat along with temporal signals. Relaxation oscillators exhibit interspersed slow-fast movement and model a wide array of biological oscillations. The model is built up gradually: first a single relaxation oscillator is exposed to rhythms and shown to be good at finding downbeats in them. Then large networks of oscillators are mutually coupled in an exploration of their internal synchronization behavior. It is demonstrated that appropriate weights on coupling connections cause a network to form multiple pools of oscillators having stable phase relationships. This is a promising first step towards networks that can recreate a rhythmical pattern from memory. In the full model, a coupled network of relaxation oscillators is exposed to rhythmical patterns. It is shown that the network finds downbeats in patterns while continuing to exhibit good internal stability. A novel non-dynamical model of downbeat induction called the Normalized Positive (NP) clock model is proposed, analyzed, and used to generate comparison predictions for the oscillator model. The oscillator model compares favorably to other dynamical approaches to beat induction such as adaptive oscillators. However, the relaxation oscillator model takes advantage of intrinsic synchronization stability to allow the creation of large coupled networks. This research lays the groundwork for a long-term research goal, a robotic arm that responds to rhythmical signals by tapping along. It also opens the door to future work in connectionist learning of long rhythmical patterns.},
source={OwnPublication},
sourcetype={Thesis},
}

@INPROCEEDINGS{eck:icann2001,
     author = {Eck, Douglas},
     editor = {Dorffner, Georg},
      title = {A Network of Relaxation Oscillators that Finds Downbeats in Rhythms},
  booktitle = {{Artificial Neural Networks -- ICANN 2001 (Proceedings)}},
     volume = {},
       year = {2001},
      pages = {1239--1247},
  publisher = {Springer},
        url = {http://www.iro.umontreal.ca/~eckdoug/papers/2001_icann.pdf},
   abstract = {A network of relaxation oscillators is used to find downbeats in rhythmical patterns. In this study, a novel model is described in detail. Its behavior is tested by exposing it to patterns having various levels of rhythmic complexity. We analyze the performance of the model and relate its success to previous work dealing with fast synchrony in coupled oscillators.},
source={OwnPublication},
sourcetype={Conference},
}

@INPROCEEDINGS{eck:icassp2007,
     author = {Eck, Douglas},
     editor = {},
      title = {Beat Tracking Using an Autocorrelation Phase Matrix},
  booktitle = {{Proceedings of the 2007 International Conference on Acoustics, Speech and Signal Processing (ICASSP)}},
       year = {2007},
      pages = {1313--1316},
  publisher = {IEEE Signal Processing Society},
        url = {http://www.iro.umontreal.ca/~eckdoug/papers/2007_icassp.pdf},
source={OwnPublication},
sourcetype={Conference},
}

@INPROCEEDINGS{eck:icmpc2004,
     author = {Eck, Douglas},
     editor = {Lipscomb, S. D. and Ashley, R. and Gjerdingen, R. O. and Webster, P.},
      title = {A Machine-Learning Approach to Musical Sequence Induction That Uses Autocorrelation to Bridge Long Timelags},
  booktitle = {{The Proceedings of the Eighth International Conference on Music Perception and Cognition ({ICMPC}8)}},
       year = {2004},
      pages = {542-543},
  publisher = {Causal Productions},
   abstract = {One major challenge in using statistical sequence learning methods in the domain of music lies in bridging the long timelags that separate important musical events. Consider, for example, the chord changes that convey the basic structure of a pop song. A sequence learner that cannot predict chord changes will almost certainly not be able to generate new examples in a musical style or to categorize songs by style. Yet, it is surprisingly difficult for a sequence learner to bridge the long timelags necessary to identify when a chord change will occur and what its new value will be. This is the case because chord changes can be separated by dozens or hundreds of intervening notes. One could solve this problem by treating chords as being special (as did Mozer, NIPS 1991). But this is impractical---it requires chords to be labeled specially in the dataset, limiting the applicability of the model to non-labeled examples---and furthermore does not address the general issue of nested temporal structure in music. I will briefly describe this temporal structure (known commonly as "meter") and present a model that uses to its advantage an assumption that sequences are metrical. The model consists of an autocorrelation-based filtration that estimates online the most likely metrical tree (i.e. the frequency and phase of beat, measure, phrase &etc.) and uses that to generate a series of sequences varying at different rates. These sequences correspond to each level in the hierarchy. Multiple learners can be used to treat each series separately and their predictions can be combined to perform composition and categorization. I will present preliminary results that demonstrate the usefulness of this approach. Time permitting I will also compare the model to alternate approaches.},
source={OwnPublication},
sourcetype={Conference},
}

@INPROCEEDINGS{eck:icmpc2006,
     author = {Eck, Douglas},
     editor = {Baroni, M. and Addessi, A. R. and Caterina, R. and Costa, M.},
      title = {Beat Induction Using an Autocorrelation Phase Matrix},
  booktitle = {The Proceedings of the 9th International Conference on Music Perception and Cognition ({ICMPC9})},
       year = {2006},
      pages = {931-932},
  publisher = {Causal Productions},
source={OwnPublication},
sourcetype={Conference},
}

@UNPUBLISHED{eck:irisworkshop2004,
    author = {Eck, Douglas},
     title = {Using Autocorrelation to Bridge Long Timelags when Learning Sequences of Music},
      year = {2004},
      note = {IRIS 2004 Machine Learning Workshop, Ottawa, Canada},
  abstract = {Slides and musical examples available on request.},
source={OwnPublication},
sourcetype={Workshop},
optkey={""},
optmonth={""},
optannote={""},
}

@ARTICLE{eck:jnmr2001,
    author = {Eck, Douglas},
     title = {A Positive-Evidence Model for Rhythmical Beat Induction},
   journal = {Journal of New Music Research},
    volume = {30},
    number = {2},
      year = {2001},
     pages = {187--200},
  abstract = {The Normalized Positive (NPOS) model is a rule-based model that predicts downbeat location and pattern complexity in rhythmical patterns. Though derived from several existing models, the NPOS model is particularly effective at making correct predictions while at the same time having low complexity. In this paper, the details of the model are explored and a comparison is made to existing models. Several datasets are used to examine the complexity predictions of the model. Special attention is paid to the model's ability to account for the effects of musical experience on beat induction.},
source={OwnPublication},
sourcetype={Journal},
}

@UNPUBLISHED{eck:mipsworkshop2004,
    author = {Eck, Douglas},
     title = {Bridging Long Timelags in Music},
      year = {2004},
      note = {NIPS 2004 Workshop on Music and Machine Learning (MIPS), Whistler, British Columbia},
  abstract = {Slides and musical examples available on request.},
source={OwnPublication},
sourcetype={Workshop},
optkey={""},
optmonth={""},
optannote={""},
}

@ARTICLE{eck:mp2006,
    author = {Eck, Douglas},
     title = {Finding Long-Timescale Musical Structure with an Autocorrelation Phase Matrix},
   journal = {Music Perception},
    volume = {24},
    number = {2},
      year = {2006},
     pages = {167--176},
source={OwnPublication},
sourcetype={Journal},
}

@UNPUBLISHED{eck:nipsworkshop2003,
    author = {Eck, Douglas},
     title = {Time-warped hierarchical structure in music and speech: A sequence prediction challenge},
      year = {2003},
      note = {NIPS 2003 Workshop on Recurrent Neural Networks, Whistler, British Columbia},
  abstract = {Slides and musical examples available on request.},
source={OwnPublication},
sourcetype={Workshop},
optkey={""},
optmonth={""},
optannote={""},
}

@UNPUBLISHED{eck:nipsworkshop2006,
    author = {Eck, Douglas},
     title = {Generating music sequences with an echo state network},
      year = {2006},
      note = {NIPS 2006 Workshop on Echo State Networks and Liquid State Machines},
  abstract = {Slides and musical examples available on request.},
source={OwnPublication},
sourcetype={Workshop},
optkey={""},
optmonth={""},
optannote={""},
}

@UNPUBLISHED{eck:nipsworkshop2007,
    author = {Eck, Douglas},
     title = {Measuring and modeling musical expression},
      year = {2007},
      note = {NIPS 2007 Workshop on Music, Brain and Cognition},
source={OwnPublication},
sourcetype={Workshop},
optkey={""},
optmonth={""},
optannote={""},
}

@ARTICLE{eck:psyres2002,
    author = {Eck, Douglas},
     title = {Finding Downbeats with a Relaxation Oscillator},
   journal = {Psychol. Research},
    volume = {66},
    number = {1},
      year = {2002},
     pages = {18--25},
  abstract = {A relaxation oscillator model of neural spiking dynamics is applied to the task of finding downbeats in rhythmical patterns. The importance of downbeat discovery or {\em beat induction} is discussed, and the relaxation oscillator model is compared to other oscillator models. In a set of computer simulations the model is tested on 35 rhythmical patterns from Povel \& Essens (1985). The model performs well, making good predictions in 34 of 35 cases. In an analysis we identify some shortcomings of the model and relate model behavior to dynamical properties of relaxation oscillators.},
source={OwnPublication},
sourcetype={Journal},
}

@UNPUBLISHED{eck:rppw2005,
    author = {Eck, Douglas},
     title = {Meter and Autocorrelation},
      year = {2005},
      note = {{10th Rhythm Perception and Production Workshop (RPPW), Alden Biesen, Belgium}},
source={OwnPublication},
sourcetype={Workshop},
}

@TECHREPORT{eck:tr-music2002,
       author = {Eck, Douglas and Schmidhuber, Juergen},
        title = {A First Look at Music Composition using {LSTM} Recurrent Neural Networks},
       number = {IDSIA-07-02},
         year = {2002},
  institution = {IDSIA},
     abstract = {In general music composed by recurrent neural networks ({RNN}s) suffers from a lack of global structure. Though networks can learn note-by-note transition probabilities and even reproduce phrases, attempts at learning an entire musical form and using that knowledge to guide composition have been unsuccessful. The reason for this failure seems to be that {RNN}s cannot keep track of temporally distant events that indicate global music structure. Long Short-Term Memory ({LSTM}) has succeeded in similar domains where other {RNN}s have failed, such as timing \& counting and CSL learning. In the current study I show that {LSTM} is also a good mechanism for learning to compose music. I compare this approach to previous attempts, with particular focus on issues of data representation. I present experimental results showing that {LSTM} successfully learns a form of blues music and is able to compose novel (and I believe pleasing) melodies in that style. Remarkably, once the network has found the relevant structure it does not drift from it: {LSTM} is able to play the blues with good timing and proper structure as long as one is willing to listen. {\em Note: This is a more complete version of the 2002 ICANN submission Learning the Long-Term Structure of the Blues.}},
source={OwnPublication},
sourcetype={TechReport},
}

@TECHREPORT{eck:tr-npos2000,
       author = {Eck, Douglas},
        title = {A Positive-Evidence Model for Classifying Rhythmical Patterns},
       number = {IDSIA-09-00},
         year = {2000},
  institution = {IDSIA},
     abstract = {The Normalized Positive (NPOS) model is a novel matching model that predicts downbeat location and pattern complexity in rhythmical patterns. Though similar models report success, the NPOS model is particularly effective at making these predictions while at the same time being theoretically and mathematically simple. In this paper, the details of the model are explored and a comparison is made to existing models. Several datasets are used to examine the complexity predictions of the model. Special attention is paid to the model's ability to account for the effects of musical experience on rhythm perception.\\ {\em Note: See the 2001 Journal of New Music Research paper "A Positive-Evidence Model for Rhythmical Beat Induction" for a newer version of this paper.}},
ps={ftp://ftp.idsia.ch/pub/techrep/IDSIA-09-00.ps.gz},
source={OwnPublication},
sourcetype={TechReport},
}

@TECHREPORT{eck:tr-oscnet2001,
       author = {Eck, Douglas},
        title = {A Network of Relaxation Oscillators that Finds Downbeats in Rhythms},
       number = {IDSIA-06-01},
         year = {2001},
  institution = {IDSIA},
     abstract = {A network of relaxation oscillators is used to find downbeats in rhythmical patterns. In this study, a novel model is described in detail. Its behavior is tested by exposing it to patterns having various levels of rhythmic complexity. We analyze the performance of the model and relate its success to previous work dealing with fast synchrony in coupled oscillators. \\ {\em Note: See the 2001 ICANN conference proceeding by the same title for a newer version of this paper.}},
ps={ftp://ftp.idsia.ch/pub/techrep/IDSIA-06-01.ps.gz},
source={OwnPublication},
sourcetype={TechReport},
}

@TECHREPORT{eck:tr-tracking2000,
       author = {Eck, Douglas},
        title = {Tracking Rhythms with a Relaxation Oscillator},
       number = {IDSIA-10-00},
         year = {2000},
  institution = {IDSIA},
     abstract = {A number of biological and mechanical processes are typified by a continued slow accrual and fast release of energy. A nonlinear oscillator exhibiting this slow-fast behavior is called a relaxation oscillator and is used to model, for example, human heartbeat pacemaking and neural action potential. Similar limit cycle oscillators are used to model a wider range of behaviors including predator-prey relationships and synchrony in animal populations such as fireflies. Though nonlinear limit-cycle oscillators have been successfully applied to beat induction, relaxation oscillators have received less attention. In this work we offer a novel and effective relaxation oscillator model of beat induction. We outline the model in detail and provide a perturbation analysis of its response to external stimuli. In a series of simulations we expose the model to patterns from Experiment 1 of Povel \& Essens (1985). We then examine the beat assignments of the model. Although the overall performance of the model is very good, there are shortcomings. We believe that a network of mutually-coupled oscillators will address many of these shortcomings, and we suggest an appropriate course for future research.\\ {\em Note: See the 2001 {\em Psychological Research} article "Finding Downbeats with a Relaxation Oscillator" for a revised but less detailed version of this paper.}},
ps={ftp://ftp.idsia.ch/pub/techrep/IDSIA-10-00.ps.gz},
source={OwnPublication},
sourcetype={TechReport},
}

@TECHREPORT{eck:tr-tracking2002,
       author = {Eck, Douglas},
        title = {Real-Time Musical Beat Induction with Spiking Neural Networks},
       number = {IDSIA-22-02},
         year = {2002},
  institution = {IDSIA},
     abstract = {Beat induction is best described by analogy to the activities of hand clapping or foot tapping, and involves finding important metrical components in an auditory signal, usually music. Though beat induction is intuitively easy to understand it is difficult to define and still more difficult to perform automatically. We will present a model of beat induction that uses a spiking neural network as the underlying synchronization mechanism. This approach has some advantages over existing methods; it runs online, responds at many levels in the metrical hierarchy, and produces good results on performed music (Beatles piano performances encoded as MIDI). In this paper the model is described in some detail and simulation results are discussed.},
source={OwnPublication},
sourcetype={TechReport},
}

@UNPUBLISHED{eck:verita2002,
    author = {Eck, Douglas},
     title = {Real Time Beat Induction with Spiking Neurons},
      year = {2002},
      note = {{Music, Motor Control and the Mind: Symposium at Monte Verita, May}},
  abstract = {Beat induction is best described by analogy to the activites of hand clapping or foot tapping, and involves finding important metrical components in an auditory signal, usually music. Though beat induction is intuitively easy to understand it is difficult to define and still more difficult to model. I will discuss an approach to beat induction that uses a network of spiking neurons to synchronize with periodic components in a signal at many timescales. Through a competitive process, groups of oscillators embodying a particular metrical interpretation (e.g. \"4/4\") are selected from the network and used to track the pattern. I will compare this model to other approaches including a traditional symbolic AI system (Dixon 2001), and one based on Bayesian statistics (Cemgil et al, 2001). Finally I will present performance results of the network on a set of MIDI-recorded piano performances of Beatles songs collected by the Music, Mind, Machine Group, NICI, University of Nijmegen (see Cemgil et al, 2001 for more details or http://www.nici.kun.nl/mmm).},
source={OwnPublication},
sourcetype={Workshop},
}

@INPROCEEDINGS{ElHihi+Bengio-nips8,
    author = {El Hihi, Salah and Bengio, Yoshua},
     title = {Hierarchical Recurrent Neural Networks for Long-Term Dependencies},
      year = {1996},
  crossref = {NIPS8-shorter},
  abstract = {We have already shown that extracting lone-term dependencies from sequential data is difficult, both for deterministic dynamical systems such as recurrent networks, and probabilistic models such as hidden {Markov} models ({HMM}s) or input/output hidden {Markov} models ({IOHMM}s). In practice, to avoid this problem, researchers have used domain specific a-priori knowledge to give meaning to the hidden or state variables representing past context. In this paper we propose to use a more general type of a-priori knowledge, namely that the temporal dependencies are structured hierarchically. This implies that long-term dependencies are represented by variables with a long time scale. This principle is applied to a recurrent network which includes delays and multiple time scales. Experiments confirm the advantages of such structures. A similar approach is proposed for {HMM}s and {IOHMM}s.},
topics={LongTerm},cat={C},
}

@ARTICLE{Erhan+al-2010,
    author = {Erhan, Dumitru and Bengio, Yoshua and Courville, Aaron and Manzagol, Pierre-Antoine and Vincent, Pascal and Bengio, Samy},
     title = {Why Does Unsupervised Pre-training Help Deep Learning?},
    volume = {11},
      year = {2010},
     pages = {625--660},
  crossref = {JMLR-shorter},
  abstract = {Much recent research has been devoted to learning algorithms for deep architectures such as Deep Belief Networks and stacks of auto-encoder variants, with impressive results obtained in several areas, mostly on vision and language datasets. The best results obtained on supervised learning tasks involve an unsupervised learning component, usually in an unsupervised pre-training phase. Even though these new algorithms have enabled training deep models, many questions remain as to the nature of this difficult learning problem. The main question investigated here is the following: why does unsupervised pre-training work and why does it work so well? Answering these questions is important if learning in deep architectures is to be further improved. We propose several explanatory hypotheses and test them through extensive simulations. We empirically show the influence of pre-training with respect to architecture depth, model capacity, and number of training examples. The experiments confirm and clarify the advantage of unsupervised pre-training. The results suggest that unsupervised pre-training guides the learning towards basins of attraction of minima that are better in terms of the underlying data distribution; the evidence from these results supports a regularization explanation for the effect of pre-training.}
}

@INPROCEEDINGS{Erhan-aistats-2010,
     author = {Erhan, Dumitru and Courville, Aaron and Bengio, Yoshua and Vincent, Pascal},
      title = {Why Does Unsupervised Pre-training Help Deep Learning?},
  booktitle = {Proceedings of AISTATS 2010},
     volume = {9},
       year = {2010},
      pages = {201-208},
   abstract = {Much recent research has been devoted to learning algorithms for deep architectures such as Deep Belief Networks and stacks of auto-encoder variants with impressive results being obtained in several areas, mostly on vision and language datasets. The best results obtained on supervised learning tasks often involve an unsupervised learning component, usually in an unsupervised pre-training phase. The main question investigated here is the following: why does unsupervised pre-training work so well? Through extensive experimentation, we explore several possible explanations discussed in the literature including its action as a regularizer (Erhan et al. 2009) and as an aid to optimization (Bengio et al. 2007). Our results build on the work of Erhan et al. 2009, showing that unsupervised pre-training appears to play predominantly a regularization role in subsequent supervised training. However our results in an online setting, with a virtually unlimited data stream, point to a somewhat more nuanced interpretation of the roles of optimization and regularization in the unsupervised pre-training effect.}
}

@MASTERSTHESIS{Erhan-MSc,
    author = {Erhan, Dumitru},
  keywords = {Apprentisage multit{\^{a}}che, Filtrage collaboratif, M{\'{e}}thodes {\`{a}} noyaux, QSAR, R{\'{e}}seaux de neurones},
     title = {Collaborative filtering techniques for drug discovery},
      year = {2006},
    school = {Universit{\'{e}} de Montr{\'{e}}al},
  abstract = {Cette th{\`{e}}se examine le probl{\`{e}}me d'apprendre plusieurs t{\^{a}}ches simultan{\'{e}}ment,
afin de transf{\'{e}}rer les connaissances apprises {\`{a}} une nouvelle t{\^{a}}che. Si
on suppose que les t{\^{a}}ches partagent une repr{\'{e}}sentation et qu'il est possible de
d{\'{e}}couvrir cette repr{\'{e}}sentation efficacement, cela peut nous servir {\`{a}} construire un
meilleur mod{\`{e}}le de la nouvelle t{\^{a}}che. Il existe plusieurs variantes de
cette m{\'{e}}thode: transfert inductif, apprentisage multit{\^{a}}che, filtrage
collaboratif etc. Nous avons {\'{e}}valu{\'{e}} plusieurs algorithmes d'apprentisage
supervis{\'{e}} pour d{\'{e}}couvrir des repr{\'{e}}sentations partag{\'{e}}es parmi les
t{\^{a}}ches d{\'{e}}finies dans un probl{\`{e}}me de chimie computationelle. Nous avons
formul{\'{e}} le probl{\`{e}}me dans un cadre d'apprentisage automatique,
fait l'analogie avec les algorithmes standards de filtrage collaboratif et construit les
hypoth{\`{e}}ses g{\'{e}}n{\'{e}}rales qui devraient {\^{e}}tre test{\'{e}}es pour valider l'utilitisation des
algorithmes multit{\^{a}}che. Nous avons aussi {\'{e}}valu{\'{e}} la performance des algorithmes
d'apprentisage utilis{\'{e}}s et d{\'{e}}montrons qu'il est, en effet, possible de trouver une
repr{\'{e}}sentation partag{\'{e}}e pour le probl{\`{e}}me consider{\'{e}}. Du point de vue
th{\'{e}}orique, notre apport est une modification d'un algorithme
standard---les machines {\`{a}} vecteurs de support--qui produit des r{\'{e}}sultats
comparables aux meilleurs algorithmes disponsibles et qui utilise {\`{a}} fond les
concepts de l'apprentisage multit{\^{a}}che. Du point de vue pratique, notre
apport est l'utilisation de notre algorithme par les compagnies
pharmaceutiques dans leur d{\'{e}}couverte de nouveaux m{\'{e}}dicaments.}
}

@TECHREPORT{Erhan-vis-techreport-2010,
       author = {Erhan, Dumitru and Courville, Aaron and Bengio, Yoshua},
        title = {Understanding Representations Learned in Deep Architectures},
       number = {1355},
         year = {2010},
  institution = {Universit{\'{e}} de Montr{\'{e}}al/DIRO},
     abstract = {Deep architectures have demonstrated state-of-the-art performance in a variety of
settings, especially with vision datasets. Deep learning algorithms are based on learning
several levels of representation of the input. Beyond test-set performance, there
is a need for qualitative comparisons of the solutions learned by various deep architectures,
focused on those learned representations. One of the goals of our research
is to improve tools for finding good qualitative interpretations of high level features
learned by such models. We also seek to gain insight into the invariances learned by
deep networks. To this end, we contrast and compare several techniques for finding
such interpretations. We applied our techniques on Stacked Denoising Auto-Encoders
and Deep Belief Networks, trained on several vision datasets. We show that consistent
filter-like interpretation is possible and simple to accomplish at the unit level. The tools
developed make it possible to analyze deep models in more depth and accomplish the
tracing of invariance manifolds for each of the hidden units. We hope that such techniques
will allow researchers in deep architectures to understand more of how and why
deep architectures work.}
}

@INPROCEEDINGS{Erhan2009,
    author = {Erhan, Dumitru and Manzagol, Pierre-Antoine and Bengio, Yoshua and Bengio, Samy and Vincent, Pascal},
  keywords = {Deep Networks},
     title = {The Difficulty of Training Deep Architectures and the effect of Unsupervised Pre-Training},
      year = {2009},
     pages = {153--160},
  crossref = {xAISTATS2009-shorter},
  abstract = {Whereas theoretical work suggests that deep architectures might be more efficient at representing highly-varying functions, training deep architectures was unsuccessful until the recent advent of algorithms based on unsupervised pretraining. Even though these new algorithms have enabled training deep models, many questions remain as to the nature of this difficult learning problem. Answering these questions is important if learning in deep architectures is to be further improved. We attempt to shed some light on these questions through extensive simulations. The experiments confirm and clarify the advantage of unsupervised pre-training. They demonstrate the robustness of the training procedure with respect to the random initialization, the positive effect of pre-training in terms of optimization and its role as a regularizer. We empirically show the influence of pre-training with respect to architecture depth, model capacity, and number of training examples.}
}

@ARTICLE{gasser+eck+port:1999,
    author = {Gasser, M. and Eck, Douglas and Port, Robert},
     title = {Meter as Mechanism: A Neural Network Model that Learns Metrical patterns},
   journal = {Connection Science},
    volume = {11},
    number = {2},
      year = {1999},
     pages = {187--216},
  abstract = {One kind of prosodic structure that apparently underlies both music and some examples of speech production is meter. Yet detailed measurements of the timing of both music and speech show that the nested periodicities that define metrical structure can be quite noisy in time. What kind of system could produce or perceive such variable metrical timing patterns? And what would it take to be able to store and reproduce particular metrical patterns from long-term memory? We have developed a network of coupled oscillators that both produces and perceives patterns of pulses that conform to particular meters. In addition, beginning with an initial state with no biases, it can learn to prefer the particular meter that it has been previously exposed to.},
own={Have},
source={OwnPublication},
sourcetype={Journal},
}

@TECHREPORT{gasser+eck+port:tr-1996,
       author = {Gasser, M. and Eck, Douglas and Port, Robert},
        title = {Meter as Mechanism A Neural Network that Learns Metrical Patterns},
       number = {180},
         year = {1996},
  institution = {Indiana University Cognitive Science Program},
source={OwnPublication},
sourcetype={TechReport},
}

@INPROCEEDINGS{gasser+eck:1996,
     author = {Gasser, M. and Eck, Douglas},
     editor = {},
      title = {Representing Rhythmic Patterns in a Network of Oscillators},
  booktitle = {{The Proceedings of the International Conference on Music Perception and Cognition}},
     number = {4},
       year = {1996},
      pages = {361--366},
  publisher = {Lawrence Erlbaum Associates},
        url = {http://www.iro.umontreal.ca/~eckdoug/papers/1996_gasser_icmpc.pdf},
   abstract = {This paper describes an evolving computational model of the perception and pro-duction of simple rhythmic patterns. The model consists of a network of oscillators of different resting frequencies which couple with input patterns and with each other. Os-cillators whose frequencies match periodicities in the input tend to become activated. Metrical structure is represented explicitly in the network in the form of clusters of os-cillators whose frequencies and phase angles are constrained to maintain the harmonic relationships that characterize meter. Rests in rhythmic patterns are represented by ex-plicit rest oscillators in the network, which become activated when an expected beat in the pattern fails to appear. The model makes predictions about the relative difficulty of patterns and the effect of deviations from periodicity in the input.},
source={OwnPublication},
sourcetype={Conference},
}

@INPROCEEDINGS{gers+eck+schmidhuber:icann2001,
     author = {Gers, F. A. and Eck, Douglas and Schmidhuber, Juergen},
     editor = {Dorffner, Georg},
      title = {Applying {LSTM} to Time Series Predictable Through Time-Window Approaches},
  booktitle = {{Artificial Neural Networks -- ICANN 2001 (Proceedings)}},
       year = {2001},
      pages = {669--676},
  publisher = {Springer},
        url = {http://www.iro.umontreal.ca/~eckdoug/papers/2001_gers_icann.pdf},
   abstract = {Long Short-Term Memory ({LSTM}) is able to solve many time series tasks unsolvable by feed-forward networks using fixed size time windows. Here we find that {LSTM}'s superiority does {\em not} carry over to certain simpler time series tasks solvable by time window approaches: the Mackey-Glass series and the Santa Fe {FIR} laser emission series (Set A). This suggests t use {LSTM} only when simpler traditional approaches fail.},
source={OwnPublication},
sourcetype={Conference},
}

@TECHREPORT{gers+eck+schmidhuber:tr-2000,
       author = {Gers, F. A. and Eck, Douglas and Schmidhuber, Juergen},
        title = {Applying {LSTM} to Time Series Predictable Through Time-Window Approaches},
       number = {IDSIA-22-00},
         year = {2000},
  institution = {IDSIA},
     abstract = {Long Short-Term Memory ({LSTM}) is able to solve many time series tasks unsolvable by feed-forward networks using fixed size time windows. Here we find that {LSTM}'s superiority does {\em not} carry over to certain simpler time series tasks solvable by time window approaches: the Mackey-Glass series and the Santa Fe {FIR} laser emission series (Set A). This suggests t use {LSTM} only when simpler traditional approaches fail.\\ {\em Note: See the 2001 ICANN conference proceeding by the same title for a newer version of this paper.}},
ps={ftp://ftp.idsia.ch/pub/techrep/IDSIA-22-00.ps.gz},
source={OwnPublication},
sourcetype={TechReport},
}

@INPROCEEDINGS{gers+perez+eck+schmidhuber:esann2002,
     author = {Gers, F. A. and Perez-Ortiz, J. A. and Eck, Douglas and Schmidhuber, Juergen},
      title = {{DEKF-LSTM}},
  booktitle = {Proceedings of the 10th European Symposium on Artificial Neural Networks, ESANN 2002},
       year = {2002},
source={OwnPublication},
sourcetype={Conference},
}

@INPROCEEDINGS{gers+perez+eck+schmidhuber:icannA2002,
     author = {Gers, F. A. and Perez-Ortiz, J. A. and Eck, Douglas and Schmidhuber, Juergen},
     editor = {Dorronsoro, J.},
      title = {Learning Context Sensitive Languages with {LSTM} Trained with {Kalman} Filters},
  booktitle = {{Artificial Neural Networks -- ICANN 2002 (Proceedings)}},
       year = {2002},
      pages = {655--660},
  publisher = {Springer},
   abstract = {Unlike traditional recurrent neural networks, the Long Short-Term Memory ({LSTM}) model generalizes well when presented with training sequences derived from regular and also simple nonregular languages. Our novel combination of {LSTM} and the decoupled extended Kalman filter, however, learns even faster and generalizes even better, requiring only the 10 shortest exemplars n <= 10 of the context sensitive language a^nb^nc^n to deal correctly with values of n up to 1000 and more. Even when we consider the relatively high update complexity per timestep, in many cases the hybrid offers faster learning than {LSTM} by itself.},
source={OwnPublication},
sourcetype={Conference},
}

@PHDTHESIS{Ghosn-Phd-2003,
    author = {Ghosn, Joumana},
     title = {Apprentissage multi-t{\^{a}}ches et partage de connaissances},
      year = {2003},
    school = {Universit{\'{e}} de Montr{\'{e}}al}
}

@INPROCEEDINGS{ghosn97,
     author = {Ghosn, Joumana and Bengio, Yoshua},
      title = {Multi-Task Learning for Stock Selection},
       year = {1997},
      pages = {946--952},
  publisher = {MIT Press, Cambridge, MA},
        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/multitask-nips97.pdf},
   crossref = {NIPS9},
   abstract = {Artificial Neural Networks can be used to predict future returns of stocks in order to take financial decisions. Should one build a separate network for each stock or share the same network for all the stocks. In this paper we also explore other alternatives, in which some layers are shared and others are not shared. When the prediction of future returns for different stocks are viewed as different tasks, sharing some parameters across stocks is a form of multi-task learning. In a series of experiments with Canadian stocks, we obtain yearly returns that are more than 14\% above various benchmarks.},
topics={MultiTask,Finance},cat={C},
}

@TECHREPORT{Gingras-asynchronous-TR96,
       author = {Gingras, Fran{\c c}ois and Bengio, Yoshua},
        title = {Handling asynchronous or missing financial data with recurrent networks},
       number = {1020},
         year = {1996},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
topics={Finance,Missing},cat={T},
}

@TECHREPORT{Gingras-financial-TR99,
       author = {Gingras, Fran{\c c}ois and Bengio, Yoshua and Nadeau, Claude},
        title = {On Out-of-Sample Statistics for Financial Time-Series},
       number = {2585},
         year = {1999},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
topics={Comparative,Finance},cat={T},
}

@INPROCEEDINGS{gingras2000,
     author = {Gingras, Fran{\c c}ois and Bengio, Yoshua and Nadeau, Claude},
      title = {On Out-of-Sample Statistics for Time-Series},
  booktitle = {Computational Finance 2000},
       year = {2000},
        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/out-err-cf2000.pdf},
   abstract = {This paper studies an out-of-sample statistic for time-series prediction that is analogous to the widely used R2 in-sample statistic. We propose and study methods to estimate the variance of this out-of-sample statistic. We suggest that the out-of-sample statistic is more robust to distributional and asymptotic assumptions behind many tests for in-sample statistics. Furthermore we argue that it may be more important in some cases to choose a model that generalizes as well as possible rather than choose the parameters that are closest to the true parameters. Comparative experiments are performed on a financial time-series (daily and monthly returns of the TSE300 index). The experiments are performed or varying prediction horizons and we study the relation between predictibility (out-of-sample R2), variability of the out-of-sample R2 statistic, and the prediction horizon.},
topics={Comparative,Finance},cat={C},
}

@INPROCEEDINGS{GlorotAISTATS2010,
     author = {Bengio, Yoshua and Glorot, Xavier},
      title = {Understanding the difficulty of training deep feedforward neural networks},
  booktitle = {Proceedings of AISTATS 2010},
     volume = {9},
       year = {2010},
      pages = {249-256},
   abstract = {Whereas before 2006 it appears that deep multi-layer neural networks were not successfully trained, since then several algorithms have been shown to successfully train them, with experimental results showing the superiority of deeper vs less deep architectures. All these experimental results were obtained with new initialization or training mechanisms. Our objective here is to understand better why standard gradient descent from random initialization is doing so poorly with deep neural networks, to better understand these recent relative successes and help design better algorithms in the future. We first observe the influence of the non-linear activations functions. We find that the logistic sigmoid activation is unsuited for deep networks with random initialization because of its mean value, which can drive especially the top hidden layer into saturation. Surprisingly, we find that saturated units can move out of saturation by themselves, albeit slowly, and explaining the plateaus sometimes seen when training neural networks. We find that a new non-linearity that saturates less can often be beneficial. Finally, we study how activations and gradients vary across layers and during training, with the idea that training may be more difficult when the singular values of the Jacobian associated with each layer are far from 1. Based on these considerations, we propose a new initialization scheme that brings substantially faster convergence.}
}

@INPROCEEDINGS{Gori89,
     author = {Gori, Marco and Bengio, Yoshua and De Mori, Renato},
      title = {BPS: a learning algorithm for capturing the dynamic nature of speech},
  booktitle = {International Joint Conference on Neural Networks},
     volume = {2},
       year = {1989},
      pages = {417--424},
  publisher = {IEEE, New York},
topics={Speech},cat={C},
}

@INCOLLECTION{Grandvalet+Bengio-ssl-2006,
     author = {Grandvalet, Yves and Bengio, Yoshua},
     editor = {Chapelle, Olivier and {Sch{\"{o}}lkopf}, Bernhard and Zien, Alexander},
      title = {Entropy Regularization},
  booktitle = {Semi-Supervised Learning},
       year = {2006},
      pages = {151--168},
  publisher = {{MIT} Press},
        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/entropy_regularization_2006.pdf},
   abstract = {The problem of semi-supervised induction consists in learning a decision rule from
labeled and unlabeled data. This task can be undertaken by discriminative methods,
provided that learning criteria are adapted consequently. In this chapter, we motivate the use of entropy regularization as a means to benefit from unlabeled data in
the framework of maximum a posteriori estimation. The learning criterion is derived
from clearly stated assumptions and can be applied to any smoothly parametrized
model of posterior probabilities. The regularization scheme favors low density separation, without any modeling of the density of input features. The contribution
of unlabeled data to the learning criterion induces local optima, but this problem
can be alleviated by deterministic annealing. For well-behaved models of posterior
probabilities, deterministic annealing {EM} provides a decomposition of the learning
problem in a series of concave subproblems. Other approaches to the semi-supervised
problem are shown to be close relatives or limiting cases of entropy regularization.
A series of experiments illustrates the good behavior of the algorithm in terms of
performance and robustness with respect to the violation of the postulated low density separation assumption. The minimum entropy solution benefits from unlabeled
data and is able to challenge mixture models and manifold learning in a number of
situations.},
cat={B},topics={Unsupervised},
}

@INPROCEEDINGS{graves+eck+schmidhuber:bio-adit2004,
     author = {Graves, A. and Eck, Douglas and Beringer, N. and Schmidhuber, Juergen},
      title = {Biologically Plausible Speech Recognition with {LSTM} Neural Nets},
  booktitle = {Proceedings of the First Int'l Workshop on Biologically Inspired Approaches to Advanced Information Technology (Bio-ADIT)},
       year = {2004},
      pages = {127-136},
        url = {http://www.iro.umontreal.ca/~eckdoug/papers/2004_bioadit.pdf},
   abstract = {Long Short-Term Memory ({LSTM}) recurrent neural networks ({RNN}s) are local in space and time and closely related to a biological model of memory in the prefrontal cortex. Not only are they more biologically plausible than previous artificial {RNN}s, they also outperformed them on many artificially generated sequential processing tasks. This encouraged us to apply {LSTM} to more realistic problems, such as the recognition of spoken digits. Without any modification of the underlying algorithm, we achieved results comparable to state-of-the-art Hidden {Markov} Model ({HMM}) based recognisers on both the {TIDIGITS} and TI46 speech corpora. We conclude that {LSTM} should be further investigated as a biologically plausible basis for a bottom-up, neural net-based approach to speech recognition.},
source={OwnPublication},
sourcetype={Conference},
}

@TECHREPORT{graves+eck+schmidhuber:tr-digits2003,
       author = {Graves, A. and Eck, Douglas and Schmidhuber, Juergen},
        title = {Comparing {LSTM} Recurrent Networks and Spiking Recurrent Networks on the Recognition of Spoken Digits},
       number = {IDSIA-13-03},
         year = {2003},
  institution = {IDSIA},
     abstract = {One advantage of spiking recurrent neural networks ({SNN}s) is an ability to categorise data using a synchrony-based latching mechnanism. This is particularly useful in problems where timewarping is encountered, such as speech recognition. Differentiable recurrent neural networks ({RNN}s) by contrast fail at tasks involving difficult timewarping, despite having sequence learning capabilities superior to {SNN}s. In this paper we demonstrate that Long Short-Term Memory ({LSTM}) is an {RNN} capable of robustly categorizing timewarped speech data, thus combining the most useful features of both paradigms. We compare its performance to {SNN}s on two variants of a spoken digit identification task, using data from an international competition. The first task (described in Nature (Nadis 2003)) required the categorisation of spoken digits with only a single training exemplar, and was specifically designed to test robustness to timewarping. Here {LSTM} performed better than all the {SNN}s in the competition. The second task was to predict spoken digits using a larger training set. Here {LSTM} greatly outperformed an {SNN}-like model found in the literature. These results suggest that {LSTM} has a place in domains that require the learning of large timewarped datasets, such as automatic speech recognition.},
source={OwnPublication},
sourcetype={TechReport},
}

@INPROCEEDINGS{haffner-98,
     author = {Haffner, Patrick and Bottou, {L{\'{e}}on} and G. Howard, Paul and Simard, Patrice and Bengio, Yoshua and {LeCun}, Yann},
      title = {Browsing through High Quality Document Images with {DjVu}},
  booktitle = {Proc. of Advances in Digital Libraries 98},
       year = {1998},
      pages = {309--318},
  publisher = {IEEE},
        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/haffner-98.ps.gz},
topics={HighDimensional},cat={C},
}

@INPROCEEDINGS{Hamel+al-2009,
     author = {Hamel, Philippe and Wood, Sean and Eck, Douglas},
      title = {Automatic Identification of Instrument Classes in Polyphonic and Poly-Instrument Audio},
  booktitle = {10th International Society for Music Information Retrieval Conference},
       year = {2009},
      pages = {399--404},
        url = {http://ismir2009.ismir.net/proceedings/PS3-2.pdf},
   abstract = {We present and compare several models for automatic identification of instrument classes in polyphonic and poly-instrument audio. The goal is to be able to identify which categories of instrument (Strings, Woodwind, Guitar, Piano, etc.) are present in a given audio example. We use a machine learning approach to solve this task. We constructed a system to generate a large database of musically relevant poly-instrument audio. Our database is generated from hundreds of instruments classified in 7 categories. Musical audio examples are generated by mixing multi-track MIDI files with thousands of instrument combinations. We compare three different classifiers : a Support Vector Machine ({SVM}), a Multilayer Perceptron (MLP) and a Deep Belief Network (DBN). We show that the DBN tends to outperform both the {SVM} and the MLP in most cases.}
}

@MISC{Hugo+al-snowbird-2007,
        author = {Larochelle, Hugo and Bengio, Yoshua and Erhan, Dumitru},
         title = {Generalization to a zero-data task: an empirical study},
          year = {2007},
  howpublished = {Talk and poster presented at the Learning Workshop(Snowbird), San Juan, Puerto Rico, 2007}
}

@INPROCEEDINGS{hyper:2000:ijcnn,
     author = {Bengio, Yoshua},
      title = {Continuous Optimization of Hyper-Parameters},
  booktitle = {International Joint Conference on Neural Networks 2000},
     volume = {I},
       year = {2000},
      pages = {305--310},
        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/hyper-ijcnn2000.pdf},
   abstract = {Many machine learning algorithms can be formulated as the minimization of a training criterion which involves a hyper-parameter. This hyper-parameter is usually chosen by trial and error with a model selection criterion. In this paper we present a methodology to optimize several hyper-parameters, based on the computation of the gradient of a model selection criterion with respect to the hyper-parameters. In the case of a quadratic training criterion, the gradient of the selection criterion with respect to the hyper-parameters is efficiently computed by back-propagating through a Cholesky decomposition. In the more general case, we show that the implicit function theorem can be used to derive a formula for the hyper-parameter gradient involving second derivatives of the training criterion.},
topics={ModelSelection},cat={C},
}

@INPROCEEDINGS{ICML01,
     editor = {Brodley, Carla E. and Danyluk, Andrea Pohoreckyj},
      title = {Proceedings of the Eighteenth International Conference on Machine Learning (ICML'01)},
  booktitle = {Proceedings of the Eighteenth International Conference on Machine Learning (ICML'01)},
       year = {-1},
  publisher = {Morgan Kaufmann}
}

@INPROCEEDINGS{ICML01-short,
     editor = {Brodley, Carla E. and Danyluk, Andrea Pohoreckyj},
      title = {Proceedings of the Eighteenth International Conference on Machine Learning (ICML'01)},
  booktitle = {ICML'01},
       year = {-1},
  publisher = {Morgan Kaufmann}
}


@INPROCEEDINGS{ICML02,
     editor = {Sammut, Claude and Hoffmann, Achim G.},
      title = {Proceedings of the Nineteenth International Conference on Machine Learning (ICML'02)},
  booktitle = {Proceedings of the Nineteenth International Conference on Machine Learning (ICML'02)},
       year = {-1},
  publisher = {Morgan Kaufmann}
}

@INPROCEEDINGS{ICML02-short,
     editor = {Sammut, Claude and Hoffmann, Achim G.},
      title = {Proceedings of the Nineteenth International Conference on Machine Learning (ICML'02)},
  booktitle = {ICML'02},
       year = {-1},
  publisher = {Morgan Kaufmann}
}


@INPROCEEDINGS{ICML03,
     editor = {Fawcett, Tom and Mishra, Nina},
      title = {Proceedings of the Twenty International Conference on Machine Learning (ICML'03)},
  booktitle = {Proceedings of the Twenty International Conference on Machine Learning (ICML'03)},
       year = {-1},
  publisher = {AAAI Press}
}

@INPROCEEDINGS{ICML03-short,
     editor = {Fawcett, Tom and Mishra, Nina},
      title = {Proceedings of the Twenty International Conference on Machine Learning (ICML'03)},
  booktitle = {ICML'03},
       year = {-1},
  publisher = {AAAI Press}
}


@INPROCEEDINGS{ICML04,
     editor = {Brodley, Carla E.},
      title = {Proceedings of the Twenty-first International Conference on Machine Learning (ICML'04)},
  booktitle = {Proceedings of the Twenty-first International Conference on Machine Learning (ICML'04)},
       year = {-1},
  publisher = {ACM}
}

@INPROCEEDINGS{ICML04-short,
     editor = {Brodley, Carla E.},
      title = {Proceedings of the Twenty-first International Conference on Machine Learning (ICML'04)},
  booktitle = {ICML'04},
       year = {-1},
  publisher = {ACM}
}


@INPROCEEDINGS{ICML05-short,
     editor = {Raedt, Luc De and Wrobel, Stefan},
      title = {Proceedings of the Twenty-second International Conference on Machine Learning (ICML'05)},
  booktitle = {ICML'05},
       year = {-1},
  publisher = {ACM}
}


@INPROCEEDINGS{ICML06-short,
     editor = {Cohen, William W. and Moore, Andrew},
      title = {Proceedings of the Twenty-three International Conference on Machine Learning (ICML'06)},
  booktitle = {ICML'06},
       year = {-1},
  publisher = {ACM}
}


@INPROCEEDINGS{ICML07-short,
     editor = {Ghahramani, Zoubin},
      title = {Proceedings of the 24th International Conference on Machine Learning (ICML'07)},
  booktitle = {ICML'07},
       year = {-1},
  publisher = {ACM}
}


@INPROCEEDINGS{ICML08-short,
     editor = {Cohen, William W. and McCallum, Andrew and Roweis, Sam T.},
      title = {Proceedings of the Twenty-fifth International Conference on Machine Learning (ICML'08)},
  booktitle = {ICML'08},
       year = {-1},
  publisher = {ACM}
}


@INPROCEEDINGS{ICML09-short,
     editor = {Bottou, {L{\'{e}}on} and Littman, Michael},
      title = {Proceedings of the Twenty-sixth International Conference on Machine Learning (ICML'09)},
  booktitle = {ICML'09},
       year = {-1},
  publisher = {ACM}
}


@INPROCEEDINGS{ICML96,
     editor = {Saitta, L.},
      title = {Proceedings of the Thirteenth International Conference on Machine Learning (ICML'96)},
  booktitle = {Proceedings of the Thirteenth International Conference on Machine Learning (ICML'96)},
       year = {-1},
  publisher = {Morgan Kaufmann}
}

@INPROCEEDINGS{ICML96-short,
     editor = {Saitta, L.},
      title = {Proceedings of the Thirteenth International Conference on Machine Learning (ICML'96)},
  booktitle = {ICML'96},
       year = {-1},
  publisher = {Morgan Kaufmann}
}


@INPROCEEDINGS{ICML97,
     editor = {Fisher, Douglas H.},
      title = {{}Proceedings of the Fourteenth International Conference on Machine Learning (ICML'97)},
  booktitle = {Proceedings of the Fourteenth International Conference on Machine Learning (ICML'97)},
       year = {-1},
  publisher = {Morgan Kaufmann}
}

@INPROCEEDINGS{ICML97-short,
     editor = {Fisher, Douglas H.},
      title = {{}Proceedings of the Fourteenth International Conference on Machine Learning (ICML'97)},
  booktitle = {ICML'97},
       year = {-1},
  publisher = {Morgan Kaufmann}
}


@INPROCEEDINGS{ICML98,
     editor = {Shavlik, Jude W.},
      title = {Proceedings of the Fifteenth International Conference on Machine Learning (ICML'98)},
  booktitle = {Proceedings of the Fifteenth International Conference on Machine Learning (ICML'98)},
       year = {-1},
  publisher = {Morgan Kaufmann}
}

@INPROCEEDINGS{ICML98-short,
     editor = {Shavlik, Jude W.},
      title = {Proceedings of the Fifteenth International Conference on Machine Learning (ICML'98)},
  booktitle = {ICML'98},
       year = {-1},
  publisher = {Morgan Kaufmann}
}


@INPROCEEDINGS{ICML99,
     editor = {Bratko, Ivan and Dzeroski, Saso},
      title = {Proceedings of the Sixteenth International Conference on Machine Learning (ICML'99)},
  booktitle = {Proceedings of the Sixteenth International Conference on Machine Learning (ICML'99)},
       year = {-1},
  publisher = {Morgan Kaufmann}
}

@INPROCEEDINGS{ICML99-short,
     editor = {Bratko, Ivan and Dzeroski, Saso},
      title = {Proceedings of the Sixteenth International Conference on Machine Learning (ICML'99)},
  booktitle = {ICML'99},
       year = {-1},
  publisher = {Morgan Kaufmann}
}


@INCOLLECTION{jaeger+eck:2007,
     author = {Jaeger, H. and Eck, Douglas},
      title = {Can't get you out of my head: {A} connectionist model of cyclic rehearsal},
  booktitle = {Modeling Communications with Robots and Virtual Humans},
     series = {{LNCS}},
       year = {2007},
  publisher = {Springer-Verlag},
        url = {http://www.iro.umontreal.ca/~eckdoug/papers/2007_jaeger_eck.pdf},
source={OwnPublication},
sourcetype={Chapter},
}

@MISC{James+al-snowbird-2008,
        author = {Bergstra, James and Bengio, Yoshua and Louradour, Jerome},
         title = {Image Classification using Higher-Order Neural Models},
          year = {2008},
  howpublished = {The Learning Workshop (Snowbird, Utah)},
           url = {http://snowbird.djvuzone.org/2007/abstracts/161.pdf}
}

@ARTICLE{JMLR-short,
   journal = {JMLR},
      year = {-1}
}


@INPROCEEDINGS{Kegl+Bertin+Eck-2008,
     author = {K{\'{e}}gl, Bal{\'{a}}zs and Bertin-Mahieux, Thierry and Eck, Douglas},
      title = {Metropolis-Hastings Sampling in a FilterBoost Music Classifier},
  booktitle = {Music and machine learning workshop (ICML08)},
       year = {2008}
}

@INPROCEEDINGS{kegl2005b,
    author = {K{\'{e}}gl, Bal{\'{a}}zs},
     title = {Generalization Error and Algorithmic Convergence of Median Boosting.},
      year = {2005},
  crossref = {NIPS17-shorter},
  abstract = {We have recently proposed an extension of ADABOOST to regression that uses the median of the base regressors as the final regressor. In this paper we extend theoretical results obtained for ADABOOST to median boosting and to its localized variant. First, we extend recent results on efficient margin maximizing to show that the algorithm can converge to the maximum achievable margin within a preset precision in a finite number of steps. Then we provide confidence-interval-type bounds on the generalization error.}
}

@ARTICLE{lacoste+eck:eurasip,
    author = {Lacoste, Alexandre and Eck, Douglas},
     title = {A Supervised Classification Algorithm For Note Onset Detection},
   journal = {EURASIP Journal on Applied Signal Processing},
    volume = {2007},
    number = {ID 43745},
      year = {2007},
     pages = {1--13},
source={OwnPublication},
sourcetype={Journal},
}

@MASTERSTHESIS{Lajoie2009,
    author = {Lajoie, Isabelle},
  keywords = {apprentissage non-supervis{\'{e}}, architecture profonde, auto-encodeur d{\'{e}}bruiteur, machine de {Boltzmann} restreinte, r{\'{e}}seau de neurones artificiel},
     title = {Apprentissage de repr{\'{e}}sentations sur-compl{\`{e}}tes par entra{\^{\i}}nement d’auto-encodeurs},
      year = {2009},
    school = {Universit{\'{e}} de Montr{\'{e}}al},
  abstract = {Les avanc{\'{e}}s dans le domaine de l’intelligence artificielle, permettent {\`{a}} des syst{\`{e}}mes
informatiques de r{\'{e}}soudre des t{\^{a}}ches de plus en plus complexes li{\'{e}}es par exemple {\`{a}}
la vision, {\`{a}} la compr{\'{e}}hension de signaux sonores ou au traitement de la langue. Parmi
les mod{\`{e}}les existants, on retrouve les R{\'{e}}seaux de Neurones Artificiels (RNA), dont la
popularit{\'{e}} a fait un grand bond en avant avec la d{\'{e}}couverte de Hinton et al. [22], soit
l’utilisation de Machines de {Boltzmann} Restreintes (RBM) pour un pr{\'{e}}-entra{\^{\i}}nement
non-supervis{\'{e}} couche apr{\`{e}}s couche, facilitant grandement l’entra{\^{\i}}nement supervis{\'{e}} du
r{\'{e}}seau {\`{a}} plusieurs couches cach{\'{e}}es (DBN), entra{\^{\i}}nement qui s’av{\'{e}}rait jusqu’alors tr{\`{e}}s
difficile {\`{a}} r{\'{e}}ussir. Depuis cette d{\'{e}}couverte, des chercheurs ont {\'{e}}tudi{\'{e}} l’efficacit{\'{e}} de nouvelles strat{\'{e}}gies de pr{\'{e}}-entra{\^{\i}}nement, telles que l’empilement d’auto-encodeurs traditionnels (SAE) [5, 38], et l’empilement d’auto-encodeur d{\'{e}}bruiteur (SDAE) [44].
    C’est dans ce contexte qu’a d{\'{e}}but{\'{e}} la pr{\'{e}}sente {\'{e}}tude. Apr{\`{e}}s un bref passage en revue des notions de base du domaine de l’apprentissage machine et des m{\'{e}}thodes de
pr{\'{e}}-entra{\^{\i}}nement employ{\'{e}}es jusqu’{\`{a}} pr{\'{e}}sent avec les modules RBM, AE et DAE, nous
avons approfondi notre compr{\'{e}}hension du pr{\'{e}}-entra{\^{\i}}nement de type SDAE, explor{\'{e}} ses
diff{\'{e}}rentes propri{\'{e}}t{\'{e}}s et {\'{e}}tudi{\'{e}} des variantes de SDAE comme strat{\'{e}}gie d’initialisation
d’architecture profonde. Nous avons ainsi pu, entre autres choses, mettre en lumi{\`{e}}re
l’influence du niveau de bruit, du nombre de couches et du nombre d’unit{\'{e}}s cach{\'{e}}es
sur l’erreur de g{\'{e}}n{\'{e}}ralisation du SDAE. Nous avons constat{\'{e}} une am{\'{e}}lioration de la
performance sur la t{\^{a}}che supervis{\'{e}}e avec l’utilisation des bruits poivre et sel (PS) et
gaussien (GS), bruits s’av{\'{e}}rant mieux justifi{\'{e}}s que celui utilis{\'{e}} jusqu’{\`{a}} pr{\'{e}}sent, soit le
masque {\`{a}} z{\'{e}}ro (MN). De plus, nous avons d{\'{e}}montr{\'{e}} que la performance profitait d’une
emphase impos{\'{e}}e sur la reconstruction des donn{\'{e}}es corrompues durant l’entra{\^{\i}}nement
des diff{\'{e}}rents DAE. Nos travaux ont aussi permis de r{\'{e}}v{\'{e}}ler que le DAE {\'{e}}tait en mesure d’apprendre, sur des images naturelles, des filtres semblables {\`{a}} ceux retrouv{\'{e}}s dans
les cellules V1 du cortex visuel, soit des filtres d{\'{e}}tecteurs de bordures. Nous aurons par
ailleurs pu montrer que les repr{\'{e}}sentations apprises du SDAE, compos{\'{e}}es des caract{\'{e}}ristiques ainsi extraites, s’av{\'{e}}raient fort utiles {\`{a}} l’apprentissage d’une machine {\`{a}} vecteurs de
support ({SVM}) lin{\'{e}}aire ou {\`{a}} noyau gaussien, am{\'{e}}liorant grandement sa performance de
g{\'{e}}n{\'{e}}ralisation. Aussi, nous aurons observ{\'{e}} que similairement au DBN, et contrairement
au SAE, le SDAE poss{\'{e}}dait une bonne capacit{\'{e}} en tant que mod{\`{e}}le g{\'{e}}n{\'{e}}rateur. Nous
avons {\'{e}}galement ouvert la porte {\`{a}} de nouvelles strat{\'{e}}gies de pr{\'{e}}-entra{\^{\i}}nement et d{\'{e}}couvert le potentiel de l’une d’entre elles, soit l’empilement d’auto-encodeurs rebruiteurs
(SRAE).}
}

@INPROCEEDINGS{lamere+eck:ismir2007,
     author = {Lamere, Paul and Eck, Douglas},
     editor = {},
      title = {Using 3D Visualizations to Explore and Discover Music},
  booktitle = {{Proceedings of the 8th International Conference on Music Information Retrieval ({ISMIR} 2007)}},
       year = {2007},
  publisher = {},
source={OwnPublication},
sourcetype={Conference},
}

@ARTICLE{Larochelle+al-2010,
    author = {Larochelle, Hugo and Bengio, Yoshua and Turian, Joseph},
     title = {Tractable Multivariate Binary Density Estimation and the Restricted {Boltzmann} Forest},
   journal = {Neural Computation},
    volume = {22},
    number = {9},
      year = {2010},
     pages = {2285--2307}
}

@INPROCEEDINGS{Larochelle+Bengio-2008,
    author = {Larochelle, Hugo and Bengio, Yoshua},
     title = {Classification using Discriminative Restricted {B}oltzmann Machines},
      year = {2008},
     pages = {536--543},
  crossref = {ICML08-shorter},
  abstract = {Recently, many applications for Restricted {Boltzmann} Machines (RBMs) have been developed for a large variety of learning problems. However, RBMs are usually used as feature extractors for another learning algorithm or to provide a good initialization
for deep feed-forward neural network classifiers, and are not considered as a standalone solution to classification problems. In
this paper, we argue that RBMs provide a self-contained framework for deriving competitive non-linear classifiers. We present an evaluation of different learning algorithms for
RBMs which aim at introducing a discriminative component to RBM training and improve their performance as classifiers. This
approach is simple in that RBMs are used directly to build a classifier, rather than as a stepping stone. Finally, we demonstrate how discriminative RBMs can also be successfully employed in a semi-supervised setting.}
}

@INPROCEEDINGS{Larochelle-2009,
     author = {Larochelle, Hugo and Erhan, Dumitru and Vincent, Pascal},
      title = {Deep Learning using Robust Interdependent Codes},
  booktitle = {Proceedings of the  Twelfth International Conference on Artificial Intelligence and Statistics (AISTATS 2009)},
       year = {2009},
      pages = {312--319},
date = "April 16-18, 2009",
}

@ARTICLE{Larochelle-jmlr-2009,
    author = {Larochelle, Hugo and Bengio, Yoshua and Louradour, Jerome and Lamblin, Pascal},
     title = {Exploring Strategies for Training Deep Neural Networks},
    volume = {10},
      year = {2009},
     pages = {1--40},
  crossref = {JMLR-shorter},
  abstract = {Deep multi-layer neural networks have many levels of non-linearities allowing them to compactly represent highly non-linear and highly-varying functions. However, until recently it was not clear how to train such deep networks, since gradient-based optimization starting from random initialization often appears to get stuck in poor solutions. Hinton et al. recently proposed a greedy layer-wise unsupervised learning procedure relying on the training algorithm of restricted {Boltzmann} machines (RBM) to initialize the parameters of a deep belief network (DBN), a generative model with many layers of hidden causal variables. This was followed by the proposal of another greedy layer-wise procedure, relying on the usage of autoassociator networks. In the context of the above optimization problem, we study these algorithms empirically to better understand their success. Our experiments confirm the hypothesis that the greedy layer-wise unsupervised training strategy helps the optimization by initializing weights in a region near a good local minimum, but also implicitly acts as a sort of regularization that brings better generalization and encourages internal distributed representations that are high-level abstractions of the input. We also present a series of experiments aimed at evaluating the link between the performance of deep neural networks and practical aspects of their topology, for example, demonstrating cases where the addition of more depth helps. Finally, we empirically explore simple variants of these training algorithms, such as the use of different RBM input unit distributions, a simple way of combining gradient estimators to improve performance, as well as on-line versions of those algorithms.}
}

@PHDTHESIS{Larochelle-PhD-2009,
    author = {Larochelle, Hugo},
  keywords = {apprentissage non-supervis{\'{e}}, architecture profonde, autoassociateur, autoencodeur, machine de {Boltzmann} restreinte, r{\'{e}}seau de neurones artificiel},
     title = {{\'{E}}tude de techniques d'apprentissage non-supervis{\'{e}} pour l'am{\'{e}}lioration de l'entra{\^{\i}}nement supervis{\'{e}} de mod{\`{e}}les connexionnistes},
      year = {2009},
    school = {University of Montr{\'{e}}al},
  abstract = {Le domaine de l'intelligence artificielle a pour objectif le d{\'{e}}veloppement de syst{\`{e}}mes informatiques capables de simuler des comportements normalement associ{\'{e}}s {\`{a}} l'intelligence humaine. On aimerait entre autres pouvoir construire une machine qui puisse
r{\'{e}}soudre des t{\^{a}}ches li{\'{e}}es {\`{a}} la vision (e.g., la reconnaissance d'objet), au traitement de la langue (e.g., l'identification du sujet d'un texte) ou au traitement de signaux sonores (e.g., la reconnaissance de la parole).
     Une approche d{\'{e}}velopp{\'{e}}e afin de r{\'{e}}soudre ce genre de t{\^{a}}ches est bas{\'{e}}e sur l'apprentissage automatique de mod{\`{e}}les {\`{a}} partir de donn{\'{e}}es {\'{e}}tiquet{\'{e}}es refl{\'{e}}tant le comportement intelligent {\`{a}} {\'{e}}muler. Entre autre, il a {\'{e}}t{\'{e}} propos{\'{e}} de mod{\'{e}}liser le calcul n{\'{e}}cessaire {\`{a}} la
r{\'{e}}solution d'une t{\^{a}}che {\`{a}} l'aide d'un r{\'{e}}seau de neurones artificiel, dont il est possible d'adapter le comportement {\`{a}} l'aide de la r{\'{e}}tropropagation [99, 131] d'un gradient informatif sur les erreurs commises par le r{\'{e}}seau. Populaire durant les ann{\'{e}}es 80, cette
approche sp{\'{e}}cifique a depuis perdu partiellement de son attrait, suite au d{\'{e}}veloppement des m{\'{e}}thodes {\`{a}} noyau. Celles-ci sont souvent plus stables, plus faciles {\`{a}} utiliser et leur performance est souvent au moins aussi {\'{e}}lev{\'{e}}e pour une vaste gamme de probl{\`{e}}mes.
     Les m{\'{e}}thodes d'apprentissage automatique ont donc progress{\'{e}} dans leur fonctionnement, mais aussi dans la complexit{\'{e}} des probl{\`{e}}mes auxquels elles se sont attaqu{\'{e}}. Ainsi, plus r{\'{e}}cemment, des travaux [12, 15] ont commenc{\'{e}} {\`{a}} {\'{e}}mettre des doutes sur la capacit{\'{e}} des machines {\`{a}} noyau {\`{a}} pouvoir efficacement r{\'{e}}soudre des probl{\`{e}}mes de la complexit{\'{e}} requise par l'intelligence artificielle. Parall{\`{e}}lement, Hinton et al. [81] faisaient une perc{\'{e}}e dans l'apprentissage automatique de r{\'{e}}seaux de neurones, en proposant une proc{\'{e}}dure permettant l'entra{\^{\i}}nement de r{\'{e}}seaux de neurones d'une plus grande complexit{\'{e}} (i.e., avec plus de couches de neurones cach{\'{e}}es) qu'il n'{\'{e}}tait possible auparavant.
     C'est dans ce contexte qu'ont {\'{e}}t{\'{e}} conduits les travaux de cette th{\`{e}}se. Cette th{\`{e}}se d{\'{e}}bute par une exposition des principes de base de l'apprentissage automatique (chapitre 1) et une discussion des obstacles {\`{a}} l'obtention d'un mod{\`{e}}le ayant une bonne performance
de g{\'{e}}n{\'{e}}ralisation (chapitre 2). Puis, sont pr{\'{e}}sent{\'{e}}es les contributions apport{\'{e}}es dans le cadre de cinq articles, contributions qui sont toutes bas{\'{e}}es sur l'utilisation d'une certaine
forme d'apprentissage non-supervis{\'{e}}.
    Le premier article (chapitre 4) propose une m{\'{e}}thode d'entra{\^{\i}}nement pour un type sp{\'{e}}cifique de r{\'{e}}seau {\`{a}} une seule couche cach{\'{e}}e (la machine de {Boltzmann} restreinte) bas{\'{e}}e sur une combinaison des apprentissages supervis{\'{e}} et non-supervis{\'{e}}. Cette m{\'{e}}thode permet d'obtenir une meilleure performance de g{\'{e}}n{\'{e}}ralisation qu'un r{\'{e}}seau de neurones standard ou qu'une machine {\`{a}} vecteurs de support {\`{a}} noyau, et met en {\'{e}}vidence de fa{\c c}on
explicite les b{\'{e}}n{\'{e}}fices qu'apporte l'apprentissage non-supervis{\'{e}} {\`{a}} l'entra{\^{\i}}nement d'un r{\'{e}}seau de neurones.
    Ensuite, dans le second article (chapitre 6), on {\'{e}}tudie et {\'{e}}tend la proc{\'{e}}dure d'entra{\^{\i}}nement propos{\'{e}}e par Hinton et al. [81]. Plus sp{\'{e}}cifiquement, on y propose une approche diff{\'{e}}rente mais plus flexible pour initialiser un r{\'{e}}seau {\`{a}} plusieurs couches cach{\'{e}}es, bas{\'{e}}e sur un r{\'{e}}seau autoassociateur. On y explore aussi l'impact du nombre de couches et de neurones par couche sur la performance d'un r{\'{e}}seau et on y d{\'{e}}crit diff{\'{e}}rentes variantes mieux adapt{\'{e}}es {\`{a}} l'apprentissage en ligne ou pour donn{\'{e}}es {\`{a}} valeurs continues.
    Dans le troisi{\`{e}}me article (chapitre 8), on explore plut{\^{o}}t la performance de r{\'{e}}seaux profonds sur plusieurs probl{\`{e}}mes de classification diff{\'{e}}rents. Les probl{\`{e}}mes choisis ont la propri{\'{e}}t{\'{e}} d'avoir {\'{e}}t{\'{e}} g{\'{e}}n{\'{e}}r{\'{e}}s {\`{a}} partir de plusieurs facteurs de variation. Cette propri{\'{e}}t{\'{e}}, qui caract{\'{e}}rise les probl{\`{e}}mes li{\'{e}}s {\`{a}} l'intelligence artificielle, pose difficult{\'{e}} aux machines {\`{a}} noyau, tel que confirm{\'{e}} par les exp{\'{e}}riences de cet article.
    Le quatri{\`{e}}me article (chapitre 10) pr{\'{e}}sente une am{\'{e}}lioration de l'approche bas{\'{e}}e sur les r{\'{e}}seaux autoassociateurs. Cette am{\'{e}}lioration applique une modification simple {\`{a}} la proc{\'{e}}dure d'entra{\^{\i}}nement d'un r{\'{e}}seau autoassociateur, en « bruitant » les entr{\'{e}}es du r{\'{e}}seau afin que celui-ci soit forc{\'{e}} {\`{a}} la d{\'{e}}bruiter.
    Le cinqui{\`{e}}me et dernier article (chapitre 12) apporte une autre am{\'{e}}lioration aux r{\'{e}}seaux autoassociateurs, en permettant des interactions d'inhibition ou d'excitation entre les neurones cach{\'{e}}s de ces r{\'{e}}seaux. On y d{\'{e}}montre que de telles interactions peuvent
{\^{e}}tre apprises et sont b{\'{e}}n{\'{e}}fiques {\`{a}} la performance d'un r{\'{e}}seau profond.}
}

@INPROCEEDINGS{Larochelle2008,
     author = {Larochelle, Hugo and Erhan, Dumitru and Bengio, Yoshua},
      title = {Zero-data Learning of New Tasks},
  booktitle = {AAAI Conference on Artificial Intelligence},
       year = {2008},
        url = {http://www-etud.iro.umontreal.ca/~larocheh/publications/aaai2008_zero-data.pdf},
   abstract = {Recently, many applications for Restricted {Boltzmann} Machines (RBMs) have been developed for a large variety of learning problems. However, RBMs are usually used as feature extractors for another learning algorithm or to provide a good initialization
for deep feed-forward neural network classifiers, and are not considered as a standalone solution to classification problems. In
this paper, we argue that RBMs provide a self-contained framework for deriving competitive non-linear classifiers. We present an evaluation of different learning algorithms for
RBMs which aim at introducing a discriminative component to RBM training and improve their performance as classifiers. This
approach is simple in that RBMs are used directly to build a classifier, rather than as a stepping stone. Finally, we demonstrate how discriminative RBMs can also be successfully employed in a semi-supervised setting.}
}

@INPROCEEDINGS{LarochelleH2007,
    author = {Larochelle, Hugo and Erhan, Dumitru and Courville, Aaron and Bergstra, James and Bengio, Yoshua},
     title = {An Empirical Evaluation of Deep Architectures on Problems with Many Factors of Variation},
      year = {2007},
     pages = {473--480},
  crossref = {ICML07-shorter},
  abstract = {Recently, several learning algorithms relying on models with deep architectures have been proposed. Though they have demonstrated impressive performance, to date, they have only been evaluated on relatively simple problems such as digit  recognition in a controlled environment, for which many machine learning algorithms already report reasonable results. Here, we present a series of experiments which indicate that these models show promise in solving harder learning problems that exhibit many factors of variation. These models are compared with well-established algorithms such as Support Vector Machines and single hidden-layer feed-forward neural networks.}
}

@MASTERSTHESIS{Latendresse-MSc,
    author = {Latendresse, Simon},
     title = {L'utilisation d'hyper-param{\`{e}}tres pour la selection de variables},
      year = {1999},
    school = {Universit{\'{e}} de Montreal, Dept. IRO},
      note = {(in French)}
}

@MASTERSTHESIS{Lauzon99,
    author = {Lauzon, Vincent-Philippe},
     title = {Mod{\'{e}}les statistiques comme algorithmes d'apprentissage et {MMCC}s; pr{\'{e}}diction de s{\'{e}}ries financi{\`{e}}res},
      year = {1999},
    school = {D{\'{e}}epartement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
  crossref = {DIRO}
}

@INPROCEEDINGS{lecun-93,
     author = {{LeCun}, Yann and Bengio, Yoshua and Henderson, Donnie and Weisbuch, A. and Weissman, H. and L., Jackel},
      title = {On-line handwriting recognition with neural networks: spatial representation versus temporal representation.},
  booktitle = {Proc. International Conference on handwriting and drawing.},
       year = {1993},
  publisher = {Ecole Nationale Superieure des Telecommunications},
        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/lecun-93.ps.gz},
topics={PriorKnowledge,Speech},cat={C},
}

@INPROCEEDINGS{lecun-99,
     author = {{LeCun}, Yann and Haffner, Patrick and Bottou, {L{\'{e}}on} and Bengio, Yoshua},
     editor = {Forsyth, D.},
      title = {Object Recognition with Gradient-Based Learning},
  booktitle = {Shape, Contour and Grouping in Computer Vision},
       year = {1999},
      pages = {319-345},
  publisher = {Springer},
        url = {orig/lecun-99.ps.gz},
topics={PriorKnowledge,Speech},cat={B},
}

@TECHREPORT{lecun-99b,
       author = {{LeCun}, Yann and Haffner, Patrick and Bottou, {L{\'{e}}on} and Bengio, Yoshua},
        title = {Gradient-Based Learning for Object Detection, Segmentation and Recognition},
         year = {1999},
  institution = {AT\&T Labs},
          url = {orig/lecun-99b.ps.gz},
topics={Speech},cat={T},
}

@INPROCEEDINGS{lecun-bengio-94,
     author = {{LeCun}, Yann and Bengio, Yoshua},
      title = {Word-level training of a handwritten word recognizer based on convolutional neural networks},
  booktitle = {Proc. of the International Conference on Pattern Recognition},
     volume = {II},
       year = {1994},
      pages = {88--92},
  publisher = {IEEE},
        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/icpr-word.ps},
   abstract = {We introduce a new approach for on-line recognition of handwritten words written in unconstrained mixed style. Words are represented by low resolution “annotated images” where each pixel contains information about trajectory direction and curvature. The recognizer is a convolution network which can be spatially replicated. From the network output, a hidden {Markov} model produces word scores. The entire system is globally trained to minimize word-level errors.},
topics={Speech},cat={C},
}

@INPROCEEDINGS{lecun-bengio-95a,
     author = {{LeCun}, Yann and Bengio, Yoshua},
     editor = {Arbib, M. A.},
      title = {Convolutional Networks for Images, Speech, and Time-Series},
  booktitle = {The Handbook of Brain Theory and Neural Networks},
       year = {1995},
      pages = {255--257},
  publisher = {MIT Press},
        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/handbook-convo.pdf},
topics={PriorKnowledge,Speech},cat={C},
}

@INCOLLECTION{lecun-bengio-95b,
     author = {{LeCun}, Yann and Bengio, Yoshua},
     editor = {Arbib, M. A.},
      title = {Pattern Recognition and Neural Networks},
  booktitle = {The Handbook of Brain Theory and Neural Networks},
       year = {1995},
      pages = {711--714},
  publisher = {MIT Press},
        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/handbook-patrec.pdf},
topics={PriorKnowledge,Speech},cat={B},
}

@ARTICLE{LeCun98,
    author = {{LeCun}, Yann and Bottou, {L{\'{e}}on} and Bengio, Yoshua and Haffner, Patrick},
     title = {Gradient-Based Learning Applied to Document Recognition},
   journal = {Proceedings of the IEEE},
    volume = {86},
    number = {11},
      year = {1998},
     pages = {2278--2324},
  abstract = {Multilayer Neural Networks trained with the backpropagation algorithm constitute the best example of a successful Gradient-Based Learning technique. Given an appropriate network architecture, Gradient-Based Learning algorithms can be used to synthesize a complex decision surface that can classify high-dimensional patterns such as handwritten characters, with minimal preprocessing. This paper reviews various methods applied to handwritten character recognition and compares them on a standard handwritten digit recognition task. Convolutional Neural Networks, that are specifically designed to deal with the variability of 2D shapes, are shown to outperform all other techniques. 
Real-life document recognition systems are composed or multiple modules including field extraction, segmentation, recognition, and language modeling. A new learning paradigm, called Graph Transformer Networks (GTN), allows such multi-module systems to be trained globally using Gradient-Based methods so as to minimize an overall performance measure. 
Two systems for on-line handwriting recognition are described. Experiments demonstrate the advantage of global training, and the flexibility of Graph Transformer Networks. 
A Graph Transformer Network for reading bank check is also described. It uses Convolutional Neural Network character recognizers combined with global training techniques to provides record accuracy on business and personal checks. It is deployed commercially and reads several million checks per day.},
topics={PriorKnowledge,Speech},cat={C},
}

@INPROCEEDINGS{Lecun_icassp97,
     author = {{LeCun}, Yann and Bottou, {L{\'{e}}on} and Bengio, Yoshua},
      title = {Reading Checks with graph transformer networks},
  booktitle = {International Conference on Acoustics, Speech and Signal Processing},
     volume = {1},
       year = {1997},
      pages = {151--154},
        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/lecun-bottou-bengio-97.ps.gz},
topics={Speech},cat={C},
}

@ARTICLE{LeRoux+Bengio-2010,
    author = {Le Roux, Nicolas and Bengio, Yoshua},
     title = {Deep Belief Networks are Compact Universal Approximators},
   journal = {Neural Computation},
    volume = {22},
    number = {8},
      year = {2010},
     pages = {2192-2207},
      issn = {0899-7667},
  abstract = {Deep Belief Networks (DBN) are generative models with many layers of hidden causal variables, recently introduced by Hinton et al. (2006), along with a greedy layer-wise unsupervised learning algorithm. Building on Le Roux and Bengio (2008) and Sutskever and Hinton (2008), we show that deep but narrow generative networks do not require more parameters than shallow ones to achieve universal approximation. Exploiting the proof technique, we prove that deep but narrow feed-forward neural networks with sigmoidal units can represent any Boolean expression.}
}

@TECHREPORT{LeRoux-Bengio-2007-TR,
       author = {Le Roux, Nicolas and Bengio, Yoshua},
        title = {Representational Power of Restricted {B}oltzmann Machines and Deep Belief Networks},
       number = {1294},
         year = {2007},
  institution = {D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
     abstract = {Deep Belief Networks (DBN) are generative neural network models with
many layers of hidden explanatory factors, recently introduced by Hinton et al.,
along with a greedy layer-wise unsupervised learning algorithm. The building
block of a DBN is a probabilistic model called a Restricted {Boltzmann} Machine
(RBM), used to represent one layer of the model. Restricted {Boltzmann} Machines
are interesting because inference is easy in them, and because they have been
successfully used as building blocks for training deeper models.  
We first prove that adding hidden units yields strictly improved modeling
power, while a second theorem shows that RBMs are universal approximators of
discrete distributions.  We then study the question of whether DBNs with more
layers are strictly more powerful in terms of representational power. This
suggests a new and less greedy criterion for training RBMs within DBNs.}
}

@ARTICLE{LeRoux-Bengio-2008,
    author = {Le Roux, Nicolas and Bengio, Yoshua},
     title = {Representational Power of Restricted {B}oltzmann Machines and Deep Belief Networks},
   journal = {Neural Computation},
    volume = {20},
    number = {6},
      year = {2008},
     pages = {1631--1649},
  abstract = {Deep Belief Networks (DBN) are generative neural network models with many layers of hidden explanatory factors, recently introduced by Hinton et al., along with a greedy layer-wise unsupervised learning algorithm. The building block of a DBN is a probabilistic model called a Restricted {Boltzmann} Machine (RBM), used to represent one layer of the model. Restricted {Boltzmann} Machines are interesting because inference is easy in them, and because they have been successfully used as building blocks for training deeper models.  We first prove that adding hidden units yields strictly improved modelling power, while a second theorem shows that RBMs are universal approximators of discrete distributions.  We then study the question of whether DBNs with more layers are strictly more powerful in terms of representational power. This suggests a new and less greedy criterion for training RBMs within DBNs.}
}

@INPROCEEDINGS{LeRoux-continuous,
     author = {Le Roux, Nicolas and Bengio, Yoshua},
      title = {Continuous Neural Networks},
  booktitle = {Proceedings of the Eleventh International Conference on Artificial Intelligence and Statistics (AISTATS'07)},
       year = {2007},
  publisher = {Omnipress},
   abstract = {This article extends neural networks to the case of an uncountable number of hidden units, in several ways. In the first approach proposed, a finite parametrization is possible, allowing gradient-based learning. While having the same number of parameters as an ordinary neural network, its internal structure suggests that it can represent some smooth functions much more compactly. Under mild assumptions, we also find better error bounds than with ordinary neural networks. Furthermore, this parametrization may help reducing the problem of saturation of the neurons. In a second approach, the input-to-hidden weights arefully non-parametric, yielding a kernel machine for which we demonstrate a simple kernel formula. Interestingly, the resulting kernel machine can be made hyperparameter-free and still generalizes in spite of an absence of explicit regularization.}
}

@PHDTHESIS{LeRoux-PhD-2008,
    author = {Le Roux, Nicolas},
     title = {Avanc{\'{e}}es th{\'{e}}oriques sur la repr{\'{e}}sentation et l'optimisation des r{\'{e}}seaux de neurones},
      year = {2008},
    school = {Universit{\'{e}} de Montr{\'{e}}al},
  abstract = {Les r{\'{e}}seaux de neurones artificiels ont {\'{e}}t{\'{e}} abondamment utilis{\'{e}}s dans la communaut{\'{e}} de l'apprentissage machine depuis les ann{\'{e}}es 80. Bien qu'ils aient {\'{e}}t{\'{e}} {\'{e}}tudi{\'{e}}s pour la premi{\`{e}}re fois il y a cinquante ans par Rosenblatt [68], ils ne furent r{\'{e}}ellement populaires qu'apr{\`{e}}s l'apparition de la r{\'{e}}tropropagation du gradient, en 1986 [71].
En 1989, il a {\'{e}}t{\'{e}} prouv{\'{e}} [44] qu'une classe sp{\'{e}}cifique de r{\'{e}}seaux de neurones (les r{\'{e}}seaux de neurones {\`{a}} une couche cach{\'{e}}e) {\'{e}}tait suffisamment puissante pour pouvoir approximer presque n'importe quelle fonction avec une pr{\'{e}}cision arbitraire : le th{\'{e}}or{\`{e}}me d'approximation universelle. Toutefois, bien que ce th{\'{e}}or{\`{e}}me e{\^{u}}t pour cons{\'{e}}quence un int{\'{e}}r{\^{e}}t accru pour les r{\'{e}}seaux de neurones, il semblerait qu'aucun effort n'ait {\'{e}}t{\'{e}} fait pour profiter de cette propri{\'{e}}t{\'{e}}.
En outre, l'optimisation des r{\'{e}}seaux de neurones {\`{a}} une couche cach{\'{e}}e n'est pas convexe. Cela a d{\'{e}}tourn{\'{e}} une grande partie de la communaut{\'{e}} vers d'autres algorithmes, comme par exemple les machines {\`{a}} noyau (machines {\`{a}} vecteurs de support et r{\'{e}}gression
{\`{a}} noyau, entre autres).
La premi{\`{e}}re partie de cette th{\`{e}}se pr{\'{e}}sentera les concepts d'apprentissage machine g{\'{e}}n{\'{e}}raux n{\'{e}}cessaires {\`{a}} la compr{\'{e}}hension des algorithmes utilis{\'{e}}s. La deuxi{\`{e}}me partie se focalisera plus sp{\'{e}}cifiquement sur les m{\'{e}}thodes {\`{a}} noyau et les r{\'{e}}seaux de neurones. La troisi{\`{e}}me partie de ce travail visera ensuite {\`{a}} {\'{e}}tudier les limitations des machines {\`{a}} noyaux et {\`{a}} comprendre les raisons pour lesquelles elles sont inadapt{\'{e}}es {\`{a}} certains probl{\`{e}}mes que nous avons {\`{a}} traiter.
La quatri{\`{e}}me partie pr{\'{e}}sente une technique permettant d'optimiser les r{\'{e}}seaux de neurones {\`{a}} une couche cach{\'{e}}e de mani{\`{e}}re convexe. Bien que cette technique s'av{\`{e}}re difficilement exploitable pour des probl{\`{e}}mes de grande taille, une version approch{\'{e}}e permet d'obtenir une bonne solution dans un temps raisonnable.
La cinqui{\`{e}}me partie se concentre sur les r{\'{e}}seaux de neurones {\`{a}} une couche cach{\'{e}}e infinie. Cela leur permet th{\'{e}}oriquement d'exploiter la propri{\'{e}}t{\'{e}} d'approximation universelle et ainsi d'approcher facilement une plus grande classe de fonctions.
Toutefois, si ces deux variations sur les r{\'{e}}seaux de neurones {\`{a}} une couche cach{\'{e}}e leur conf{\`{e}}rent des propri{\'{e}}t{\'{e}}s int{\'{e}}ressantes, ces derniers ne peuvent extraire plus que des concepts de bas niveau. Les m{\'{e}}thodes {\`{a}} noyau souffrant des m{\^{e}}mes limites, aucun de
ces deux types d'algorithmes ne peut appr{\'{e}}hender des probl{\`{e}}mes faisant appel {\`{a}} l'apprentissage de concepts de haut niveau.
R{\'{e}}cemment sont apparus les Deep Belief Networks [39] qui sont des r{\'{e}}seaux de neurones {\`{a}} plusieurs couches cach{\'{e}}es entra{\^{\i}}n{\'{e}}s de mani{\`{e}}re efficace. Cette profondeur leur permet d'extraire des concepts de haut niveau et donc de r{\'{e}}aliser des t{\^{a}}ches hors
de port{\'{e}}e des algorithmes conventionnels. La sixi{\`{e}}me partie {\'{e}}tudie des propri{\'{e}}t{\'{e}}s de ces r{\'{e}}seaux profonds.
Les probl{\`{e}}mes que l'on rencontre actuellement n{\'{e}}cessitent non seulement des algorithmes capables d'extraire des concepts de haut niveau, mais {\'{e}}galement des m{\'{e}}thodes d'optimisation capables de traiter l'immense quantit{\'{e}} de donn{\'{e}}es parfois disponibles, si possible en temps r{\'{e}}el. La septi{\`{e}}me partie est donc la pr{\'{e}}sentation d'une nouvelle technique permettant une optimisation plus rapide.}
}

@ARTICLE{lheureux-04,
    author = {{L'Heureux}, Pierre-Jean and Carreau, Julie and Bengio, Yoshua and Delalleau, Olivier and Yue, Shi Yi},
     title = {Locally Linear Embedding for dimensionality reduction in {QSAR}},
   journal = {Journal of Computer-Aided Molecular Design},
    volume = {18},
      year = {2004},
     pages = {475--482},
  abstract = {Current practice in Quantitative Structure Activity Relationship (QSAR) methods usually involves generating a great number of chemical descriptors and then cutting them back with variable selection techniques. Variable selection is an effective method to reduce the dimensionality but may discard some valuable information. This paper introduces Locally Linear Embedding ({LLE}), a local non-linear dimensionality reduction technique, that can statistically discover a low-dimensional representation of the chemical data. {LLE} is shown to create more stable representations than other non-linear dimensionality
reduction algorithms, and to be capable of capturing non-linearity in chemical data.},
topics={Bioinformatic},cat={J},
}

@TECHREPORT{lm-TR00,
       author = {Bengio, Yoshua and Ducharme, R{\'{e}}jean and Vincent, Pascal},
        title = {A Neural Probabilistic Language Model},
       number = {1178},
         year = {2000},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TR1178.pdf},
     abstract = {A goal of statistical language modeling is to learn the joint probability function of sequences of words in a language. This is intrinsically difficult because of the curse of dimensionality: a word sequence on which the model will be tested is likely to be different from all the word sequences seen during training. Traditional but very successful approaches based on n-grams obtain generalization by concatenating very short overlapping sequences seen in the training set. We propose to fight the curse of dimensionality by learning a distributed representation for words which allows each training sentence to inform the model about an exponential number of semantically neighboring sentences. The model learns simultaneously (1) a distributed representation for each word along with (2) the probability function for word sequences, expressed in terms of these representations. Generalization is obtained because a sequence of words that has never been seen before gets high probability if it is made or words that are similar (in the sense of having a nearby representation) to words forming an already seen sentence. Training such large models (with millions of parameters) within a reasonable time is itself a significant challenge. We report on experiments using neural networks for the probability function, showing on two text corpora that the proposed approach very significantly improves on a state-of-the-art trigram model, and that the proposed approach allows to take advantage of much longer contexts.},
topics={Markov,Unsupervised,Language},cat={T},
}

@INPROCEEDINGS{Maillet+al-2009,
     author = {Maillet, Fran{\c c}ois and Eck, Douglas and Desjardins, Guillaume and Lamere, Paul},
      title = {Steerable Playlist Generation by Learning Song Similarity from Radio Station Playlists},
  booktitle = {Proceedings of the 10th International Conference on Music Information Retrieval},
       year = {2009},
        url = {http://www-etud.iro.umontreal.ca/~mailletf/papers/ismir09-playlist.pdf},
   abstract = {This paper presents an approach to generating steerable playlists. We first demonstrate a method for learning song transition probabilities from audio features extracted from songs played in professional radio station playlists. We then show that by using this learnt similarity function as a prior, we are able to generate steerable playlists by choosing the next song to play not simply based on that prior, but on a tag cloud that the user is able to manipulate to express the high-level characteristics of the music he wishes  Last.fm, to listen to.}
}

@INPROCEEDINGS{manzagol+bertinmahieux+eck:ismir2008,
     author = {Manzagol, Pierre-Antoine and Bertin-Mahieux, Thierry and Eck, Douglas},
      title = {On the Use of Sparse Time-Relative Auditory Codes for Music},
  booktitle = {{Proceedings of the 9th International Conference on Music Information Retrieval ({ISMIR} 2008)}},
       year = {2008},
   abstract = {Many if not most audio features used in MIR research are inspired by work done in speech recognition and are variations on the spectrogram. Recently, much attention has been given to new representations of audio that are sparse and time-relative. These representations are efficient and able to avoid the time-frequency trade-off of a spectrogram. Yet little work with music streams has been conducted and these features remain mostly unused in the MIR community. In this paper we further explore the use of these features for musical signals. In particular, we investigate their use on realistic music examples (i.e. released commercial music) and their use as input features for supervised learning. Furthermore, we identify three specific issues related to these features which will need to be further addressed in order to obtain the full benefit for MIR applications.},
source={OwnPublication},
sourcetype={Conference},
}

@MASTERSTHESIS{Manzagol-Msc-2007,
    author = {Manzagol, Pierre-Antoine},
       key = {Algorithme d'apprentissage, méthode de second ordre, gradient naturel, approximation stochastique},
     title = {TONGA - Un algorithme de gradient naturel pour les probl{\`{e}}mes de grande taille},
      year = {2007},
    school = {Universit{\'{e}} de Montr{\'{e}}al},
  abstract = {Les syst{\`{e}}mes adaptatifs sont confront{\'{e}}s {\`{a}} des donn{\'{e}}es qui {\'{e}}voluent rapidement en quantit{\'{e}} et en complexit{\'{e}}. Les avanc{\'{e}}es mat{\'{e}}rielles de l'informatique ne susent pas {\`{a}} compenser cet essor. Une mise {\`{a}} l'{\'{e}}chelle des techniques d'apprentissage est n{\'{e}}cessaire. D'une part, les mod{\`{e}}les doivent gagner en capacit{\'{e}} de repr{\'{e}}sentation. De l'autre, les algorithmes d'apprentissage doivent devenir plus ecaces.
    Nos travaux se situent dans ce contexte des probl{\`{e}}mes de grande taille et portent sur l'am{\'{e}}lioration des algorithmes d'apprentissage. Deux {\'{e}}l{\'{e}}ments de r{\'{e}}ponse sont d{\'{e}}j{\`{a}} connus. Il s'agit des m{\'{e}}thodes de second ordre et de l'approximation stochastique. Or, les m{\'{e}}thodes de second ordre poss{\`{e}}dent des complexit{\'{e}}s en calculs et en m{\'{e}}moire qui sont prohibitives dans le cadre des probl{\`{e}}mes de grande taille. {\'{E}}galement, il est notoirement dicile de concilier ces m{\'{e}}thodes avec l'approximation stochastique. TONGA est un algorithme d'apprentissage con{\c c}u pour faire face {\`{a}} ces dicult{\'{e}}s. Il s'agit d'une implantation stochastique et adapt{\'{e}}e aux probl{\`{e}}mes de grande taille d'une m{\'{e}}thode de second ordre, le gradient naturel. Dans ce m{\'{e}}moire, nous examinons de pr{\`{e}}s ce nouvel algorithme d'apprentissage en le comparant sur plusieurs probl{\`{e}}mes au gradient stochastique, la technique d'optimisation commun{\'{e}}ment utilis{\'{e}}e dans le cadre des probl{\`{e}}mes de grande taille. Nos exp{\'{e}}riences montrent que TONGA est au moins tout aussi ecace que le gradient stochastique, ce qui est un accomplissement en soit. Dans certains cas, TONGA offre une convergence nettement sup{\'{e}}rieure {\`{a}} celle du gradient stochastique.}
}

@INPROCEEDINGS{matic-94,
     author = {Matic, N. and Henderson, Donnie and {LeCun}, Yann and Bengio, Yoshua},
      title = {Pen-based visitor registration system (PENGUIN)},
  booktitle = {Conference Record of the Twenty-Eighth Asilomar Conference on Signals, Systems and Computers},
       year = {1994},
  publisher = {IEEE},
        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/matic-94.tiff},
   abstract = {We describe a new electronic pen-based visitors registration system (PENGUIN) whose goal is to expand and modernize the visitor sign-in procedure at Bell Laboratories. The system uses a pen-interface (i.e. tablet-display) in what is essentially a form filling application. Our pen-interface is coupled with a powerful and accurate on-line handwriting recognition module. A database of AT&T employees (the visitors' hosts) and country names is used to check the recognition module outputs, in order to find the best match. The system provides assistance to the guard at one of the guard stations in routing visitors to their hosts. All the entered data are stored electronically. Initial testing shows that PENGUIN system performs reliably and with high accuracy. It retrieves the correct host name with 97\% accuracy and the correct visitors citizenship with 99\% accuracy. The system is robust and easy to use for both visitors and guards},
topics={Speech},cat={C},
}

@UNPUBLISHED{mirex2005artist,
    author = {Bergstra, James and Casagrande, Norman and Eck, Douglas},
     title = {Artist Recognition: A Timbre- and Rhythm-Based Multiresolution Approach},
      year = {2005},
      note = {{MIREX} artist recognition contest},
source={OwnPublication},
sourcetype={Other},
}

@UNPUBLISHED{mirex2005genre,
    author = {Bergstra, James and Casagrande, Norman and Eck, Douglas},
     title = {Genre Classification: Timbre- and Rhythm-Based Multiresolution Audio Classification},
      year = {2005},
      note = {{MIREX} genre classification contest},
source={OwnPublication},
sourcetype={Other},
}

@UNPUBLISHED{mirex2005note,
    author = {Lacoste, Alexandre and Eck, Douglas},
     title = {Onset Detection with Artificial Neural Networks},
      year = {2005},
      note = {{MIREX} note onset detection contest},
source={OwnPublication},
sourcetype={Other},
}

@UNPUBLISHED{mirex2005tempo,
    author = {Eck, Douglas and Casagrande, Norman},
     title = {A Tempo-Extraction Algorithm Using an Autocorrelation Phase Matrix and Shannon Entropy},
      year = {2005},
      note = {{MIREX} tempo extraction contest (www.music-ir.org/\-evaluation/\-mirex-results)},
source={OwnPublication},
sourcetype={Other},
}

@INPROCEEDINGS{mitacs-insurance01,
     author = {Bengio, Yoshua and Chapados, Nicolas and Dugas, Charles and Ghosn, Joumana and Takeuchi, Ichiro and Vincent, Pascal},
      title = {High-Dimensional Data Inference for Automobile Insurance Premia Estimation},
  booktitle = {Presented at the 2001 MITACS Annual Meeting},
       year = {2001},
        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/mitacs_insurance.ps},
topics={HighDimensional,Mining},cat={C},
}

@INPROCEEDINGS{Morin+al-2005,
     author = {Morin, Frederic and Bengio, Yoshua},
     editor = {Cowell, Robert G. and Ghahramani, Zoubin},
      title = {Hierarchical Probabilistic Neural Network Language Model},
  booktitle = {Proceedings of the Tenth International Workshop on Artificial Intelligence and Statistics},
       year = {2005},
      pages = {246--252},
  publisher = {Society for Artificial Intelligence and Statistics},
        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/hierarchical-nnlm-aistats05.pdf},
   abstract = {In recent years, variants of a neural network architecture for statistical language modeling have been proposed and successfully applied, e.g. in the language modeling component of speech recognizers. The main advantage of these architectures is that they learn an embedding for words (or other symbols) in a continuous space that helps to smooth the language model and provide good generalization even when the number of training examples is insufficient. However, these models are extremely slow in comparison to the more commonly used n-gram models, both for training and recognition. As an alternative to an importance sampling method proposed to speed-up training, we introduce a hierarchical decomposition of the conditional probabilities that yields a speed-up of about 200 both during training and recognition. The hierarchical decomposition is a binary hierarchical clustering constrained by the prior knowledge extracted from the WordNet semantic hierarchy.},
topics={Language},cat={C},
}

@TECHREPORT{Nadeau-inference-TR99,
       author = {Nadeau, Claude and Bengio, Yoshua},
        title = {Inference and the Generalization Error},
       number = {99s-45},
         year = {1999},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/techrep.pdf},
     abstract = {We perform a theoretical investigation of the variance of the cross-validation estimate of the generalization error that takes into account the variability due to the choice of training sets and test examples. This allows us to propose two new estimators of this variance. We show, via simulations, that these new statistics perform well relative to the statistics considered in (Dietterich, 1998). In particular, tests of hypothesis based on these don’t tend to be too liberal like other tests currently available, and have good power.},
topics={Comparative},cat={T},
}

@INPROCEEDINGS{nadeau:2000:nips,
    author = {Nadeau, Claude and Bengio, Yoshua},
     title = {Inference for the Generalization Error},
      year = {2000},
     pages = {307--313},
  crossref = {NIPS12-shorter},
  abstract = {In order to to compare learning algorithms, experimental results reported in the machine learning litterature often use statistical tests of significance. Unfortunately, most of these tests do not take into account the variability due to the choice of training set. We perform a theoretical investigation of the variance of the cross-validation estimate of the generalization error that takes into account the variability due to the choice of training sets. This allows us to propose two new ways to estimate this variance. We show, via simulations, that these new statistics perform well relative to the statistics considered by Dietterich (Dietterich, 1998).},
topics={Comparative},cat={C},
}

@ARTICLE{nadeau:2001,
    author = {Nadeau, Claude and Bengio, Yoshua},
     title = {Inference for the Generalization Error},
   journal = {Machine Learning},
      year = {2001},
  abstract = {In order to compare learning algorithms, experimental results reported in the machine learning literature often use statistical tests of significance to support the claim that a new learning algorithm generalizes better. Such tests should take into account the variability due to the choice of training set and not only that due to the test examples, as is often the case. This could lead to gross underestimation of the variance of the cross-validation estimator, and to the wrong conclusion that the new algorithm is significantly better when it is not. We perform a theoretical investigation of the variance of a cross-validation estimator of the generalization error that takes into account the variability due to the randomness of the training set as well as test examples. Our analysis shows that all the variance estimators that are based only on the results of the cross-validation experiment must be biased. This analysis allows us to propose new estimators of this variance. We show, via simulations, that tests of hypothesis about the generalization error using those new variance estimators have better properties than tests involving variance estimators currently in use and listed in (Dietterich, 1998). In particular, the new tests have correct size and good power. That is, the new tests do not reject the null hypothesis too often when the hypothesis is true, but they tend to frequently reject the null hypothesis when  the latter is false.},
topics={Comparative},cat={J},
}

@ARTICLE{NC06,
    author = {Bengio, Yoshua and Monperrus, Martin and Larochelle, Hugo},
     title = {Nonlocal Estimation of Manifold Structure},
   journal = {Neural Computation},
    volume = {18},
      year = {2006},
     pages = {2509--2528},
  abstract = {We claim and present arguments to the effect that a large class of manifold
learning algorithms that are essentially local and can be framed as
kernel learning algorithms will suffer from the curse of dimensionality, at
the dimension of the true underlying manifold. This observation suggests
to explore non-local manifold learning algorithms which attempt to discover
shared structure in the tangent planes at different positions. A criterion for
such an algorithm is proposed and experiments estimating a tangent plane
prediction function are presented, showing its advantages with respect to
local manifold learning algorithms: it is able to generalize very far from
training data (on learning handwritten character image rotations), where a
local non-parametric method fails.},
topics={HighDimensional,Kernel,Unsupervised},cat={J},
}

@INPROCEEDINGS{NIPS1-short,
     editor = {Touretzky, D. S.},
      title = {Advances in Neural Information Processing Systems 1 (NIPS'88)},
  booktitle = {NIPS 1},
       year = {-1},
  publisher = {Morgan Kaufmann}
}


@INPROCEEDINGS{NIPS10-short,
     editor = {Jordan, M.I. and Kearns, M.J. and Solla, S.A.},
      title = {Advances in Neural Information Processing Systems 10 (NIPS'97)},
  booktitle = {NIPS 10},
       year = {-1},
  publisher = {MIT Press}
}


@INPROCEEDINGS{NIPS11,
     editor = {Kearns, M.J. and Solla, S.A.},
      title = {Advances in Neural Information Processing Systems 11 (NIPS'98)},
  booktitle = {Advances in Neural Information Processing Systems 11 (NIPS'98)},
       year = {-1},
  publisher = {MIT Press}
}

@INPROCEEDINGS{NIPS11-short,
     editor = {Kearns, M.J. and Solla, S.A.},
      title = {Advances in Neural Information Processing Systems 11 (NIPS'98)},
  booktitle = {NIPS 11},
       year = {-1},
  publisher = {MIT Press}
}


@INPROCEEDINGS{NIPS12-short,
     editor = {Solla, S.A. and Leen, T. K.},
      title = {Advances in Neural Information Processing Systems 12 (NIPS'99)},
  booktitle = {NIPS 12},
       year = {-1},
  publisher = {MIT Press}
}


@INPROCEEDINGS{NIPS13-short,
     editor = {Leen, T. K. and Dietterich, T.G.},
      title = {Advances in Neural Information Processing Systems 13 (NIPS'00)},
  booktitle = {NIPS 13},
       year = {-1},
  publisher = {MIT Press}
}


@INPROCEEDINGS{NIPS14,
     editor = {Dietterich, T.G. and Becker, S. and Ghahramani, Zoubin},
      title = {Advances in Neural Information Processing Systems 14 (NIPS'01)},
  booktitle = {Advances in Neural Information Processing Systems 14 (NIPS'01)},
       year = {-1},
  publisher = {MIT Press}
}

@INPROCEEDINGS{NIPS14-short,
     editor = {Dietterich, T.G. and Becker, S. and Ghahramani, Zoubin},
      title = {Advances in Neural Information Processing Systems 14 (NIPS'01)},
  booktitle = {NIPS 14},
       year = {-1},
  publisher = {MIT Press}
}


@INPROCEEDINGS{NIPS15-short,
     editor = {Becker, S. and Thrun, Sebastian},
      title = {Advances in Neural Information Processing Systems 15 (NIPS'02)},
  booktitle = {NIPS 15},
       year = {-1},
  publisher = {MIT Press}
}


@INPROCEEDINGS{NIPS16-short,
     editor = {Becker, S. and Saul, L. and {Sch{\"{o}}lkopf}, Bernhard},
      title = {Advances in Neural Information Processing Systems 16 (NIPS'03)},
  booktitle = {NIPS 16},
       year = {-1}
}


@INPROCEEDINGS{NIPS17-short,
     editor = {Saul, Lawrence K. and Weiss, Yair and Bottou, {L{\'{e}}on}},
      title = {Advances in Neural Information Processing Systems 17 (NIPS'04)},
  booktitle = {NIPS 17},
       year = {-1}
}


@INPROCEEDINGS{NIPS18-short,
     editor = {Weiss, Yair and {Sch{\"{o}}lkopf}, Bernhard and Platt, John},
      title = {Advances in Neural Information Processing Systems 18 (NIPS'05)},
  booktitle = {NIPS 18},
       year = {-1},
  publisher = {MIT Press}
}


@INPROCEEDINGS{NIPS19-short,
     editor = {{Sch{\"{o}}lkopf}, Bernhard and Platt, John and Hoffman, Thomas},
      title = {Advances in Neural Information Processing Systems 19 (NIPS'06)},
  booktitle = {NIPS 19},
       year = {-1},
  publisher = {MIT Press}
}


@INPROCEEDINGS{NIPS2-short,
     editor = {Touretzky, D. S.},
      title = {Advances in Neural Information Processing Systems 2 (NIPS'89)},
  booktitle = {NIPS 2},
       year = {-1},
  publisher = {Morgan Kaufmann}
}


@INPROCEEDINGS{NIPS20-short,
     editor = {Platt, John and Koller, D. and Singer, Yoram and Roweis, S.},
      title = {Advances in Neural Information Processing Systems 20 (NIPS'07)},
  booktitle = {NIPS 20},
       year = {-1},
  publisher = {MIT Press}
}


@INPROCEEDINGS{NIPS2003_AA65,
     author = {Bengio, Yoshua and Grandvalet, Yves},
   keywords = {cross validation, error bars, generalization error inference, k-fold cross-validation, model selection, statistical comparison of algorithms, variance estimate},
      title = {No Unbiased Estimator of the Variance of K-Fold Cross-Validation},
       year = {2004},
  publisher = {MIT Press},
        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/var-kfold-part1-nips.pdf},
   crossref = {NIPS16},
   abstract = {Most machine learning researchers perform quantitative experiments to estimate generalization error and compare algorithm performances. In order to draw statistically convincing conclusions, it is important to estimate the uncertainty of such estimates. This paper studies the estimation of uncertainty around the K-fold cross-validation estimator. The main theorem shows that there exists no universal unbiased estimator of the variance of K-fold cross-validation. An analysis based on the eigendecomposition of the covariance matrix of errors helps to better understand the nature of the problem and shows that naive estimators may grossly underestimate variance, as confirmed by numerical experiments.},
topics={Comparative},cat={C},
}

@INCOLLECTION{NIPS2005_424,
    author = {Bengio, Yoshua and Delalleau, Olivier and Le Roux, Nicolas},
     title = {The Curse of Highly Variable Functions for Local Kernel Machines},
      year = {2006},
     pages = {107--114},
  crossref = {NIPS18-shorter},
  abstract = {We present a series of theoretical arguments supporting the claim that a
large class of modern learning algorithms that rely solely on the smoothness
prior – with similarity between examples expressed with a local
kernel – are sensitive to the curse of dimensionality, or more precisely
to the variability of the target. Our discussion covers supervised, semisupervised
and unsupervised learning algorithms. These algorithms are
found to be local in the sense that crucial properties of the learned function
at x depend mostly on the neighbors of x in the training set. This
makes them sensitive to the curse of dimensionality, well studied for
classical non-parametric statistical learning. We show in the case of the
Gaussian kernel that when the function to be learned has many variations,
these algorithms require a number of training examples proportional to
the number of variations, which could be large even though there may exist
short descriptions of the target function, i.e. their Kolmogorov complexity
may be low. This suggests that there exist non-local learning
algorithms that at least have the potential to learn about such structured
but apparently complex functions (because locally they have many variations),
while not using very specific prior domain knowledge.},
topics={HighDimensional,Kernel,Unsupervised},cat={C},
}

@INPROCEEDINGS{NIPS2005_456,
    author = {K{\'{e}}gl, Bal{\'{a}}zs and Wang, Ligen},
     title = {Boosting on Manifolds: Adaptive Regularization of Base Classifiers},
      year = {2005},
     pages = {665--672},
  crossref = {NIPS17-shorter},
  abstract = {In this paper we propose to combine two powerful ideas, boosting and manifold learning. On the one hand, we improve ADABOOST by incorporating knowledge on the structure of the data into base classifier design and selection. On the other hand, we use ADABOOST’s efficient learning mechanism to significantly improve supervised and semi-supervised algorithms proposed in the context of manifold learning. Beside the specific manifold-based penalization, the resulting algorithm also accommodates the boosting of a large family of regularized learning algorithms.},
topics={Boosting},cat={C},
}

@INCOLLECTION{NIPS2005_519,
    author = {Grandvalet, Yves and Bengio, Yoshua},
     title = {Semi-supervised Learning by Entropy Minimization},
      year = {2005},
     pages = {529--236},
  crossref = {NIPS17-shorter},
  abstract = {We consider the semi-supervised learning problem, where a decision rule is to be learned from labeled and unlabeled data. In this framework, we motivate minimum entropy regularization, which enables to incorporate unlabeled data in the standard supervised learning. Our approach includes other approaches to the semi-supervised problem as particular or limiting cases. A series of experiments illustrates that the proposed solution benefits from unlabeled data. The method challenges mixture models when the data are sampled from the distribution class spanned by the generative model. The performances are definitely in favor of minimum entropy regularization when generative models are misspecified, and the weighting of unlabeled data provides robustness to the violation of the “cluster assumption”. Finally, we also illustrate that the method can also be far superior to manifold learning in high dimension spaces.},
topics={Unsupervised},cat={C},
}

@INPROCEEDINGS{NIPS2005_539,
    author = {Bengio, Yoshua and Larochelle, Hugo and Vincent, Pascal},
     title = {Non-Local Manifold Parzen Windows},
      year = {2006},
  crossref = {NIPS18-shorter},
  abstract = {To escape from the curse of dimensionality, we claim that one can learn
non-local functions, in the sense that the value and shape of the learned
function at x must be inferred using examples that may be far from x.
With this objective, we present a non-local non-parametric density estimator.
It builds upon previously proposed Gaussian mixture models with
regularized covariance matrices to take into account the local shape of
the manifold. It also builds upon recent work on non-local estimators of
the tangent plane of a manifold, which are able to generalize in places
with little training data, unlike traditional, local, non-parametric models.},
topics={HighDimensional,Kernel,Unsupervised},cat={C},
}

@INPROCEEDINGS{NIPS2005_583,
    author = {Bengio, Yoshua and Le Roux, Nicolas and Vincent, Pascal and Delalleau, Olivier and Marcotte, Patrice},
     title = {Convex Neural Networks},
      year = {2006},
     pages = {123--130},
  crossref = {NIPS18-shorter},
  abstract = {Convexity has recently received a lot of attention in the machine learning
community, and the lack of convexity has been seen as a major disadvantage
of many learning algorithms, such as multi-layer artificial neural
networks. We show that training multi-layer neural networks in which the
number of hidden units is learned can be viewed as a convex optimization
problem. This problem involves an infinite number of variables, but can be
solved by incrementally inserting a hidden unit at a time, each time finding
a linear classifier that minimizes a weighted sum of errors.},
topics={Boosting},cat={C},
}

@INPROCEEDINGS{NIPS2005_663,
    author = {Rivest, Fran{\c c}ois and Bengio, Yoshua and Kalaska, John},
     title = {Brain Inspired Reinforcement Learning},
      year = {2005},
     pages = {1129--1136},
  crossref = {NIPS17-shorter},
  abstract = {Successful application of reinforcement learning algorithms often involves considerable hand-crafting of the necessary non-linear features to reduce the complexity of the value functions and hence to promote convergence of the algorithm. In contrast, the human brain readily and autonomously finds the complex features when provided with sufficient training. Recent work in machine learning and neurophysiology has demonstrated the role of the basal ganglia and the frontal cortex in mammalian reinforcement learning. This paper develops and explores new reinforcement learning algorithms inspired by neurological evidence that provides potential new approaches to the feature construction problem. The algorithms are compared and evaluated on the Acrobot task.},
topics={BioRules},cat={C},
}

@INCOLLECTION{NIPS2005_691,
    author = {Bengio, Yoshua and Monperrus, Martin},
     title = {Non-Local Manifold Tangent Learning},
      year = {2005},
     pages = {129--136},
  crossref = {NIPS17-shorter},
  abstract = {We claim and present arguments to the effect that a large class of manifold learning algorithms that are essentially local and can be framed as kernel learning algorithms will suffer from the curse of dimensionality, at the dimension of the true underlying manifold. This observation suggests to explore non-local manifold learning algorithms which attempt to discover shared structure in the tangent planes at different positions. A criterion for such an algorithm is proposed and experiments estimating a tangent plane prediction function are presented, showing its advantages with respect to local manifold learning algorithms: it is able to generalize very far from training data (on learning handwritten character image rotations), where a local non-parametric method fails.},
topics={HighDimensional,Unsupervised},cat={C},
}

@INPROCEEDINGS{NIPS2005_874,
    author = {K{\'{e}}gl, Bal{\'{a}}zs},
     title = {Generalization Error and Algorithmic Convergence of Median Boosting},
      year = {2005},
     pages = {657--664},
  crossref = {NIPS17-shorter},
  abstract = {We have recently proposed an extension of ADABOOST to regression that uses the median of the base regressors as the final regressor. In this paper we extend theoretical results obtained for ADABOOST to median boosting and to its localized variant. First, we extend recent results on efficient margin maximizing to show that the algorithm can converge to the maximum achievable margin within a preset precision in a finite number of steps. Then we provide confidence-interval-type bounds on the generalization error.},
topics={Boosting},cat={C},
}

@INPROCEEDINGS{NIPS2007-56,
    author = {Le Roux, Nicolas and Manzagol, Pierre-Antoine and Bengio, Yoshua},
     title = {Topmoumoute online natural gradient algorithm},
      year = {2008},
  crossref = {NIPS20-shorter},
  abstract = {Guided by the goal of obtaining an optimization algorithm that is both fast and yielding good generalization, we study the descent direction maximizing the decrease in generalization error or the probability of not increasing generalization error. The surprising result is that from both the Bayesian and frequentist perspectives this can yield the natural gradient direction. Although that direction can be very expensive to compute we develop an efficient, general, online approximation to the natural gradient descent which is suited to large scale problems. We report experimental results showing much faster convergence in computation time and in number of iterations with TONGA (Topmoumoute Online natural Gradient Algorithm) than with stochastic gradient descent, even on very large datasets.}
}

@INPROCEEDINGS{NIPS2007-812,
    author = {Chapados, Nicolas and Bengio, Yoshua},
     title = {Augmented Functional Time Series Representation and Forecasting with Gaussian Processes},
      year = {2008},
     pages = {265--272},
  crossref = {NIPS20-shorter},
  abstract = {We introduce a functional representation of time series which allows forecasts to be performed over an unspecified horizon with progressively-revealed information sets. By virtue of using Gaussian processes, a complete covariance matrix between forecasts at several time-steps is available. This information is put to use in an application to actively trade price spreads between commodity futures contracts. The approach delivers impressive out-of-sample risk-adjusted returns after transaction costs on a portfolio of 30 spreads.}
}

@INPROCEEDINGS{NIPS2007-925,
    author = {Le Roux, Nicolas and Bengio, Yoshua and Lamblin, Pascal and Joliveau, Marc and K{\'{e}}gl, Bal{\'{a}}zs},
     title = {Learning the 2-D Topology of Images},
      year = {2008},
     pages = {841--848},
  crossref = {NIPS20-shorter},
  abstract = {We study the following question: is the two-dimensional structure of images a very strong prior or is it something that can be learned with a few examples of natural images? If someone gave us a learning task involving images for which the two-dimensional topology of pixels was not known, could we discover it automatically and exploit it? For example suppose that the pixels had been permuted in a fixed but unknown way, could we recover the relative two-dimensional location of pixels on images? The surprising result presented here is that not only the answer is yes but that about as few as a thousand images are enough to approximately recover the relative locations of about a thousand pixels. This is achieved using a manifold learning algorithm applied to pixels associated with a measure of distributional similarity between pixel intensities. We compare different topologyextraction approaches and show how having the two-dimensional topology can be exploited.}
}

@INPROCEEDINGS{NIPS21,
     editor = {Koller, D. and Schuurmans, Dale and Bengio, Yoshua and Bottou, {L{\'{e}}on}},
      title = {Advances in Neural Information Processing Systems 21 (NIPS'08)},
  booktitle = {Advances in Neural Information Processing Systems 21 (NIPS'08)},
       year = {-1},
  publisher = {Nips Foundation (http://books.nips.cc)}
}

@INPROCEEDINGS{NIPS21-short,
     editor = {Koller, D. and Schuurmans, Dale and Bengio, Yoshua and Bottou, {L{\'{e}}on}},
      title = {Advances in Neural Information Processing Systems 21 (NIPS'08)},
  booktitle = {NIPS 21},
       year = {-1},
  publisher = {Nips Foundation (http://books.nips.cc)}
}


@INPROCEEDINGS{NIPS22-short,
     editor = {Bengio, Yoshua and Schuurmans, Dale and Williams, Christopher and Lafferty, John and Culotta, Aron},
      title = {Advances in Neural Information Processing Systems 22 (NIPS'09)},
  booktitle = {NIPS 22},
       year = {-1}
}


@INPROCEEDINGS{NIPS3,
     editor = {Lipmann, R. P. and Moody, J. E. and Touretzky, D. S.},
      title = {Advances in Neural Information Processing Systems 3 (NIPS'90)},
  booktitle = {Advances in Neural Information Processing Systems 3 (NIPS'90)},
       year = {-1},
  publisher = {Morgan Kaufmann}
}

@INPROCEEDINGS{NIPS3-short,
     editor = {Lipmann, R. P. and Moody, J. E. and Touretzky, D. S.},
      title = {Advances in Neural Information Processing Systems 3 (NIPS'90)},
  booktitle = {NIPS 3},
       year = {-1},
  publisher = {Morgan Kaufmann}
}


@INPROCEEDINGS{NIPS4-short,
     editor = {Moody, J. E. and Hanson, S. J. and Lipmann, R. P.},
      title = {Advances in Neural Information Processing Systems 4 (NIPS'91)},
  booktitle = {NIPS 4},
       year = {-1},
  publisher = {Morgan Kaufmann}
}


@INPROCEEDINGS{NIPS5,
     editor = {Giles, C.L. and Hanson, S. J. and Cowan, J. D.},
      title = {Advances in Neural Information Processing Systems 5 (NIPS'92)},
  booktitle = {Advances in Neural Information Processing Systems 5 (NIPS'92)},
       year = {-1},
  publisher = {Morgan Kaufmann}
}

@INPROCEEDINGS{NIPS5-short,
     editor = {Giles, C.L. and Hanson, S. J. and Cowan, J. D.},
      title = {Advances in Neural Information Processing Systems 5 (NIPS'92)},
  booktitle = {NIPS 5},
       year = {-1},
  publisher = {Morgan Kaufmann}
}


@INPROCEEDINGS{NIPS6-short,
     editor = {Cowan, J. D. and Tesauro, G. and Alspector, J.},
      title = {Advances in Neural Information Processing Systems 6 (NIPS'93)},
  booktitle = {NIPS 6},
       year = {-1},
  publisher = {MIT Press}
}


@INPROCEEDINGS{NIPS7-short,
     editor = {Tesauro, G. and Touretzky, D. S. and Leen, T. K.},
      title = {Advances in Neural Information Processing Systems 7 (NIPS'94)},
  booktitle = {NIPS 7},
       year = {-1},
  publisher = {MIT Press}
}


@INPROCEEDINGS{NIPS8-short,
     editor = {Touretzky, D. S. and Mozer, M. and Hasselmo, M.E.},
      title = {Advances in Neural Information Processing Systems 8 (NIPS'95)},
  booktitle = {NIPS 8},
       year = {-1},
  publisher = {MIT Press}
}


@INPROCEEDINGS{NIPS9-short,
     editor = {Mozer, M. and Jordan, M.I. and Petsche, T.},
      title = {Advances in Neural Information Processing Systems 9 (NIPS'96)},
  booktitle = {NIPS 9},
       year = {-1},
  publisher = {MIT Press}
}


@INPROCEEDINGS{nnlm:2001:nips,
    author = {Bengio, Yoshua and Ducharme, R{\'{e}}jean and Vincent, Pascal},
     title = {A Neural Probabilistic Language Model},
      year = {2001},
  crossref = {NIPS13-shorter},
  abstract = {A goal of statistical language modeling is to learn the joint probability function of sequences of words. This is intrinsically difficult because of the curse of dimensionality: we propose to fight it with its own weapons. In the proposed approach one learns simultaneously (1) a distributed representation for each word (i.e. a similarity between words) along with (2) the probability function for word sequences, expressed with these representations. Generalization is obtained because a sequence of words that
has never been seen before gets high probability if it is made of words that are similar to words forming an already seen sentence. We report on experiments using neural networks for the probability function, showing on two text corpora that the proposed approach very significantly improves on a state-of-the-art trigram model.},
topics={Markov,Unsupervised,Language},cat={C},
}

@INPROCEEDINGS{nsvn:2000:ijcnn,
     author = {Vincent, Pascal and Bengio, Yoshua},
      title = {A Neural Support Vector Network Architecture with Adaptive Kernels},
  booktitle = {International Joint Conference on Neural Networks 2000},
     volume = {V},
       year = {2000},
      pages = {187--192},
        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/nsvn.pdf},
   abstract = {In the Support Vector Machines ({SVM}) framework, the positive-definite kernel can be seen as representing a fixed similarity measure between two patterns, and a discriminant function is obtained by taking a linear combination of the kernels computed at training examples called support vectors. Here we investigate learning architectures in which the kernel functions can be replaced by more general similarity measures that can have arbitrary internal parameters. The training criterion used in {SVM}s is not appropriate for this purpose so we adopt the simple criterion that is generally used when training neural networks for classification tasks. Several experiments are performed which show that such Neural Support Vector Networks perform similarly to {SVM}s while requiring significantly fewer support vectors, even when the similarity measure has no internal parameters.},
topics={Kernel},cat={C},
}

@INPROCEEDINGS{Ouimet+al-2005,
     author = {Ouimet, Marie and Bengio, Yoshua},
     editor = {Cowell, Robert G. and Ghahramani, Zoubin},
      title = {Greedy Spectral Embedding},
  booktitle = {Proceedings of the Tenth International Workshop on Artificial Intelligence and Statistics},
       year = {2005},
      pages = {253--260},
  publisher = {Society for Artificial Intelligence and Statistics},
        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/greedy-kernel-aistats05.pdf},
   abstract = {Spectral dimensionality reduction methods and spectral clustering methods require computation of the principal eigenvectors of an n X n matrix where n is the number of examples. Following up on previously proposed techniques to speed-up kernel methods by focusing on a subset of m examples, we study a greedy selection procedure for this subset, based on the feature space distance between a candidate example and the span of the previously chosen ones. In the case of kernel {PCA} or spectral clustering this reduces computation to O(m^2 n). For the same computational complexity, we can also compute the feature space projection of the non-selected examples on the subspace spanned by the selected examples, to estimate the embedding function based on all the data, which yields considerably better estimation of the embedding function. This algorithm can be formulated in an online setting and we can bound the error on the approximation of the Gram matrix.},
topics={HighDimensional,kenel},cat={C},
}

@MASTERSTHESIS{Ouimet-Msc-2004,
    author = {Ouimet, Marie},
  keywords = {algorithmes voraces., apprentissage non-supervis{\'{e}}, m{\'{e}}thodes spectrales, noyaux, r{\'{e}}duction de dimensionnalit{\'{e}}},
     title = {R{\'{e}}duction de dimensionnalit{\'{e}} non lin{\'{e}}aire et vorace},
      year = {2004},
    school = {Universit{\'{e}} de Montr{\'{e}}al},
  abstract = {Les m{\'{e}}thodes spectrales de r{\'{e}}duction de dimensionnalit{\'{e}} et les m{\'{e}}thodes de segmentation spectrale exigent le calcul des vecteurs propres principaux d'une matrice de taille n x n o{\`{u}} n est le nombre d'exemples. Des techniques ont {\'{e}}t{\'{e}} propos{\'{e}}es dans la litt{\'{e}}rature pour acc{\'{e}}l{\'{e}}rer les m{\'{e}}thodes {\`{a}} noyau en se concentrant sur un sous-ensemble de m exemples. Nous proposons une proc{\'{e}}dure vorace pour la s{\'{e}}lection de ce sous-ensemble, qui est bas{\'{e}}e sur la distance dans l'espace des caract{\`{e}}ristiques entre un exemple candidat et le sous-espace g{\'{e}}n{\'{e}}r{\'{e}} par les exemples pr{\'{e}}c{\'{e}}demment choisis. Dans le cas de l'ACP {\`{a}} noyau ou de la segmentation spectrale, nous obtenons un algorithme en O(m*m*n), o{\`{u}} m << n, qui, contrairement aux techniques pr{\'{e}}c{\'{e}}demment propos{\'{e}}es, peut se formuler de fa{\c c}on en-ligne. Pour la m{\^{e}}me complexit{\'{e}} en temps, nous pouvons {\'{e}}galement calculer la projection des exemples non choisis sur le sous-espace engendr{\'{e}} par les exemples choisis dans l'espace des caract{\'{e}}ristiques. En repr{\'{e}}sentant ainsi les exemples par leur projection nous obtenons une approximation de plus faible rang de la matrice de Gram sur toutes les donn{\'{e}}es. Nous pouvons {\'{e}}galement borner l'erreur correspondant {\`{a}} cette approximation de la matrice de Gram.}
}

@ARTICLE{paiement+bengio+eck:aij,
    author = {Paiement, Jean-Fran{\c c}ois and Bengio, Samy and Eck, Douglas},
     title = {Probabilistic Models for Melodic Prediction},
   journal = {Artificial Intelligence Journal},
    volume = {173},
      year = {2009},
     pages = {1266-1274},
source={OwnPublication},
sourcetype={Journal},
}

@INPROCEEDINGS{paiement+eck+bengio+barber:icml2005,
     author = {Paiement, Jean-Fran{\c c}ois and Eck, Douglas and Bengio, Samy and Barber, D.},
      title = {A graphical model for chord progressions embedded in a psychoacoustic space},
       year = {2005},
      pages = {641--648},
  publisher = {ACM Press},
   crossref = {ICML05},
source={OwnPublication},
sourcetype={Conference},
}

@INPROCEEDINGS{paiement+eck+bengio:ccai2006,
     author = {Paiement, Jean-Fran{\c c}ois and Eck, Douglas and Bengio, Samy},
     editor = {Lamontagne, Luc and Marchand, Mario},
      title = {Probabilistic Melodic Harmonization},
  booktitle = {Canadian Conference on AI},
     series = {Lecture Notes in Computer Science},
     volume = {4013},
       year = {2006},
      pages = {218-229},
  publisher = {Springer},
source={OwnPublication},
sourcetype={Conference},
}

@INPROCEEDINGS{paiement+eck+bengio:ismir2005,
     author = {Paiement, Jean-Fran{\c c}ois and Eck, Douglas and Bengio, Samy},
      title = {A Probabilistic Model for Chord Progressions},
  booktitle = {{Proceedings of the 6th International Conference on Music Information Retrieval ({ISMIR} 2005)}},
       year = {2005},
      pages = {312-319},
source={OwnPublication},
sourcetype={Conference},
}

@INPROCEEDINGS{paiement+grandvalet+bengio+eck:icml2008,
    author = {Paiement, Jean-Fran{\c c}ois and Grandvalet, Yves and Bengio, Samy and Eck, Douglas},
     title = {A generative model for rhythms},
      year = {2008},
     pages = {},
  crossref = {ICML06-shorter},
source={OwnPublication},
sourcetype={Conference},
}

@UNPUBLISHED{paiement+grandvalet+bengio+eck:nipsworkshop2007,
    author = {Paiement, Jean-Fran{\c c}ois and Grandvalet, Yves and Bengio, Samy and Eck, Douglas},
     title = {A generative model for rhythms},
      year = {2007},
      note = {NIPS 2007 Workshop on Music, Brain and Cognition},
source={OwnPublication},
sourcetype={Workshop},
optkey={""},
optmonth={""},
optannote={""},
}

@MASTERSTHESIS{Paiement-Msc-2003,
    author = {Paiement, Jean-Fran{\c c}ois},
  keywords = {algorithmes, apprentissage, apprentissage non supervis{\'{e}}, forage de donn{\'{e}}es, noyaux, r{\'{e}}duction de dimensions, statistique, Statistiques},
     title = {G{\'{e}}n{\'{e}}ralisation d'algorithmes de r{\'{e}}duction de dimension},
      year = {2003},
    school = {Universit{\'{e}} de Montr{\'{e}}al},
  abstract = {On pr{\'{e}}sente tout d'abord la notion de vari{\'{e}}t{\'{e}} comme r{\'{e}}gion de faible dimension contenant des observations situ{\'{e}}es dans un espace de haute dimension. Cette d{\'{e}}finition justifie l'{\'{e}}laboration d'algorithmes permettant d'exprimer les donn{\'{e}}es dans un syst{\`{e}}me de coordonn{\'{e}}es de dimensions {\'{e}}gale {\`{a}} celle de la vari{\'{e}}t{\'{e}} sur laquelle les donn{\'{e}}es sont approximativement situ{\'{e}}es.
La notion de noyau comme mesure de similarit{\'{e}} est par la suite formalis{\'{e}}e. On constate que l'application d'un noyau {\`{a}} deux observations correspond {\`{a}} l'{\'{e}}valuation d'un produit scalaire dans un espace de Hilbert appel{\'{e}} espace de caract{\'{e}}ristiques.
Une m{\'{e}}thode de r{\'{e}}duction de dimension lin{\'{e}}raire est expos{\'{e}}e ainsi que ces limites. Des algorithmes non lin{\'{e}}raires de r{\'{e}}duction de dimension et de segmentation permettent de s'affranchir de ces limites. Ces derniers ne fournissent cependant pas d'extension directe {\`{a}} des points hors {\'{e}}chantillon.
L'{\'{e}}tape fondamentale au sein des algorithmes pr{\'{e}}sent{\'{e}}s est la solution d'un syst{\`{e}}me de vecteurs propres d'une matrice sym{\'{e}}trique cr{\'{e}}{\'{e}}e {\`{a}} partir d'un noyau d{\'{e}}pendant des donn{\'{e}}es. On con{\c c}oit cd probl{\`{e}}me comme le fait de trouver les fonctions propres d'un op{\'{e}}rateur lin{\'{e}}aire d{\'{e}}fini {\`{a}} partir du m{\^{e}}me noyau. On utilise alors la formulation de Nystr{\"{o}}m, pr{\'{e}}sente dans l'espace en composantes principales {\`{a}} noyaux, afin de r{\'{e}}duire la dimension des points hors {\'{e}}chantillon sur la vase des plongements obtenus {\`{a}} l'aide des algorithmes d{\'{e}}j{\`{a}} mentionn{\'{e}}s.
La qualit{\'{e}} de la projection g{\'{e}}n{\'{e}}r{\'{e}}e est compar{\'{e}}e {\`{a}} la perturbation intrins{\`{e}}que des algorithmes si on substitue certaine observations par d'autres tir{\'{e}}es de la m{\^{e}}me distribution.}
}

@ARTICLE{perez+gers+schmidhuber+eck:2002,
    author = {Perez-Ortiz, J. A. and Gers, F. A. and Eck, Douglas and Schmidhuber, Juergen},
     title = {{K}alman filters improve {LSTM} network performance in problems unsolvable by traditional recurrent nets},
   journal = {Neural Networks},
    volume = {16},
    number = {2},
      year = {2003},
  abstract = {The Long Short-Term Memory ({LSTM}) network trained by gradient descent solves difficult problems which traditional recurrent neural networks in general cannot. We have recently observed that the decoupled extended Kalman filter training algorithm allows for even better performance, reducing significantly the number of training steps when compared to the original gradient descent training algorithm. In this paper we present a set of experiments which are unsolvable by classical recurrent networks but which are solved elegantly and robustly and quickly by {LSTM} combined with Kalman filters.},
source={OwnPublication},
sourcetype={Journal},
}

@ARTICLE{perez+gers+schmidhuber+eck:2003,
    author = {Perez-Ortiz, J. A. and Gers, F. A. and Eck, Douglas and Schmidhuber, Juergen},
     title = {{K}alman filters improve {LSTM} network performance in problems unsolvable by traditional recurrent nets},
   journal = {Neural Networks},
    volume = {16},
    number = {2},
      year = {2003},
     pages = {241--250},
  abstract = {The Long Short-Term Memory ({LSTM}) network trained by gradient descent solves difficult problems which traditional recurrent neural networks in general cannot. We have recently observed that the decoupled extended Kalman filter training algorithm allows for even better performance, reducing significantly the number of training steps when compared to the original gradient descent training algorithm. In this paper we present a set of experiments which are unsolvable by classical recurrent networks but which are solved elegantly and robustly and quickly by {LSTM} combined with Kalman filters.},
source={OwnPublication},
sourcetype={Journal},
}

@INPROCEEDINGS{perez+schmidhuber+gers+eck:icannB2002,
     author = {Perez-Ortiz, J. A. and Schmidhuber, Juergen and Gers, F. A. and Eck, Douglas},
     editor = {Dorronsoro, J.},
      title = {Improving Long-Term Online Prediction with {Decoupled Extended Kalman Filters}},
  booktitle = {{Artificial Neural Networks -- ICANN 2002 (Proceedings)}},
       year = {2002},
      pages = {1055--1060},
  publisher = {Springer},
   abstract = {Long Short-Term Memory ({LSTM}) recurrent neural networks ({RNN}s) outperform traditional {RNN}s when dealing with sequences involving not only short-term but also long-term dependencies. The decoupled extended Kalman filter learning algorithm ({DEKF}) works well in online environments and reduces significantly the number of training steps when compared to the standard gradient-descent algorithms. Previous work on {LSTM}, however, has always used a form of gradient descent and has not focused on true online situations. Here we combine {LSTM} with {DEKF} and show that this new hybrid improves upon the original learning algorithm when applied to online processing.},
source={OwnPublication},
sourcetype={Conference},
}

@TECHREPORT{Pigeon-Bengio-96-aH-TR,
       author = {Pigeon, Steven and Bengio, Yoshua},
        title = {A Memory-Efficient Huffman Adaptive Coding Algorithm for Very Large Sets of Symbols},
       number = {\#1081},
         year = {1997},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/HuffAdapt.pdf},
     abstract = {The problem of computing the minimum redundancy codes as we observe symbols one by one has received a lot of attention. However, existing algorithm implicitly assumes that either we have a small alphabet —  quite typically 256 symbols — or that we have an arbitrary amount of memory at our disposal for the creation of the tree. In real life applications one may need to encode symbols coming from a much larger alphabet, for e.g. coding integers. We now have to deal not with hundreds of symbols but possibly with millions of symbols. While other algorithms use a space proportional to the number of observed symbol, we here propose one that uses space proportional to the number of frequency classes, which is, quite interestingly, always smaller or equal to the number of observed symbols.},
topics={Compression},cat={T},
}

@INPROCEEDINGS{Pigeon-dcc98,
     author = {Pigeon, Steven and Bengio, Yoshua},
     editor = {Society, {IEEE} Computer},
      title = {A Memory-Efficient Adaptive Huffman Coding Algorithm for Very Large Sets of Symbols},
  booktitle = {Data Compression Conference},
       year = {1998},
        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/dcc98.pdf},
   abstract = {The problem of computing the minimum redundancy codes as we observe symbols one by one has received a lot of attention. However, existing algorithms implicitly assumes that either we have a small alphabet — quite typically 256 symbols — or that we have an arbitrary amount of memory at our disposal for the creation of the tree. In real life applications one may need to
encode symbols coming from a much larger alphabet, for e.g. coding integers. We now have to deal not with hundreds of symbols but possibly with millions of symbols. While other algorithms use a space proportional to the number of observed symbols, we here propose one that uses space proportional to the number of frequency classes, which is, quite interestingly, always smaller or equal to the size of the alphabet.},
topics={Compression},cat={C},
}

@INPROCEEDINGS{Pigeon-dcc99,
     author = {Pigeon, Steven and Bengio, Yoshua},
     editor = {Society, {IEEE} Computer},
      title = {Binary Pseudowavelets and Applications to Bilevel Image Processing},
  booktitle = {Data Compression Conference},
       year = {1999},
        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/dcc99.pdf},
   abstract = {This paper shows the existance of binary pseudowavelets, bases on the binary domain that exhibit some of the properties of wavelets, such as multiresolution reconstruction and compact support. The binary pseudowavelets are defined on _n (binary vectors of length n) and are operated upon with the binary operators logical and and exclusive or. The forward transform, or analysis, is the decomposition of a binary vector into its constituant binary pseudowavelets. Binary pseudowavelets allow multiresolution, progressive reconstruction of binary vectors by using progressively more coefficients in the inverse transform. Binary pseudowavelets bases, being sparse matrices, also provide for fast transforms; moreover pseudowavelets rely on hardware-friendly operations for efficient software and hardware implementation.},
topics={Compression},cat={C},
}

@TECHREPORT{Pigeon-Huffman-TR98,
       author = {Pigeon, Steven and Bengio, Yoshua},
        title = {A Memory-Efficient Adaptive Huffman Coding for Very Large Sets of Symbols revisited},
       number = {1095},
         year = {1998},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TechRep_AdaptativeHuffman2.pdf},
     abstract = {While algorithm M (presented in A Memory-Efficient Huffman Adaptive Coding Algorithm for Very Large Sets of Symbols, by Steven Pigeon & Yoshua Bengio, Universit{\'{e}} de Montr{\'{e}}al technical report #1081 [1]) converges to the entropy of the signal, it also assumes that the characteristics of the signal are stationary, that is, that they do not change over time and that successive adjustments, ever decreasing in their magnitude, will lead to a reasonable approximation of the entropy. While this is true for some data, it is clearly not true for some other. We present here a modification of the M algorithm that allows negative updates. Negative updates are used to maintain a window over the source. Symbols enter the window at its right and will leave it at its left, after w steps (the window width). The algorithm presented here allows us to update correctly the weights of the symbols in the symbol tree. Here, we will also have negative migration or demotion, while we only had positive migration or promotion in M. This algorithm will be called M+.},
topics={Compression},cat={T},
}

@PHDTHESIS{Pigeon-Phd-2001,
    author = {Pigeon, Steven},
  keywords = {algorithmes, codes adaptatifs, codes de Golomb, codes universels, Compression de donn{\'{e}}es, compression LZ78, LZW, ondelettes, pseudo-ondelettes},
     title = {Contributions {\`{a}} la compression de donn{\'{e}}es},
      year = {2001},
    school = {Universit{\'{e}} de Montr{\'{e}}al},
  abstract = {L'objectif de cette th{\`{e}}se est de pr{\'{e}}senter nos contributions {\`{a}} la compression de donn{\'{e}}es. Le texte entier n'est pas consacr{\'{e}} {\`{a}} nos seules contributions. Une large part est consacr{\'{e}}e au mat{\'{e}}riel introductif et {\`{a}} la recension de litt{\'{e}}rature sur les sujets qui sont pertinents {\`{a}} nos contributions. Le premier chapitre de contribution, le chapitre "Contribution au codage des entiers" se concentre sur le probl{\`{e}}me de la g{\'{e}}n{\'{e}}ration de codes efficaces pour les entiers. Le chapitre "Codage Huffman Adaptatif" pr{\'{e}}sente deux nouveaux algorithmes pour la g{\'{e}}n{\'{e}}ration dynamique de codes structur{\'{e}}s en arbre, c'est-{\`{a}}-dire des codes de type Huffman. Le chapitre "LZW avec une perte" explore le probl{\`{e}}me de la compression d'images comportant un petit nombre de couleurs distinctes et propose une extension avec perte d'un algorithme originalement sans perte, LZW. Enfin, le dernier chapitre de contribution, le chapitre "Les pseudo-ondelettes binaires" pr{\'{e}}sente une solution original au probl{\`{e}}me de l'analyse multir{\'{e}}solution des images monochromes, c'est-{\`{a}}-dire des images n'ayant que deux couleurs, conventionnellement noir et blanc. Ce type d'image correspond par exemple aux images textuelles telle que produites par un processus de transmission de type facsimil{\'{e}}.}
}

@ARTICLE{Pigeon98,
    author = {Pigeon, Steven and Bengio, Yoshua},
     title = {Memory-Efficient Adaptive Huffman Coding},
   journal = {Dr. Dobb's Journal},
    volume = {290},
      year = {1998},
     pages = {131--135},
topics={Compression},cat={J},
}

@INPROCEEDINGS{probnn:2000:ijcnn,
     author = {Bengio, Yoshua},
      title = {Probabilistic Neural Network Models for Sequential Data},
  booktitle = {International Joint Conference on Neural Networks 2000},
     volume = {V},
       year = {2000},
      pages = {79--84},
        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/81_01.PDF},
   abstract = {It has already been shown how Artificial Neural Networks ({ANN}s) can be incorporated into probabilistic models.
In this paper we review some of the approaches which have been proposed to incorporate them into probabilistic
models of sequential data, such as Hidden {Markov} Models ({HMM}s). We also discuss new developments and new
ideas in this area, in particular how {ANN}s can be used to model high-dimensional discrete and continuous data to
deal with the curse of dimensionality, and how the ideas proposed in these models could be applied to statistical
language modeling to represent longer-term context than allowed by trigram models, while keeping word-order
information.},
topics={Markov},cat={C},
}

@UNPUBLISHED{pugin+burgoyne+eck+fujinaga:nipsworkshop2007,
    author = {Pugin, L. and Burgoyne, J. A. and Eck, Douglas and Fujinaga, I.},
     title = {Book-adaptive and book-dependant models to accelerate digitalization of early music},
      year = {2007},
      note = {NIPS 2007 Workshop on Music, Brain and Cognition},
source={OwnPublication},
sourcetype={Workshop},
optkey={""},
optmonth={""},
optannote={""},
}

@INPROCEEDINGS{Rahim-97,
     author = {Rahim, Mazin and Bengio, Yoshua and {LeCun}, Yann},
      title = {Discriminative feature and model design for automatic speech recognition},
  booktitle = {Proceedings of Eurospeech 1997},
       year = {1997},
      pages = {75--78},
        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/rahim-bengio-lecun-97.ps.gz},
   abstract = {A system for discriminative feature and model design is presented for automatic speech recognition. Training based on minimum classification error with a single objective function is applied for designing a set of parallel networks performing feature transformation and a set of hidden {Markov} models performing speech recognition. This paper compares the use of linear and non-linear functional transformations when applied to conventional recognition features, such as spectrum or cepstrum. It also provides a framework for integrated feature and model training when using class-specific transformations. Experimental results on telephone-based connected digit recognition are presented.},
topics={Speech},cat={C},
}

@ARTICLE{Rivest-2009,
    author = {Rivest, Fran{\c c}ois and Kalaska, John and Bengio, Yoshua},
     title = {Alternative Time Representations in Dopamine Models},
   journal = {Journal of Computational Neuroscience},
    volume = {28},
    number = {1},
      year = {2009},
     pages = {107--130},
  abstract = {Dopaminergic neuron activity has been modeled during learning and appetitive behavior, most commonly using the temporal-difference (TD) algorithm. However, a proper representation of elapsed time and of the exact task is usually required for the model to work. Most models use timing elements such as delay-line representations of time that are not biologically realistic for intervals in the range of seconds. The interval-timing literature provides several alternatives. One of them is that timing could emerge from general network dynamics, instead of coming from a dedicated circuit. Here, we present a general rate-based learning model based on long short-term memory ({LSTM}) networks that learns a time representation when needed. Using a na{\"{\i}}ve network learning its environment in conjunction with TD, we reproduce dopamine activity in appetitive trace conditioning with a constant CS-US interval, including probe trials with unexpected delays. The proposed model learns a representation of the environment dynamics in an adaptive biologically plausible framework, without recourse to delay lines or other special-purpose circuits. Instead, the model predicts that the task-dependent representation of time is learned by experience, is encoded in ramp-like changes in single-neuron activity distributed across small neural networks, and reflects a temporal integration mechanism resulting from the inherent dynamics of recurrent loops within the network. The model also reproduces the known finding that trace conditioning is more difficult than delay conditioning and that the learned representation of the task can be highly dependent on the types of trials experienced during training. Finally, it suggests that the phasic dopaminergic signal could facilitate learning in the cortex.}
}

@ARTICLE{schmidhuber+gers+eck:2002,
    author = {Schmidhuber, Juergen and Gers, F. A. and Eck, Douglas},
     title = {Learning Nonregular Languages: A Comparison of Simple Recurrent Networks and {LSTM}},
   journal = {Neural Computation},
    volume = {14},
    number = {9},
      year = {2002},
     pages = {2039--2041},
  abstract = {In response to Rodriguez' recent article (Rodriguez 2001) we compare the performance of simple recurrent nets and {\em ``Long Short-Term Memory''} ({LSTM}) recurrent nets on context-free and context-sensitive languages.},
source={OwnPublication},
sourcetype={Journal},
}

@TECHREPORT{Schwenk-Bengio-97-TR,
       author = {Schwenk, Holger and Bengio, Yoshua},
        title = {Adaptive Boosting of Neural Networks for Character Recognition},
       number = {\#1072},
         year = {1997},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/AdaBoostTR.pdf},
     abstract = {”Boosting” is a general method for improving the performance of any learning algorithm that consistently generates classifiers which need to perform only slightly better than random guessing. A recently proposed and very promising boosting algorithm is AdaBoost [5]. It has been applied with great success to several benchmark machine learning problems using rather simple learning algorithms [4], in particular decision trees [1, 2, 6]. In this paper we use AdaBoost to improve the performances of neural networks applied to character recognition tasks. We compare training methods based on sampling the training set and weighting the cost function. Our system achieves about 1.4\% error on a data base of online handwritten digits from more than 200 writers. Adaptive boosting of a multi-layer network achieved 2\% error on the UCI Letters offline characters data set.},
topics={Boosting,Speech},cat={T},
}

@INPROCEEDINGS{Schwenk-nips10,
    author = {Schwenk, Holger and Bengio, Yoshua},
     title = {Training Methods for Adaptive Boosting of Neural Networks for Character Recognition},
      year = {1998},
  crossref = {NIPS10-shorter},
  abstract = {”Boosting” is a general method for improving the performance of any learning algorithm that consistently generates classifiers which need to perform only slightly better than random guessing. A recently proposed and very promising boosting algorithm is AdaBoost [5]. It has been applied with great success to several benchmark machine learning problems using rather simple learning algorithms [4], in particular decision trees [1, 2, 6]. In this paper we use AdaBoost to improve the performances of neural networks applied to character recognition tasks. We compare training methods based on sampling the training set and weighting the cost function. Our system achieves about 1.4\% error on a data base of online handwritten digits from more than 200 writers. Adaptive boosting of a multi-layer network achieved 2\% error on the UCI Letters offline characters data set.},
topics={Boosting,Speech},cat={C},
}

@ARTICLE{Schwenk2000,
    author = {Schwenk, Holger and Bengio, Yoshua},
     title = {Boosting Neural Networks},
   journal = {Neural Computation},
    volume = {12},
    number = {8},
      year = {2000},
     pages = {1869--1887},
  abstract = {“Boosting” is a general method for improving the performance of learning algorithms. A recently proposed boosting algorithm is AdaBoost. It has been applied with great success to several benchmark machine learning problems using mainly decision trees as base classifiers. In this paper we investigate whether AdaBoost also works as well with neural networks, and we discuss the advantages and drawbacks of di_erent versions of the AdaBoost algorithm. In particular, we compare training methods based on sampling the training set and weighting the cost function. The results suggest that random resampling of the training data is not the main explanation of the success of the improvements brought by AdaBoost. This is in contrast to Bagging which directly aims at reducing variance and for which random resampling is essential to obtain the reduction in generalization error. Our system achieves about 1.4\% error on a data set of online handwritten digits from more than 200 writers. A boosted multi-layer network achieved 1.5\% error on the UCI Letters and 8.1\% error on the UCI satellite data set, which is significantly better than boosted decision trees.},
topics={Boosting},cat={J},
}

@INPROCEEDINGS{secondorder:2001:nips,
    author = {Dugas, Charles and Bengio, Yoshua and Belisle, Francois and Nadeau, Claude and Garcia, Rene},
     title = {Incorporating Second-Order Functional Knowledge for Better Option Pricing},
      year = {2001},
  crossref = {NIPS13-shorter},
  abstract = {Incorporating prior knowledge of a particular task into the architecture of a learning algorithm can greatly improve generalization performance. We study here a case where we know that the function to be learned is non-decreasing in two of its arguments and convex in one of them. For this purpose we propose a class of functions similar to multi-layer neural networks but (1) that has those properties, (2) is a universal approximator of continuous functions with these and other properties. We apply this new class of functions to the task of modeling the price of call options. Experiments show improvements on regressing the price of call options using the new types of function classes that incorporate the a priori constraints.},
topics={Finance},cat={C},
}

@ARTICLE{Sonnenburg+al-2007,
    author = {Sonnenburg, Soeren and et al. and Vincent, Pascal},
     title = {The Need for Open Source Software in Machine Learning.},
      year = {2007},
      note = {institution: Fraunhofer Publica [http://publica.fraunhofer.de/oai.har] (Germany)},
  crossref = {JMLR-shorter},
  abstract = {all authors: Sonnenburg, S. and Braun, M.L. and Ong, C.S. and Bengio, S. and Bottou, L. and Holmes, G. and {LeCun}, Y. and M{\~{A}}¼ller, K.-R. and Pereira, F. and Rasmussen, C.E. and R{\~{A}}¤tsch, G. and Sch{\~{A}}{\P}lkopf, B. and Smola, A. and Vincent, P. and Weston, J. and Williamson, R.C.

Open source tools have recently reached a level of maturity which makes them suitable for building large-scale real-world systems. At the same time, the field of machine learning has developed a large body of powerful learning algorithms for diverse applications. However, the true potential of these methods is not used, since existing implementations are not openly shared, resulting in software with low usability, and weak interoperability. We argue that this situation can be significantly improved by increasing incentives for researchers to publish their software under an open source model. Additionally, we outline the problems authors are faced with when trying to publish algorithmic implementations of machine learning methods. We believe that a resource of peer reviewed software accompanied by short articles would be highly valuable to both the machine learning and the general scientific community.}
}

@ARTICLE{Takeuchi-Bengio-Kanamori-2002,
    author = {Takeuchi, Ichiro and Bengio, Yoshua and Kanamori, Takafumi},
     title = {Robust Regression with Asymmetric Heavy-Tail Noise Distributions},
   journal = {Neural Computation},
    volume = {14},
    number = {10},
      year = {2002},
     pages = {2469--2496},
  abstract = {In the presence of a heavy-tail noise distribution, regression becomes much more difficult. Traditional robust regression methods assume that the noise distribution is symmetric and they down-weight  the influence of so-called outliers. When the noise distribution is assymetric these methods yield biased regression estimators. Motivated by data-mining problems for the insurance industry, we propose in this paper a new approach to robust regession that is tailored to deal with the case where the noise distribution is asymmetric. The main idea is to learn most of the parameters of the model using conditional quantile estimators (which are biased but robust etimators of the regression), and to lern a few remaining parameters to combbine and correct these stimators, to unbiasedly minimize the average squared error. Theoritical analysis and experiments show the clear advantages of the approach. Results are on artificial data as well as real insurance data, using both linear and neural-network predictors.},
topics={Mining},cat={J},
}

@PHDTHESIS{ThesisChapados2010,
    author = {Chapados, Nicolas},
     title = {Sequential Machine learning Approaches for Portfolio Management},
      year = {2010},
    school = {Universit{\'{e}} de Montr{\'{e}}al},
  abstract = {[English follow]
Cette th{\`{e}}se envisage un ensemble de m{\'{e}}thodes permettant aux algorithmes d'apprentissage statistique de mieux traiter la nature s{\'{e}}quentielle des probl{\`{e}}mes de gestion de portefeuilles financiers. Nous d{\'{e}}butons par une consid{\'{e}}ration du probl{\`{e}}me g{\'{e}}n{\'{e}}ral de la composition d'algorithmes d'apprentissage devant g{\'{e}}rer des t{\^{a}}ches s{\'{e}}quentielles, en particulier celui de la mise-{\`{a}}-jour efficace des ensembles d'apprentissage dans un cadre de validation s{\'{e}}quentielle. Nous {\'{e}}num{\'{e}}rons les desiderata que des primitives de composition doivent satisfaire, et faisons ressortir la difficult{\'{e}} de les atteindre de fa{\c c}on rigoureuse et efficace. Nous poursuivons en pr{\'{e}}sentant un ensemble d'algorithmes qui atteignent ces objectifs et pr{\'{e}}sentons une {\'{e}}tude de cas d'un syst{\`{e}}me complexe de prise de d{\'{e}}cision financi{\`{e}}re utilisant ces techniques. Nous d{\'{e}}crivons ensuite une m{\'{e}}thode g{\'{e}}n{\'{e}}rale permettant de transformer un probl{\`{e}}me de d{\'{e}}cision s{\'{e}}quentielle non-Markovien en un probl{\`{e}}me d'apprentissage supervis{\'{e}} en employant un algorithme de recherche bas{\'{e}} sur les K meilleurs chemins. Nous traitons d'une application en gestion de portefeuille o{\`{u}} nous entra{\^{\i}}nons un algorithme d'apprentissage {\`{a}} optimiser directement un ratio de Sharpe (ou autre crit{\`{e}}re non-additif incorporant une aversion au risque). Nous illustrons l'approche par une {\'{e}}tude exp{\'{e}}rimentale approfondie, proposant une architecture de r{\'{e}}seaux de neurones sp{\'{e}}cialis{\'{e}}e {\`{a}} la gestion de portefeuille et la comparant {\`{a}} plusieurs alternatives. Finalement, nous introduisons une repr{\'{e}}sentation fonctionnelle de s{\'{e}}ries chronologiques permettant {\`{a}} des pr{\'{e}}visions d'{\^{e}}tre effectu{\'{e}}es sur un horizon variable, tout en utilisant un ensemble informationnel r{\'{e}}v{\'{e}}l{\'{e}} de mani{\`{e}}re progressive. L'approche est bas{\'{e}}e sur l'utilisation des processus Gaussiens, lesquels fournissent une matrice de covariance compl{\`{e}}te entre tous les points pour lesquels une pr{\'{e}}vision est demand{\'{e}}e. Cette information est utilis{\'{e}}e {\`{a}} bon escient par un algorithme qui transige activement des {\'{e}}carts de cours (price spreads) entre des contrats {\`{a}} terme sur commodit{\'{e}}s. L'approche propos{\'{e}}e produit, hors {\'{e}}chantillon, un rendement ajust{\'{e}} pour le risque significatif, apr{\`{e}}s frais de transactions, sur un portefeuille de 30 actifs.
This thesis considers a number of approaches to make machine learning algorithms better suited to the sequential nature of financial portfolio management tasks. We start by considering the problem of the general composition of learning algorithms that must handle temporal learning tasks, in particular that of creating and efficiently updating the training sets in a sequential simulation framework. We enumerate the desiderata that composition primitives should satisfy, and underscore the difficulty of rigorously and efficiently reaching them. We follow by introducing a set of algorithms that accomplish the desired objectives, presenting a case-study of a real-world complex learning system for financial decision-making that uses those techniques. We then describe a general method to transform a non-Markovian sequential decision problem into a supervised learning problem using a K-best paths search algorithm. We consider an application in financial portfolio management where we train a learning algorithm to directly optimize a Sharpe Ratio (or other risk-averse non-additive) utility function. We illustrate the approach by demonstrating extensive experimental results using a neural network architecture specialized for portfolio management and compare against well-known alternatives. Finally, we introduce a functional representation of time series which allows forecasts to be performed over an unspecified horizon with progressively-revealed information sets. By virtue of using Gaussian processes, a complete covariance matrix between forecasts at several time-steps is available. This information is put to use in an application to actively trade price spreads between commodity futures contracts. The approach delivers impressive out-of-sample risk-adjusted returns after transaction costs on a portfolio of 30 spreads.}
}

@ARTICLE{Thierry+al-2008,
    author = {Bertin-Mahieux, Thierry and Eck, Douglas and Maillet, Fran{\c c}ois and Lamere, Paul},
     title = {Autotagger: A Model For Predicting Social Tags from Acoustic Features on Large Music Databases},
   journal = {Journal of New Music Research},
      year = {2008},
  abstract = {Social tags are user-generated keywords associated with some resource on the Web. In the case of music, social tags have become an important component of "Web 2.0" recommender systems, allowing users to generate playlists based on use-dependent terms such as chill or jogging that have been applied to particular songs. In this paper, we propose a method for predicting these social tags directly from MP3 files. Using a set of 360 classifiers trained using the online ensemble learning algorithm FilterBoost, we map audio features onto social tags collected from the Web. The resulting automatic tags (or autotags) furnish information about music that is otherwise untagged or poorly tagged, allowing for insertion of previously unheard music into a social recommender. This avoids the “cold-start problem” common in such systems. Autotags can also be used to smooth the tag space from which similarities and
recommendations are made by providing a set of comparable  baseline tags for all tracks in a recommender system. Because the words we learn are the same as those used by people who label their music collections, it is easy to integrate our predictions into existing similarity and prediction methods based on web data.}
}

@ARTICLE{Thivierge+al-2007,
    author = {Thivierge, J. -P. and Rivest, Fran{\c c}ois and Monchi, O},
     title = {Spiking Neurons, Dopamine, and Plasticity: Timing Is Everything, But Concentration Also Matters},
   journal = {Synapse},
    volume = {61},
      year = {2007},
     pages = {375-390},
  abstract = {While both dopamine (DA) fluctuations and spike-timing-dependent plasticity (STDP) are known to influence long-term corticostriatal plasticity, little attention has been devoted to the interaction between these two fundamental mechanisms. Here, a theoretical framework is proposed to account for experimental results specifying the role of presynaptic activation, postsynaptic activation, and concentrations of extracellular DA in synaptic plasticity. Our starting point was an explicitly-implemented multiplicative rule linking STDP to Michaelis-Menton equations that models the dynamics of extracellular DA fluctuations. This rule captures a wide range of results on conditions leading to long-term potentiation and depression in simulations that manipulate the frequency of induced corticostriatal stimulation and DA release. A well-documented biphasic function relating DA concentrations to synaptic plasticity emerges naturally from simulations involving a multiplicative rule linking DA and neural activity. This biphasic function is found consistently across different neural coding schemes employed (voltage-based vs. spike-based models). By comparison, an additive rule fails to capture these results. The proposed framework is the first to generate testable predictions on the dual influence of DA concentrations and STDP on long-term plasticity, suggesting a way in which the biphasic influence of DA concentrations can modulate the direction and magnitude of change induced by STDP, and raising the possibility that DA concentrations may inverse the LTP/LTD components of the STDP rule.}
}

@TECHREPORT{tonga-tr,
       author = {Le Roux, Nicolas and Manzagol, Pierre-Antoine and Bengio, Yoshua},
        title = {Topmoumoute online natural gradient algorithm},
       number = {1299},
         year = {2007},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
     abstract = {Guided by the goal of obtaining an optimization algorithm that is
both fast and yielding good generalization, we study the descent direction maximizing the decrease in generalization error or the probability of not increasing generalization error. The surprising result is that from both the Bayesian and frequentist perspectives this can yield the natural gradient direction. Although that direction can be very expensive to compute we develop an efficient, general, online approximation to the natural gradient descent which is suited to large scale problems. We report experimental results showing much faster convergence in computation time and in number of iterations with TONGA (Topmoumoute Online natural Gradient Algorithm) than with stochastic gradient descent, even on very large datasets.}
}

@TECHREPORT{TR1197,
       author = {Vincent, Pascal and Bengio, Yoshua},
        title = {K-Local Hyperplane and Convex Distance Nearest Neighbor Algorithms},
       number = {1197},
         year = {2001},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TR1197.pdf},
     abstract = {Guided by an initial idea of building a complex (non linear) decision surface with maximal local margin in input space, we give a possible geometrical intuition as to why K-Nearest Neighbor ({KNN}) algorithms often perform more poorly than {SVM}s on classification tasks. We then propose modified K-Nearest Neighbor algorithms to overcome the perceived problem. The approach is similar in spirit to Tangent Distance, but with invariances inferred from the local neighborhood rather than prior knowledge. Experimental results on real world classification tasks suggest that the modified {KNN} algorithms often give a dramatic improvement over standard {KNN} and perform as well or better than {SVM}s.},
topics={Kernel},cat={T},
}

@TECHREPORT{TR1198,
       author = {Takeuchi, Ichiro and Bengio, Yoshua and Kanamori, Takafumi},
        title = {Robust Regression with Asymmetric Heavy-Tail Noise},
       number = {1198},
         year = {2001},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TR1198.pdf},
     abstract = {In the presence of a heavy-tail noise distribution, regression becomes much more difficult. Traditional robust regression methods assume that the noise distribution is symmetric and they downweight the influence of so-called outliers. When the noise distribution is asymmetric these methods yield strongly biased regression estimators. Motivated by data-mining problems for the insurance industry, we propose in this paper a new approach to robust regression that is tailored to deal with the case where the noise distribution is asymmetric. The main idea is to learn most of the parameters of the model using conditional quantile estimators (which are biased but robust estimators of the regression), and to learn a few remaining parameters to combine and correct these estimators, to minimize the average squared error. Theoretical analysis and experiments show the clear advantages of the approach. Results are on artificial data as well as real insurance data, using both linear and neural-network predictors.},
topics={Mining},cat={T},
}

@TECHREPORT{TR1199,
       author = {Chapados, Nicolas and Bengio, Yoshua and Vincent, Pascal and Ghosn, Joumana and Dugas, Charles and Takeuchi, Ichiro and Meng, Linyan},
        title = {Estimating Car Insurance Premia: a Case Study in High-Dimensional Data Inference},
       number = {1199},
         year = {2001},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TR1199.pdf},
     abstract = {Estimating insurance premia from data is a difficult regression problem for several reasons: the large number of variables, many of which are discrete, and the very peculiar shape of the noise distribution, asymmetric with fat tails, with a large majority zeros and a few unreliable and very large values. We introduce a methodology for estimating insurance premia that has been applied in the car insurance industry. It is based on mixtures of specialized neural networks, in order to reduce the effect of outliers on the estimation. Statistical comparisons with several different alternatives, including decision trees and generalized linear models show that the proposed method is significantly more precise, allowing to identify the least and most risky contracts, and reducing the median premium by charging more to the most risky customers.},
topics={HighDimensional,Mining},cat={T},
}

@TECHREPORT{TR1200,
       author = {Bengio, Yoshua and Chapados, Nicolas},
        title = {Extending Metric-Based Model Selection and Regularization in the Absence of Unlabeled Data},
       number = {1200},
         year = {2001},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
          url = {http://www.iro.umontreal.ca/lisa/pointeurs/TR1200.ps},
     abstract = {Metric-based methods have recently been introduced for model selection and regularization, often yielding very significant improvements over all the alternatives tried (including cross-validation). However, these methods require a large set of unlabeled data, which is not always available in many applications. In this paper we extend these methods (TRI, ADJ and ADA) to the case where no unlabeled data is available. The extended methods (xTRI, xADJ, xADA) use a model of the input density directly estimated from the training set. The intuition is that the main reason why the above methods work well is that they make sure that the learned function behaves similarly on the training points and on “neighboring” points. The experiments are based on estimating a simple non-parametric density model. They show that the extended methods perform comparably to the originals even though no unlabeled data is used.},
topics={ModelSelection,Finance},cat={T},
}

@TECHREPORT{TR1215,
       author = {Bengio, Yoshua},
        title = {New Distributed Probabilistic Language Models},
       number = {1215},
         year = {2002},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TR1215.ps},
     abstract = {Our previous work on statistical modeling introduced the use of probabilistic feedforward neural networks with shared parameters in order to help dealing with the curse of dimensionality. This work started with the motivation to speed up the above model and to take advantage of prior knowledge e.g., in WordNet or in syntactically labeled data sets, and to better deal with polysemy. With the objective of reaching these goals, we present here a series of new statistical language models, most of which are yet untested.},
topics={Markov,Language,Unsupervised},cat={T},
}

@TECHREPORT{TR1216,
       author = {Bengio, Yoshua and S{\'{e}}n{\'{e}}cal, Jean-S{\'{e}}bastien},
        title = {Quick Training of Probabilistic Neural Nets by Importance Sampling},
       number = {1216},
         year = {2002},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/tr1216.ps},
     abstract = {Our previous work on statistical language modeling introduced the use of probabilistic feedforward neural networks to help dealing with the curse of dimensionality. Training this model by maximum likelihood however requires for each example to perform as many network passes as there are words in the vocabulary. Inspired by the contrastive divergence model, we proposed and evaluate sampling-based methods which require network passes only for the observed “positive example” and a few sampled negative example words. A very significant speed-up is obtained with an adaptive importance sampling.},
topics={Markov,Language,Unsupervised},cat={T},
}

@TECHREPORT{TR1231,
       author = {Bengio, Yoshua and Kermorvant, Christopher},
        title = {Extracting Hidden Sense Probabilities from Bitexts},
       number = {1231},
         year = {2003},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TR1231.pdf},
     abstract = {We propose a probabilistic model that is inspired by Diab & Resnik’s algorithm to extract disambiguation information from aligned bilingual texts. Like Diab & Resnik’s, the proposed model uses WordNet and the fact that word ambiguities are not always the same in the two languages. The generative model introduces a dependency between two translated words through a common ancestor inWordNet’s ontology. Unlike Diab & Resnik’s algorithm it does not suppose that the translation in the source language has a single meaning.},
topics={Language},cat={T},
}

@TECHREPORT{TR1232,
       author = {Bengio, Yoshua and Vincent, Pascal and Paiement, Jean-Fran{\c c}ois},
        title = {Learning Eigenfunctions of Similarity: Linking Spectral Clustering and Kernel {PCA}},
       number = {1232},
         year = {2003},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TR1232.pdf},
     abstract = {In this paper, we show a direct equivalence between spectral clustering and kernel {PCA}, and how both are special cases of a more general learning problem, that of learning the principal eigenfunctions of a kernel, when the functions are from a Hilbert space whose inner product is defined with respect to a density model. This suggests a new approach to unsupervised learning in which abstractions (such as manifolds and clusters) that represent the main features of the data density are extracted. Abstractions discovered at one level can be used to build higher-level abstractions. This paper also discusses how these abstractions can be used to recover a quantitative model of the input density, which is at least useful for evaluative and comparative purposes.},
topics={HighDimensional,Kernel,Unsupervised},cat={T},
}

@TECHREPORT{TR1234,
       author = {Bengio, Yoshua and Grandvalet, Yves},
        title = {No Unbiased Estimator of the Variance of K-Fold Cros-Validation},
       number = {1234},
         year = {2003},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TR1234.pdf},
     abstract = {Most machine learning researchers perform quantitative experiments to estimate generalization error and compare the performance of different algorithms (in particular, their proposed algorithm). In order to be able to draw statistically convincing conclusions, it is important for them to also estimate the uncertainty around the error (or error difference) estimate. This paper studies the very commonly used K-fold cross-validation estimator of generalization performance. The main theorem shows that there exists no universal (valid under all distributions) unbiased estimator of the variance of K-fold cross-validation. The analysis that accompanies this result is based on the eigen-decomposition of the covariance matrix of errors, which has only three different eigenvalues corresponding to three degrees of freedom of the matrix and three components of the total variance. This analysis helps to better understand the nature of the problem and how it can make na{\"{\i}}ve estimators (that don’t take into account the error correlations due to the overlap between training and test sets) grossly underestimate variance. This is confirmed by numerical experiments in which the three components of the variance are compared when the difficulty of the learning problem and the number of folds are varied.},
topics={Comparative},cat={T},
}

@TECHREPORT{tr1238,
       author = {Bengio, Yoshua and Paiement, Jean-Fran{\c c}ois and Vincent, Pascal},
        title = {Out-of-Sample Extensions for {LLE}, {I}somap, {MDS}, {E}igenmaps, and Spectral Clustering},
       number = {1238},
         year = {2003},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/tr1238.pdf},
     abstract = {Several unsupervised learning algorithms based on an eigendecomposition provide either an embedding or a clustering only for given training points, with no straightforward extension for out-of-sample examples short of recomputing eigenvectors. This paper provides algorithms for such an extension for Local Linear Embedding ({LLE}), Isomap, Laplacian Eigenmaps, Multi-Dimensional Scaling (all algorithms which provide lower-dimensional embedding for dimensionality reduction) as well as for Spectral Clustering (which performs non-Gaussian clustering). These extensions stem from a unified framework in which these algorithms are seen as learning eigenfunctions of a kernel. {LLE} and Isomap pose special challenges as the kernel is training-data dependent. Numerical experiments on real data show that the generalizations performed have a level of error comparable to the variability of the embedding algorithms to the choice of training data.},
topics={HighDimensional,Kernel,Unsupervised},cat={T},
}

@TECHREPORT{tr1239,
       author = {Bengio, Yoshua and Vincent, Pascal and Paiement, Jean-Fran{\c c}ois and Delalleau, Olivier and Ouimet, Marie and Le Roux, Nicolas},
        title = {Spectral Clustering and Kernel {PCA} are Learning Eigenfunctions},
       number = {1239},
         year = {2003},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/tr1239.pdf},
     abstract = {In this paper, we show a direct equivalence between spectral clustering and kernel {PCA}, and how both are special cases of a more general learning problem, that of learning the principal eigenfunctions of a kernel, when the functions are from a function space whose scalar product is defined with respect to a density model. This defines a natural mapping for new data points, for methods that only provided an embedding, such as spectral clustering and Laplacian eigenmaps. The analysis hinges on a notion of generalization for embedding algorithms based on the estimation of underlying eigenfunctions, and suggests ways to improve this generalization by smoothing the data empirical distribution.},
topics={HighDimensional,Kernel,Unsupervised},cat={T},
}

@TECHREPORT{tr1240,
       author = {Vincent, Pascal and Bengio, Yoshua},
        title = {Locally Weighted Full Covariance Gaussian Density Estimation},
       number = {1240},
         year = {2003},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/tr1240.pdf},
     abstract = {We describe an interesting application of the principle of local learning to density estimation. Locally weighted fitting of a Gaussian with a regularized full covariance matrix yields a density estimator which displays improved behavior in the case where much of the probability mass is concentrated along a low dimensional manifold. While the proposed estimator is not guaranteed to integrate to 1 with a finite sample size, we prove asymptotic convergence to the true density. Experimental results illustrating the advantages of this estimator over classic non-parametric estimators are presented.},
topics={HighDimensional,Kernel,Unsupervised},cat={T},
}

@TECHREPORT{tr1247,
       author = {Bengio, Yoshua and Delalleau, Olivier and Le Roux, Nicolas},
        title = {Efficient Non-Parametric Function Induction in Semi-Supervised Learning},
       number = {1247},
         year = {2004},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/tr1247.pdf},
     abstract = {There has been an increase of interest for semi-supervised learning recently, because of the many datasets with large amounts of unlabeled examples and only a few labeled ones. This paper follows up on proposed non-parametric algorithms which provide an estimated continuous label for the given unlabeled examples. It extends them to function induction algorithms that correspond to the minimization of a regularization criterion applied to an out-of-sample example, and happens to have the form of a Parzen windows regressor. The advantage of the extension is that it allows predicting the label for a new example without having to solve again a linear system of dimension n (the number of unlabeled and labeled training examples), which can cost O(n^3). Experiments show that the extension works well, in the sense of predicting a label close to the one that would have been obtained if the test example had been included in the unlabeled set. This relatively efficient function induction procedure can also be used when n is large to approximate the solution by writing it only in terms of a kernel expansion with m << n terms, and reducing the linear system to m equations in m unknowns.},
topics={Kernel,Unsupervised},cat={T},
}

@TECHREPORT{tr1250,
       author = {Bengio, Yoshua and Monperrus, Martin},
        title = {Discovering shared structure in manifold learning},
       number = {1250},
         year = {2004},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/tr-tangent.pdf},
     abstract = {We claim and present arguments to the effect that a large class of manifold learning algorithms that are essentially local will suffer from at least four generic problems associated with (1) noise in the data, (2) curvature of the manifold, (3) dimensionality of the manifold, and (4) the presence of many manifolds with little data per manifold. This analysis suggests non-local manifold learning algorithms which attempt to discover shared structure in the tangent planes at different positions. A criterion for such an algorithm is proposed and experiments estimating a tangent plane prediction function are presented. The function has parameters that are shared across space rather than estimated based on the local neighborhood, as in current non-parametric manifold learning algorithms. The results show clearly the advantages of this approach with respect to local manifold learning algorithms.},
topics={HighDimensional,Kernel,Unsupervised},cat={T},
}

@TECHREPORT{tr1252,
       author = {Bengio, Yoshua and Larochelle, Hugo},
        title = {Implantation et analyse d'un mod{\`{e}}le graphique {\`{a}} entra{\^{\i}}nement supervis{\'{e}}, semi-supervis{\'{e}} et non-supervis{\'{e}} pour la d{\'{e}}sambigu{\"{\i}}sation s{\'{e}}mantique},
       number = {1252},
         year = {2004},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/tr1252.pdf},
     abstract = {La d{\'{e}}sambigu{\"{\i}}sation s{\'{e}}mantique est un sujet qui suscite beaucoup d’int{\'{e}}r{\^{e}}t dans la communaut{\'{e}} scientifique en apprentissage automatique. Quoique cette t{\^{a}}che ait {\'{e}}t{\'{e}} abord{\'{e}}e depuis les d{\'{e}}buts du traitement automatique de la langue, peu de progr{\`{e}}s ont {\'{e}}t{\'{e}} accomplis jusqu’{\`{a}} maintenant. Nous pr{\'{e}}sentons ici une application de d{\'{e}}sambigu{\"{\i}}sation bas{\'{e}}e sur un mod{\`{e}}le graphique probabiliste. Ce mod{\`{e}}le a {\'{e}}t{\'{e}} appris sur des donn{\'{e}}es {\'{e}}tiquet{\'{e}}es, non-{\'{e}}tiquet{\'{e}}es, et sur la hi{\'{e}}rarchie WordNet. Avec peu d’examples d’apprentissage, ses performances sont comparables {\`{a}} celles de l’algorithme de Bayes na{\"{\i}}f. Il pourrait {\'{e}}ventuellement {\^{e}}tre adapt{\'{e}} {\`{a}} des corpus bi-textes.},
topics={Unsupervised,Language},cat={T},
}

@TECHREPORT{tr1281,
       author = {Le Roux, Nicolas and Bengio, Yoshua},
        title = {Continuous Neural Networks},
       number = {1281},
         year = {2006},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/continuous_nnet_tr1281.pdf},
     abstract = {This article extends neural networks to the case of an uncountable number of hidden units, in several
ways. In the first approach proposed, a finite parametrization is possible, allowing gradient-based
learning. While having the same number of parameters as an ordinary neural network, its internal
structure suggests that it can represent some smooth functions much more compactly. Under mild
assumptions, we also find better error bounds than with ordinary neural networks. Furthermore, this
parametrization may help reducing the problem of saturation of the neurons. In a second approach, the
input-to-hidden weights are fully non-parametric, yielding a kernel machine for which we demonstrate
a simple kernel formula. Interestingly, the resulting kernel machine can be made hyperparameter-free
and still generalizes in spite of an absence of explicit regularization.},
cat={T},topics={Kernel,HighDimensional},
}

@TECHREPORT{tr1282,
       author = {Bengio, Yoshua and Lamblin, Pascal and Popovici, Dan and Larochelle, Hugo},
        title = {Greedy Layer-Wise Training of Deep Networks},
       number = {1282},
         year = {2006},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/dbn_supervised_tr1282.pdf},
     abstract = {Deep multi-layer neural networks have many levels of non-linearities, which allows them to potentially
represent very compactly highly non-linear and highly-varying functions. However, until recently it
was not clear how to train such deep networks, since gradient-based optimization starting from random
initialization appears to often get stuck in poor solutions. Hinton et al. recently introduced a greedy
layer-wise unsupervised learning algorithm for Deep Belief Networks (DBN), a generative model with
many layers of hidden causal variables. In the context of the above optimization problem, we study
this algorithm empirically and explore variants to better understand its success and extend it to cases
where the inputs are continuous or where the structure of the input distribution is not revealing enough
about the variable to be predicted in a supervised task.},
cat={T},topics={HighDimensional,Unsupervised},
}

@TECHREPORT{tr1283,
       author = {Carreau, Julie and Bengio, Yoshua},
        title = {A Hybrid {Pareto} Model for Asymmetric Fat-Tail Data},
       number = {1283},
         year = {2006},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/fat_tails_tr1283.pdf},
     abstract = {We propose an estimator for the conditional density p(Y |X) that can adapt for asymmetric heavy tails
which might depend on X. Such estimators have important applications in finance and insurance. We
draw from Extreme Value Theory the tools to build a hybrid unimodal density having a parameter
controlling the heaviness of the upper tail. This hybrid is a Gaussian whose upper tail has been
replaced by a generalized {Pareto} tail. We use this hybrid in a multi-modal mixture in order to obtain
a nonparametric density estimator that can easily adapt for heavy tailed data. To obtain a conditional
density estimator, the parameters of the mixture estimator can be seen as functions of X and these
functions learned. We show experimentally that this approach better models the conditional density in
terms of likelihood than compared competing algorithms: conditional mixture models with other types
of components and multivariate nonparametric models.},
cat={T},topics={Unsupervised,Mining},
}

@TECHREPORT{tr1284,
       author = {Larochelle, Hugo and Bengio, Yoshua},
        title = {Distributed Representation Prediction for Generalization to New Words},
       number = {1284},
         year = {2006},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/dist_rep_pred_tr1284.pdf},
     abstract = {Learning distributed representations of symbols (e.g. words) has been used in several Natural Language Processing
systems. Such representations can capture semantic or syntactic similarities between words, which permit to fight
the curse of dimensionality when considering sequences of such words. Unfortunately, because these representations
are learned only for a previously determined vocabulary of words, it is not clear how to obtain representations
for new words. We present here an approach which gets around this problem by considering the distributed representations
as predictions from low-level or domain-knowledge features of words. We report experiments on a Part
Of Speech tagging task, which demonstrates the success of this approach in learning meaningful representations and
in providing improved accuracy, especially for new words.},
cat={T},topics={HighDimensional,Language},
}

@TECHREPORT{tr1285,
       author = {Grandvalet, Yves and Bengio, Yoshua},
        title = {Hypothesis Testing for Cross-Validation},
       number = {1285},
         year = {2006},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/xv_rho_stat_tr1285.pdf},
     abstract = {K-fold cross-validation produces variable estimates, whose variance cannot be estimated unbiasedly. However, in practice, one would like to
provide a figure related to the variability of this estimate. The first part
of this paper lists a series of restrictive assumptions (on the distribution of
cross-validation residuals) that allow to derive unbiased estimates. We exhibit three such estimates, corresponding to differing assumptions. Their
similar expressions however entail almost identical empirical behaviors.
Then, we look for a conservative test for detecting significant differences
in performances between two algorithms. Our proposal is based on the
derivation of the form of a t-statistic parametrized by the correlation of
residuals between each validation set. Its calibration is compared to the
usual t-test. While the latter is overconfident in concluding that differences are indeed significant, our test is bound to be more skeptical, with
smaller type-I error.},
cat={T},topics={ModelSelection,Comparative},
}

@TECHREPORT{tr1286,
       author = {Erhan, Dumitru and Bengio, Yoshua and {L'Heureux}, Pierre-Jean and Yue, Shi Yi},
        title = {Generalizing to a Zero-Data Task: a Computational Chemistry Case Study},
       number = {1286},
         year = {2006},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/mt_qsar_tr1286.pdf},
     abstract = {We investigate the problem of learning several tasks simultaneously in order to transfer the acquired
knowledge to a completely new task for which no training data are available. Assuming that the tasks
share some representation that we can discover efficiently, such a scenario should lead to a better model of
the new task, as compared to the model that is learned by only using the knowledge of the new task. We
have evaluated several supervised learning algorithms in order to discover shared representations among
the tasks defined in a computational chemistry/drug discovery problem. We have cast the problem from
a statistical learning point of view and set up the general hypotheses that have to be tested in order
to validate the multi-task learning approach. We have then evaluated the performance of the learning
algorithms and showed that it is indeed possible to learn a shared representation of the tasks that allows
to generalize to a new task for which no training data are available. From a theoretical point of view,
our contribution also comprises a modification to the Support Vector Machine algorithm, which can
produce state-of-the-art results using multi-task learning concepts at its core. From a practical point
of view, our contribution is that this algorithm can be readily used by pharmaceutical companies for
virtual screening campaigns.},
cat={T},topics={MultiTask,Kernel,Bioinformatic},
}

@INPROCEEDINGS{Turian+al-2009,
     author = {Turian, Joseph and Bergstra, James and Bengio, Yoshua},
      title = {Quadratic Features and Deep Architectures for Chunking},
  booktitle = {North American Chapter of the Association for Computational Linguistics - Human Language Technologies (NAACL HLT)},
       year = {2009},
      pages = {245--248},
  publisher = {Association for Computational Linguistics},
        url = {http://www.aclweb.org/anthology/N/N09/N09-2062},
   abstract = {We experiment with several chunking models. Deeper architectures achieve better generalization. Quadratic filters, a simplification of theoretical model of V1 complex cells, reliably increase accuracy. In fact, logistic regression with quadratic filters outperforms a standard single hidden layer neural network. Adding quadratic filters to logistic regression is almost as effective as feature engineering. Despite predicting each output label independently, our model is competitive with ones that use previous decisions.}
}

@INPROCEEDINGS{Turian+al-2010,
     author = {Turian, Joseph and Ratinov, Lev and Bengio, Yoshua and Roth, Dan},
      title = {A preliminary evaluation of word representations for named-entity recognition},
  booktitle = {NIPS Workshop on Grammar Induction, Representation of Language and Language Learning},
       year = {2009},
        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/wordrepresentations-ner.pdf},
   abstract = {We use different word representations as word features for a named-entity recognition (NER) system with a linear model. This work is part of a larger empirical survey, evaluating different word representations on different NLP tasks. We evaluate Brown clusters, Collobert and Weston (2008) embeddings, and HLBL (Mnih & Hinton, 2009) embeddings of words. All three representations improve accuracy on NER, with the Brown clusters providing a larger improvement than the two embeddings, and the HLBL embeddings more than the Collobert and Weston (2008) embeddings. We also discuss some of the practical issues in using embeddings as features. Brown clusters are simpler than embeddings because they require less hyperparameter tuning.}
}

@INPROCEEDINGS{Turian+Ratinov+Bengio-2010,
     author = {Turian, Joseph and Ratinov, Lev and Bengio, Yoshua},
      title = {Word representations: A simple and general method for semi-supervised learning},
  booktitle = {Proceedings of the 48th Annual Meeting of the Association for Computational Linguistics(ACL2010)},
       year = {2010},
      pages = {384--394},
  publisher = {Association for Computational Linguistics},
}

@INPROCEEDINGS{Vincent-Bengio-2003,
    author = {Vincent, Pascal and Bengio, Yoshua},
     title = {Manifold Parzen Windows},
      year = {2003},
     pages = {825--832},
  crossref = {NIPS15-shorter},
  abstract = {The similarity between objects is a fundamental element of many learning algorithms. Most non-parametric methods take this similarity to be fixed, but much recent work has shown the advantages of learning it, in particular to exploit the local invariances in the data or to capture the possibly non-linear manifold on which most of the data lies. We propose a new non-parametric kernel density estimation method which captures the local structure of an underlying manifold through the leading eigenvectors of regularized local covariance matrices. Experiments in density estimation show significant improvements with respect to Parzen density estimators. The density estimators can also be used within Bayes classifiers, yielding classification rates similar to {SVM}s and much superior to the Parzen classifier.},
topics={HighDimensional,Kernel,Unsupervised},cat={C},
}

@ARTICLE{Vincent-JMLR-2010,
    author = {Vincent, Pascal and Larochelle, Hugo and Lajoie, Isabelle and Bengio, Yoshua and Manzagol, Pierre-Antoine},
     title = {Stacked Denoising Autoencoders: learning useful representations in a deep network with a local denoising criterion},
   journal = {JMLR},
      year = {2010},
      note = {to appear}
}

@TECHREPORT{Vincent-TR1316,
       author = {Vincent, Pascal and Larochelle, Hugo and Bengio, Yoshua and Manzagol, Pierre-Antoine},
        title = {Extracting and Composing Robust Features with Denoising Autoencoders},
       number = {1316},
         year = {2008},
  institution = {D{\'{e}}partement d'Informatique et Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
          url = {http://www.iro.umontreal.ca/~vincentp/Publications/denoising_autoencoders_tr1316.pdf},
     abstract = {Previous work has shown that the difficulties in learning deep generative or discriminative models can be overcome by an initial unsupervised learning step that maps inputs to useful intermediate representations. We introduce and motivate a new training principle for unsupervised learning of a representation based on the idea of making the learned representations robust to partial corruption of the input pattern. This approach can be used to train autoencoders, and these denoising autoencoders can be stacked to initialize deep architectures. The algorithm can be motivated from a manifold learning and information theoretic perspective or from a generative model perspective. Comparative experiments clearly show the surprising advantage of corrupting the input of autoencoders on a pattern classification benchmark suite.}
}

@PHDTHESIS{Vincent2003,
    author = {Vincent, Pascal},
     title = {Mod{\`{e}}les {\`{a}} Noyaux {\`{a}} Structure Locale},
      year = {2003},
    school = {D{\'{e}}partement d'Informatique et Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
}

@ARTICLE{vincent:2001,
    author = {Vincent, Pascal and Bengio, Yoshua},
     title = {Kernel Matching Pursuit},
   journal = {Machine Learning},
      year = {2001},
  abstract = {We show how Matching Pursuit can be used to build kernel-based solutions to machine-learning problems while keeping control of the sparsity of the solution, and how it can be extended to use non-squared error loss functions. We also deriveMDL motivated generalization bounds for this type of algorithm. Finally, links to boosting algorithms and {RBF} training procedures, as well as extensive experimental comparison with {SVM}s are given, showing comparable results with typically sparser models.},
topics={HighDimensional,Kernel},cat={J},
}

@INPROCEEDINGS{VincentPLarochelleH2008,
    author = {Vincent, Pascal and Larochelle, Hugo and Bengio, Yoshua and Manzagol, Pierre-Antoine},
     title = {Extracting and Composing Robust Features with Denoising Autoencoders},
      year = {2008},
     pages = {1096--1103},
  crossref = {ICML08-shorter},
  abstract = {Recently, many applications for Restricted {Boltzmann} Machines (RBMs) have been developed for a large variety of learning problems. However, RBMs are usually used as feature extractors for another learning algorithm or to provide a good initialization
for deep feed-forward neural network classifiers, and are not considered as a standalone solution to classification problems. In
this paper, we argue that RBMs provide a self-contained framework for deriving competitive non-linear classifiers. We present an evaluation of different learning algorithms for
RBMs which aim at introducing a discriminative component to RBM training and improve their performance as classifiers. This
approach is simple in that RBMs are used directly to build a classifier, rather than as a stepping stone. Finally, we demonstrate how discriminative RBMs can also be successfully employed in a semi-supervised setting.}
}

@TECHREPORT{visualization_techreport,
       author = {Erhan, Dumitru and Bengio, Yoshua and Courville, Aaron and Vincent, Pascal},
        title = {Visualizing Higher-Layer Features of a Deep Network},
       number = {1341},
         year = {2009},
  institution = {University of Montreal},
     abstract = {Deep architectures have demonstrated state-of-the-art results in a variety of
settings, especially with vision datasets. Beyond the model definitions and the quantitative analyses, there is a need for qualitative comparisons of the solutions learned by various deep architectures. The goal of this paper is to find good qualitative interpretations of high level features represented by such models. To this end, we contrast and compare several techniques applied on Stacked Denoising Autoencoders and Deep Belief Networks, trained on several vision datasets. We show that, perhaps counter-intuitively, such interpretation is possible at the unit level, that it is simple to accomplish and that the results are consistent across various techniques. We hope that such techniques will allow researchers in deep architectures to understand more of how and why deep architectures work}
}

@INPROCEEDINGS{xAISTATS2009-short,
      title = {Proc. AISTATS'2009},
  booktitle = {Proc. AISTATS'2009},
       year = {2009}
}


@MISC{Yoshua+al-snowbird-2008,
        author = {Bengio, Yoshua and Larochelle, Hugo and Turian, Joseph},
         title = {Deep Woods},
          year = {2008},
  howpublished = {Poster presented at the Learning@Snowbird Workshop, Snowbird, USA, 2008}
}

@ARTICLE{Zaccaro-et-al-2005,
    author = {Zaccaro, Maria Clara and Boon, Hong and Pattarawarapan, Mookda and Xia, Zebin and Caron, Antoine and {L'Heureux}, Pierre-Jean and Bengio, Yoshua and Burgess, Kevin and Saragori, H. Uri},
     title = {Selective Small Molecule Peptidomimetic Ligands of TrkC and TrkA Receptors Afford Discrete or Complete Neurotrophic Activities},
   journal = {Chemistry \& Biology},
    volume = {12},
    number = {9},
      year = {2005},
     pages = {1015--1028}
}



crossreferenced publications: 
@INPROCEEDINGS{ICML09,
     editor = {Bottou, {L{\'{e}}on} and Littman, Michael},
      title = {Proceedings of the Twenty-sixth International Conference on Machine Learning (ICML'09)},
  booktitle = {Proceedings of the Twenty-sixth International Conference on Machine Learning (ICML'09)},
       year = {-1},
  publisher = {ACM}
}

@INPROCEEDINGS{NIPS7,
     editor = {Tesauro, G. and Touretzky, D. S. and Leen, T. K.},
      title = {Advances in Neural Information Processing Systems 7 (NIPS'94)},
  booktitle = {Advances in Neural Information Processing Systems 7 (NIPS'94)},
       year = {-1},
  publisher = {MIT Press}
}

@INPROCEEDINGS{NIPS6,
     editor = {Cowan, J. D. and Tesauro, G. and Alspector, J.},
      title = {Advances in Neural Information Processing Systems 6 (NIPS'93)},
  booktitle = {Advances in Neural Information Processing Systems 6 (NIPS'93)},
       year = {-1},
  publisher = {MIT Press}
}

@INPROCEEDINGS{NIPS8,
     editor = {Touretzky, D. S. and Mozer, M. and Hasselmo, M.E.},
      title = {Advances in Neural Information Processing Systems 8 (NIPS'95)},
  booktitle = {Advances in Neural Information Processing Systems 8 (NIPS'95)},
       year = {-1},
  publisher = {MIT Press}
}

@ARTICLE{JMLR,
   journal = {Journal of Machine Learning Research},
      year = {-1}
}

@INPROCEEDINGS{NIPS19,
     editor = {{Sch{\"{o}}lkopf}, Bernhard and Platt, John and Hoffman, Thomas},
      title = {Advances in Neural Information Processing Systems 19 (NIPS'06)},
  booktitle = {Advances in Neural Information Processing Systems 19 (NIPS'06)},
       year = {-1},
  publisher = {MIT Press}
}

@INPROCEEDINGS{NIPS10,
     editor = {Jordan, M.I. and Kearns, M.J. and Solla, S.A.},
      title = {Advances in Neural Information Processing Systems 10 (NIPS'97)},
  booktitle = {Advances in Neural Information Processing Systems 10 (NIPS'97)},
       year = {-1},
  publisher = {MIT Press}
}

@INPROCEEDINGS{NIPS1,
     editor = {Touretzky, D. S.},
      title = {Advances in Neural Information Processing Systems 1 (NIPS'88)},
  booktitle = {Advances in Neural Information Processing Systems 1 (NIPS'88)},
       year = {-1},
  publisher = {Morgan Kaufmann}
}

@INPROCEEDINGS{NIPS2,
     editor = {Touretzky, D. S.},
      title = {Advances in Neural Information Processing Systems 2 (NIPS'89)},
  booktitle = {Advances in Neural Information Processing Systems 2 (NIPS'89)},
       year = {-1},
  publisher = {Morgan Kaufmann}
}

@INPROCEEDINGS{NIPS4,
     editor = {Moody, J. E. and Hanson, S. J. and Lipmann, R. P.},
      title = {Advances in Neural Information Processing Systems 4 (NIPS'91)},
  booktitle = {Advances in Neural Information Processing Systems 4 (NIPS'91)},
       year = {-1},
  publisher = {Morgan Kaufmann}
}

@INPROCEEDINGS{NIPS12,
     editor = {Solla, S.A. and Leen, T. K.},
      title = {Advances in Neural Information Processing Systems 12 (NIPS'99)},
  booktitle = {Advances in Neural Information Processing Systems 12 (NIPS'99)},
       year = {-1},
  publisher = {MIT Press}
}

@INPROCEEDINGS{NIPS16,
     editor = {Becker, S. and Saul, L. and {Sch{\"{o}}lkopf}, Bernhard},
      title = {Advances in Neural Information Processing Systems 16 (NIPS'03)},
  booktitle = {Advances in Neural Information Processing Systems 16 (NIPS'03)},
       year = {-1}
}

@INPROCEEDINGS{NIPS22,
     editor = {Bengio, Yoshua and Schuurmans, Dale and Williams, Christopher and Lafferty, John and Culotta, Aron},
      title = {Advances in Neural Information Processing Systems 22 (NIPS'09)},
  booktitle = {Advances in Neural Information Processing Systems 22 (NIPS'09)},
       year = {-1}
}

@INPROCEEDINGS{NIPS20,
     editor = {Platt, John and Koller, D. and Singer, Yoram and Roweis, S.},
      title = {Advances in Neural Information Processing Systems 20 (NIPS'07)},
  booktitle = {Advances in Neural Information Processing Systems 20 (NIPS'07)},
       year = {-1},
  publisher = {MIT Press}
}

@INPROCEEDINGS{xAISTATS2009,
      title = {Proceedings of the Twelfth International Conference on Artificial Intelligence and Statistics (AISTATS 2009)},
  booktitle = {Proceedings of the Twelfth International Conference on Artificial Intelligence and Statistics (AISTATS 2009)},
       year = {2009},
}

@INPROCEEDINGS{NIPS9,
     editor = {Mozer, M. and Jordan, M.I. and Petsche, T.},
      title = {Advances in Neural Information Processing Systems 9 (NIPS'96)},
  booktitle = {Advances in Neural Information Processing Systems 9 (NIPS'96)},
       year = {-1},
  publisher = {MIT Press}
}

@INPROCEEDINGS{NIPS17,
     editor = {Saul, Lawrence K. and Weiss, Yair and Bottou, {L{\'{e}}on}},
      title = {Advances in Neural Information Processing Systems 17 (NIPS'04)},
  booktitle = {Advances in Neural Information Processing Systems 17 (NIPS'04)},
       year = {-1}
}

@PROCEEDINGS{ICML08,
     editor = {Cohen, William W. and McCallum, Andrew and Roweis, Sam T.},
      title = {Proceedings of the Twenty-fifth International Conference on Machine Learning (ICML'08)},
  booktitle = {Proceedings of the Twenty-fifth International Conference on Machine Learning (ICML'08)},
       year = {2008},
  publisher = {ACM}
}

@PROCEEDINGS{ICML07,
     editor = {Ghahramani, Zoubin},
      title = {Proceedings of the 24th International Conference on Machine Learning (ICML'07)},
  booktitle = {Proceedings of the 24th International Conference on Machine Learning (ICML'07)},
       year = {2007},
  publisher = {ACM}
}

@TECHREPORT{DIRO,
        title = {DIRO},
         year = {-1},
  institution = {D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
}

@INPROCEEDINGS{NIPS18,
     editor = {Weiss, Yair and {Sch{\"{o}}lkopf}, Bernhard and Platt, John},
      title = {Advances in Neural Information Processing Systems 18 (NIPS'05)},
  booktitle = {Advances in Neural Information Processing Systems 18 (NIPS'05)},
       year = {-1},
  publisher = {MIT Press}
}

@INPROCEEDINGS{NIPS13,
     editor = {Leen, T. K. and Dietterich, T.G.},
      title = {Advances in Neural Information Processing Systems 13 (NIPS'00)},
  booktitle = {Advances in Neural Information Processing Systems 13 (NIPS'00)},
       year = {-1},
  publisher = {MIT Press}
}

@INPROCEEDINGS{ICML05,
     editor = {Raedt, Luc De and Wrobel, Stefan},
      title = {Proceedings of the Twenty-second International Conference on Machine Learning (ICML'05)},
  booktitle = {Proceedings of the Twenty-second International Conference on Machine Learning (ICML'05)},
       year = {-1},
  publisher = {ACM}
}

@INPROCEEDINGS{ICML06,
     editor = {Cohen, William W. and Moore, Andrew},
      title = {Proceedings of the Twenty-three International Conference on Machine Learning (ICML'06)},
  booktitle = {Proceedings of the Twenty-three International Conference on Machine Learning (ICML'06)},
       year = {-1},
  publisher = {ACM}
}

@INPROCEEDINGS{NIPS15,
     editor = {Becker, S. and Thrun, Sebastian},
      title = {Advances in Neural Information Processing Systems 15 (NIPS'02)},
  booktitle = {Advances in Neural Information Processing Systems 15 (NIPS'02)},
       year = {-1},
  publisher = {MIT Press}
}

@INPROCEEDINGS{ICML01-shorter,
      title = {ICML'01},
  booktitle = {ICML'01},
       year = {-1},
  publisher = {Morgan Kaufmann}
}
@INPROCEEDINGS{ICML02-shorter,
      title = {ICML'02},
  booktitle = {ICML'02},
       year = {-1},
  publisher = {Morgan Kaufmann}
}
@INPROCEEDINGS{ICML03-shorter,
      title = {ICML'03},
  booktitle = {ICML'03},
       year = {-1},
  publisher = {AAAI Press}
}
@INPROCEEDINGS{ICML04-shorter,
      title = {ICML'04},
  booktitle = {ICML'04},
       year = {-1},
  publisher = {ACM}
}
@INPROCEEDINGS{ICML05-shorter,
      title = {ICML'05},
  booktitle = {ICML'05},
       year = {-1},
  publisher = {ACM}
}
@INPROCEEDINGS{ICML06-shorter,
      title = {ICML'06},
  booktitle = {ICML'06},
       year = {-1},
  publisher = {ACM}
}
@INPROCEEDINGS{ICML07-shorter,
      title = {ICML'07},
  booktitle = {ICML'07},
       year = {-1},
  publisher = {ACM}
}
@INPROCEEDINGS{ICML08-shorter,
      title = {ICML'08},
  booktitle = {ICML'08},
       year = {-1},
  publisher = {ACM}
}
@INPROCEEDINGS{ICML09-shorter,
      title = {ICML'09},
  booktitle = {ICML'09},
       year = {-1},
  publisher = {ACM}
}
@INPROCEEDINGS{ICML96-shorter,
      title = {ICML'96},
  booktitle = {ICML'96},
       year = {-1},
  publisher = {Morgan Kaufmann}
}
@INPROCEEDINGS{ICML97-shorter,
      title = {ICML'97},
  booktitle = {ICML'97},
       year = {-1},
  publisher = {Morgan Kaufmann}
}
@INPROCEEDINGS{ICML98-shorter,
      title = {ICML'98},
  booktitle = {ICML'98},
       year = {-1},
  publisher = {Morgan Kaufmann}
}
@INPROCEEDINGS{ICML99-shorter,
      title = {ICML'99},
  booktitle = {ICML'99},
       year = {-1},
  publisher = {Morgan Kaufmann}
}
@ARTICLE{JMLR-shorter,
   journal = {JMLR},
      year = {-1}
}
@INPROCEEDINGS{NIPS1-shorter,
      title = {NIPS'88},
  booktitle = {NIPS 1},
       year = {-1},
  publisher = {Morgan Kaufmann}
}
@INPROCEEDINGS{NIPS10-shorter,
      title = {NIPS'97},
  booktitle = {NIPS 10},
       year = {-1},
  publisher = {MIT Press}
}
@INPROCEEDINGS{NIPS11-shorter,
      title = {NIPS'98},
  booktitle = {NIPS 11},
       year = {-1},
  publisher = {MIT Press}
}
@INPROCEEDINGS{NIPS12-shorter,
      title = {NIPS'99},
  booktitle = {NIPS 12},
       year = {-1},
  publisher = {MIT Press}
}
@INPROCEEDINGS{NIPS13-shorter,
      title = {NIPS'00},
  booktitle = {NIPS 13},
       year = {-1},
  publisher = {MIT Press}
}
@INPROCEEDINGS{NIPS14-shorter,
      title = {NIPS'01},
  booktitle = {NIPS 14},
       year = {-1},
  publisher = {MIT Press}
}
@INPROCEEDINGS{NIPS15-shorter,
      title = {NIPS'02},
  booktitle = {NIPS 15},
       year = {-1},
  publisher = {MIT Press}
}
@INPROCEEDINGS{NIPS16-shorter,
      title = {NIPS'03},
  booktitle = {NIPS 16},
       year = {-1}
}
@INPROCEEDINGS{NIPS17-shorter,
      title = {NIPS'04},
  booktitle = {NIPS 17},
       year = {-1}
}
@INPROCEEDINGS{NIPS18-shorter,
      title = {NIPS'05},
  booktitle = {NIPS 18},
       year = {-1},
  publisher = {MIT Press}
}
@INPROCEEDINGS{NIPS19-shorter,
      title = {NIPS'06},
  booktitle = {NIPS 19},
       year = {-1},
  publisher = {MIT Press}
}
@INPROCEEDINGS{NIPS2-shorter,
      title = {NIPS'89},
  booktitle = {NIPS 2},
       year = {-1},
  publisher = {Morgan Kaufmann}
}
@INPROCEEDINGS{NIPS20-shorter,
      title = {NIPS'07},
  booktitle = {NIPS 20},
       year = {-1},
  publisher = {MIT Press}
}
@INPROCEEDINGS{NIPS21-shorter,
      title = {NIPS'08},
  booktitle = {NIPS 21},
       year = {-1},
  publisher = {Nips Foundation (http://books.nips.cc)}
}
@INPROCEEDINGS{NIPS22-shorter,
      title = {NIPS'09},
  booktitle = {NIPS 22},
       year = {-1}
}
@INPROCEEDINGS{NIPS3-shorter,
      title = {NIPS'90},
  booktitle = {NIPS 3},
       year = {-1},
  publisher = {Morgan Kaufmann}
}
@INPROCEEDINGS{NIPS4-shorter,
      title = {NIPS'91},
  booktitle = {NIPS 4},
       year = {-1},
  publisher = {Morgan Kaufmann}
}
@INPROCEEDINGS{NIPS5-shorter,
      title = {NIPS'92},
  booktitle = {NIPS 5},
       year = {-1},
  publisher = {Morgan Kaufmann}
}
@INPROCEEDINGS{NIPS6-shorter,
      title = {NIPS'93},
  booktitle = {NIPS 6},
       year = {-1},
  publisher = {MIT Press}
}
@INPROCEEDINGS{NIPS7-shorter,
      title = {NIPS'94},
  booktitle = {NIPS 7},
       year = {-1},
  publisher = {MIT Press}
}
@INPROCEEDINGS{NIPS8-shorter,
      title = {NIPS'95},
  booktitle = {NIPS 8},
       year = {-1},
  publisher = {MIT Press}
}
@INPROCEEDINGS{NIPS9-shorter,
      title = {NIPS'96},
  booktitle = {NIPS 9},
       year = {-1},
  publisher = {MIT Press}
}
@INPROCEEDINGS{xAISTATS2009-shorter,
      title = {AISTATS'2009},
  booktitle = {AISTATS'2009},
       year = {-1}
}