changeset 500:8479bf822d0e

merge
author Yoshua Bengio <bengioy@iro.umontreal.ca>
date Tue, 01 Jun 2010 12:13:10 -0400
parents 2b58eda9fc08 (current diff) 7ff00c27c976 (diff)
children 5927432d8b8d
files writeup/ml.bib writeup/nips2010_submission.tex
diffstat 8 files changed, 31493 insertions(+), 25818 deletions(-) [+]
line wrap: on
line diff
--- a/.hgignore	Tue Jun 01 12:12:52 2010 -0400
+++ b/.hgignore	Tue Jun 01 12:13:10 2010 -0400
@@ -3,4 +3,7 @@
 
 *.pyc
 *~
-
+*.aux
+*.blg
+*.log
+*.bbl
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/writeup/aigaion-shorter.bib	Tue Jun 01 12:13:10 2010 -0400
@@ -0,0 +1,4842 @@
+%Aigaion2 BibTeX export from LISA - Publications
+%Tuesday 01 June 2010 10:46:52 AM
+@INPROCEEDINGS{Attardi+al-2009,
+     author = {Attardi, Giuseppe and Dell'Orletta, Felice and Simi, Maria and Turian, Joseph},
+   keywords = {classifier, dependency parsing, natural language, parser, perceptron},
+      title = {Accurate Dependency Parsing with a Stacked Multilayer Perceptron},
+  booktitle = {Proceedings of Evalita 2009},
+     series = {LNCS},
+       year = {2009},
+  publisher = {Springer},
+   abstract = {DeSR is a statistical transition-based dependency parser which learns from annotated corpora which actions to perform for building parse trees while scanning a sentence. We describe recent improvements to the parser, in particular stacked parsing, exploiting a beam search strategy and using a Multilayer Perceptron classifier. For the Evalita 2009 Dependency Parsing task DeSR was configured to use a combination of stacked parsers. The stacked combination achieved the best accuracy scores in both the main and pilot subtasks. The contribution to the result of various choices is analyzed, in particular for taking advantage of the peculiar features of the TUT Treebank.}
+}
+
+@INPROCEEDINGS{Bengio+al-2009,
+    author = {Bengio, Yoshua and Louradour, Jerome and Collobert, Ronan and Weston, Jason},
+     title = {Curriculum Learning},
+      year = {2009},
+  crossref = {ICML09-shorter},
+  abstract = {Humans and animals learn much better when the examples are not randomly presented but organized in a meaningful order which illustrates gradually more concepts, and more complex ones. Here, we formalize such training strategies in the context of machine learning, and call them 'curriculum learning'. In the context of recent research studying the difficulty of training in the presence of non-convex training criteria (for deep deterministic and stochastic neural networks), we explore curriculum learning in various set-ups. The experiments show that significant improvements in generalization can be achieved by using a particular curriculum, i.e., the selection and order of training examples. We hypothesize that curriculum learning has both an effect on the speed of convergence of the training process to a minimum and, in the case of non-convex criteria, on the quality of the local minima obtained: curriculum learning can be seen as a particular form of continuation method (a general strategy for global optimization of non-convex functions).}
+}
+
+@TECHREPORT{Bengio+al-2009-TR,
+       author = {Bengio, Yoshua and Louradour, Jerome and Collobert, Ronan and Weston, Jason},
+        title = {Curriculum Learning},
+       number = {1330},
+         year = {2009},
+  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+     abstract = {Humans and animals learn much better when the examples are not randomly presented but organized in a meaningful order which illustrates gradually more concepts, and gradually more complex ones. Here, we formalize such training strategies in the context of machine learning, and call them 'curriculum learning'. In the context of recent research studying the difficulty of training in the presence of non-convex training criteria (for deep deterministic and stochastic neural networks), we explore curriculum learning in various set-ups. The experiments show that significant improvements in generalization can be achieved. We hypothesize that curriculum learning has both an effect on the speed of convergence of the training process to a minimum and, in the case of non-convex criteria, on the quality of the local minima obtained: curriculum learning can be seen as a particular form of continuation method (a general strategy for global optimization of non-convex functions).}
+}
+
+@MISC{Bengio+al-patent-2000,
+        author = {Bengio, Yoshua and Bottou, {L{\'{e}}on} and {LeCun}, Yann},
+         title = {Module for constructing trainable modular  network in which each module outputs and inputs data structured as a graph},
+          year = {2000},
+  howpublished = {U.S.  Patent 6,128,606, October 3}
+}
+
+@MISC{Bengio+al-patent-2001,
+        author = {Bengio, Yoshua and Bottou, {L{\'{e}}on} and Howard, Paul G.},
+         title = {Z-Coder : a fast adaptive binary arithmetic  coder},
+          year = {2001},
+  howpublished = {U.S. Patent 6,188,334, February 13, 2001, along with patents 6,225,925,  6,281,817, and 6,476,740}
+}
+
+@MISC{Bengio+al-patent-94,
+        author = {Bengio, Yoshua and {LeCun}, Yann and Nohl, Craig and Burges, Chris},
+         title = {Visitor Registration System Using Automatic Handwriting Recognition},
+          year = {1994},
+  howpublished = {Patent submitted in the U.S.A. in October 1994, submission number 1-16-18-1}
+}
+
+@INCOLLECTION{Bengio+al-spectral-2006,
+     author = {Bengio, Yoshua and Delalleau, Olivier and Le Roux, Nicolas and Paiement, Jean-Fran{\c c}ois and Vincent, Pascal and Ouimet, Marie},
+     editor = {Guyon, Isabelle and Gunn, Steve and Nikravesh, Masoud and Zadeh, Lotfi},
+      title = {Spectral Dimensionality Reduction},
+  booktitle = {Feature Extraction, Foundations and Applications},
+       year = {2006},
+  publisher = {Springer},
+        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/eigenfn_chapter.pdf},
+   abstract = {In this chapter, we study and put under a common framework a number
+of non-linear dimensionality reduction methods, such as Locally Linear Embedding,
+Isomap, Laplacian eigenmaps and kernel {PCA}, which are based
+on performing an eigen-decomposition (hence the name ``spectral''). That
+framework also includes classical methods such as {PCA} and metric multidimensional
+scaling ({MDS}). It also includes the data transformation step used
+in spectral clustering. We show that in all of these cases the learning algorithm
+estimates the principal eigenfunctions of an operator that depends on
+the unknown data density and on a kernel that is not necessarily positive
+semi-definite. This helps to generalize some of these algorithms so as to predict
+an embedding for out-of-sample examples without having to retrain the
+model. It also makes it more transparent what these algorithms are minimizing
+on the empirical data and gives a corresponding notion of generalization
+error.},
+cat={B},topics={HighDimensional,Kernel,Unsupervised},
+}
+
+@INCOLLECTION{Bengio+al-ssl-2006,
+     author = {Bengio, Yoshua and Delalleau, Olivier and Le Roux, Nicolas},
+     editor = {Chapelle, Olivier and {Sch{\"{o}}lkopf}, Bernhard and Zien, Alexander},
+      title = {Label Propagation and Quadratic Criterion},
+  booktitle = {Semi-Supervised Learning},
+       year = {2006},
+      pages = {193--216},
+  publisher = {{MIT} Press},
+        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio_ssl.pdf},
+   abstract = {Various graph-based algorithms for semi-supervised learning have been proposed in
+the recent literature. They rely on the idea of building a graph whose nodes are
+data points (labeled and unlabeled) and edges represent similarities between points.
+Known labels are used to propagate information through the graph in order to label
+all nodes. In this chapter, we show how these different algorithms can be cast into
+a common framework where one minimizes a quadratic cost criterion whose closed-form solution is found by solving a linear system of size n (total number of data
+points). The cost criterion naturally leads to an extension of such algorithms to
+the inductive setting, where one obtains test samples one at a time: the derived
+induction formula can be evaluated in O(n) time, which is much more efficient
+than solving again exactly the linear system (which in general costs $O(kn^2)$ time
+for a sparse graph where each data point has k neighbors). We also use this inductive
+formula to show that when the similarity between points satisfies a locality property,
+then the algorithms are plagued by the curse of dimensionality, with respect to the
+dimensionality of an underlying manifold.},
+cat={B},topics={Unsupervised},
+}
+
+@TECHREPORT{Bengio+al-treecurse-2007,
+       author = {Bengio, Yoshua and Delalleau, Olivier and Simard, Clarence},
+        title = {Decision Trees do not Generalize to New Variations},
+       number = {1304},
+         year = {2007},
+  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio+al-tr1304.pdf}
+}
+
+@INPROCEEDINGS{Bengio+Bengio96,
+     author = {Bengio, Samy and Bengio, Yoshua},
+     editor = {Xu, L.},
+      title = {An {EM} Algorithm for Asynchronous Input/Output Hidden {M}arkov Models},
+  booktitle = {International Conference On Neural Information Processing},
+       year = {1996},
+      pages = {328--334},
+        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/iconip96.pdf},
+   abstract = {In learning tasks in which input sequences are mapped to output sequences, it is often the case that the input and output sequences are not synchronous. For example, in speech recognition, acoustic sequences are longer than phoneme sequences. Input/Output Hidden {Markov} Models have already been proposed to represent the distribution of an output sequence given an input sequence of the same length. We extend here this model to the case of asynchronous sequences and show an Expectation-Maximization algorithm for training such models.},
+topics={Markov},cat={C},
+}
+
+@INCOLLECTION{Bengio+chapter2007,
+     author = {Bengio, Yoshua and {LeCun}, Yann},
+     editor = {Bottou, {L{\'{e}}on} and Chapelle, Olivier and DeCoste, D. and Weston, J.},
+      title = {Scaling Learning Algorithms towards {AI}},
+  booktitle = {Large Scale Kernel Machines},
+       year = {2007},
+  publisher = {MIT Press},
+        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio+lecun_chapter2007.pdf},
+   abstract = {One long-term goal of machine learning research is to produce methods that
+are applicable to highly complex tasks, such as perception (vision, audition), reasoning,
+intelligent control, and other artificially intelligent behaviors. We argue
+that in order to progress toward this goal, the Machine Learning community must
+endeavor to discover algorithms that can learn highly complex functions, with minimal
+need for prior knowledge, and with minimal human intervention. We present
+mathematical and empirical evidence suggesting that many popular approaches
+to non-parametric learning, particularly kernel methods, are fundamentally limited
+in their ability to learn complex high-dimensional functions. Our analysis
+focuses on two problems. First, kernel machines are shallow architectures, in
+which one large layer of simple template matchers is followed by a single layer
+of trainable coefficients. We argue that shallow architectures can be very inefficient
+in terms of required number of computational elements and examples. Second,
+we analyze a limitation of kernel machines with a local kernel, linked to the
+curse of dimensionality, that applies to supervised, unsupervised (manifold learning)
+and semi-supervised kernel machines. Using empirical results on invariant
+image recognition tasks, kernel methods are compared with deep architectures, in
+which lower-level features or concepts are progressively combined into more abstract
+and higher-level representations. We argue that deep architectures have the
+potential to generalize in non-local ways, i.e., beyond immediate neighbors, and
+that this is crucial in order to make progress on the kind of complex tasks required
+for artificial intelligence.},
+cat={B},topics={HighDimensional},
+}
+
+@ARTICLE{Bengio+Delalleau-2009,
+    author = {Bengio, Yoshua and Delalleau, Olivier},
+     title = {Justifying and Generalizing Contrastive Divergence},
+   journal = {Neural Computation},
+    volume = {21},
+    number = {6},
+      year = {2009},
+     pages = {1601--1621},
+  abstract = {We study an expansion of the log-likelihood in undirected graphical models such as the Restricted {Boltzmann} Machine (RBM), where each term in the expansion is associated with a sample in a Gibbs chain alternating between two random variables (the visible vector and the hidden vector, in RBMs). We are particularly interested in estimators of the gradient of the log-likelihood obtained through this expansion. We show that its residual term converges to zero, justifying the use of a truncation, i.e. running only a short Gibbs chain, which is the main idea behind the Contrastive Divergence (CD) estimator of the log-likelihood gradient. By truncating even more, we obtain a stochastic reconstruction error, related through a mean-field approximation to the reconstruction error often used to train autoassociators and stacked auto-associators.  The derivation is not specific to the particular parametric forms used in RBMs, and only requires convergence of the Gibbs chain. We present theoretical and empirical evidence linking the number of Gibbs steps $k$ and the magnitude of the RBM parameters to the bias in the CD estimator. These experiments also suggest that the sign of the CD estimator is correct most of the time, even when the bias is large, so that CD-$k$ is a good descent direction even for small $k$.}
+}
+
+@TECHREPORT{Bengio+Delalleau-TR2007,
+       author = {Bengio, Yoshua and Delalleau, Olivier},
+     keywords = {Contrastive Divergence, Restricted {Boltzmann} Machine},
+        title = {Justifying and Generalizing Contrastive Divergence},
+       number = {1311},
+         year = {2007},
+  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+     abstract = {We study an expansion of the log-likelihood in undirected graphical models such as the Restricted {Boltzmann} Machine (RBM), where each term in the expansion is associated with a sample in a Gibbs chain alternating between two random variables (the visible vector and the hidden vector, in RBMs). We are particularly interested in estimators of the gradient of the log-likelihood obtained through this expansion. We show that its terms converge to zero, justifying the use of a truncation, i.e. running only a short Gibbs chain, which is the main idea behind the Contrastive Divergence approximation of the log-likelihood gradient. By truncating even more, we obtain a stochastic reconstruction error, related through a mean-field approximation to the reconstruction error often used to train autoassociators and stacked auto-associators. The derivation is not specific to the particular parametric forms used in RBMs, and only requires convergence of the Gibbs chain.}
+}
+
+@INPROCEEDINGS{Bengio+DeMori88,
+     author = {Bengio, Yoshua and De Mori, Renato},
+      title = {Use of neural networks for the recognition of place of articulation},
+  booktitle = {International Conference on Acoustics, Speech and Signal Processing},
+       year = {1988},
+      pages = {103--106},
+topics={Speech},cat={C},
+}
+
+@INPROCEEDINGS{Bengio+DeMori89,
+     author = {Bengio, Yoshua and Cardin, Regis and Cosi, Piero and De Mori, Renato},
+      title = {Speech coding with multi-layer networks},
+  booktitle = {International Conference on Acoustics, Speech and Signal Processing},
+       year = {1989},
+      pages = {164--167},
+topics={Speech},cat={C},
+}
+
+@INCOLLECTION{Bengio+DeMori90a,
+     author = {Bengio, Yoshua and De Mori, Renato},
+     editor = {Sethi, I. K. and Jain, A. K.},
+      title = {Connectionist models and their application to automatic speech recognition},
+  booktitle = {Artificial Neural Networks and Statistical Pattern Recognition: Old and New Connections},
+       year = {1990},
+      pages = {175--192},
+  publisher = {Elsevier, Machine Intelligence and Pattern Recognition Series},
+topics={Speech},cat={B},
+}
+
+@ARTICLE{Bengio+Frasconi-jair95,
+    author = {Bengio, Yoshua and Frasconi, Paolo},
+     title = {Diffusion of Context and Credit Information in {M}arkovian Models},
+   journal = {Journal of Artificial Intelligence Research},
+    volume = {3},
+      year = {1995},
+     pages = {249--270},
+  abstract = {This paper studies the problem of ergodicity of transition probability matrices in {Markovian} models, such as hidden {Markov} models ({HMM}s), and how it makes very difficult the task of learning to represent long-term context for sequential data. This phenomenon hurts the forward propagation of long-term context information, as well as learning a hidden state representation to represent long-term context, which depends on propagating credit information backwards in time. Using results from {Markov} chain theory, we show that this problem of diffusion of context and credit is reduced when the transition probabilities approach 0 or 1, i.e., the transition probability matrices are sparse and the model essentially deterministic. The results found in this paper apply to learning approaches based on continuous optimization, such as gradient descent and the Baum-Welch algorithm.},
+topics={Markov,LongTerm},cat={J},
+}
+
+@INPROCEEDINGS{Bengio+Frasconi-nips7-diffuse,
+    author = {Bengio, Yoshua and Frasconi, Paolo},
+     title = {Diffusion of Credit in {M}arkovian Models},
+      year = {1995},
+     pages = {553--560},
+  crossref = {NIPS7-shorter},
+  abstract = {This paper studies the problem of diffusion in {Markovian} models, such as hidden {Markov} models ({HMM}s) and how it makes very difficult the task of learning of long-term dependencies in sequences. Using results from {Markov} chain theory, we show that the problem of diffusion is reduced if the transition probabilities approach 0 or 1. Under this condition, standard {HMM}s have very limited modeling capabilities, but input/output {HMM}s can still perform interesting computations.},
+topics={Markov},cat={C},
+}
+
+@INPROCEEDINGS{Bengio+Frasconi-nips7-iohmms,
+    author = {Bengio, Yoshua and Frasconi, Paolo},
+     title = {An Input/Output {HMM} Architecture},
+      year = {1995},
+     pages = {427--434},
+  crossref = {NIPS7-shorter},
+  abstract = {We introduce a recurrent architecture having a modular structure and we formulate a training procedure based on the {EM} algorithm. The resulting model has similarities to hidden {Markov} models, but supports recurrent networks processing style and allows to exploit the supervised learning paradigm while using maximum likelihood estimation.},
+topics={Markov},cat={C},
+}
+
+@INPROCEEDINGS{Bengio+Frasconi-nips94,
+    author = {Bengio, Yoshua and Frasconi, Paolo},
+     title = {Credit Assignment through Time: Alternatives to Backpropagation},
+      year = {1994},
+     pages = {75--82},
+  crossref = {NIPS6-shorter},
+  abstract = {Learning to recognize or predict sequences using long-term context has many applications. However, practical and theoretical problems are found in training recurrent neural networks to perform tasks in which input/output dependencies span long intervals. Starting from a mathematical analysis of the problem, we consider and compare alternative algorithms and architectures on tasks for which the span of the input/output dependencies can be controlled. Results on the new algorithms show performance qualitatively superior to that obtained with backpropagation.},
+topics={LongTerm},cat={C},
+}
+
+@ARTICLE{Bengio+Pouliot90,
+    author = {Bengio, Yoshua and Pouliot, Yannick},
+     title = {Efficient recognition of immunoglobulin domains from amino-acid sequences using a neural network},
+   journal = {Computer Applications in the Biosciences},
+    volume = {6},
+    number = {2},
+      year = {1990},
+     pages = {319--324},
+topics={Bioinformatic,PriorKnowledge},cat={J},
+}
+
+@INPROCEEDINGS{Bengio+Senecal-2003,
+     author = {Bengio, Yoshua and S{\'{e}}n{\'{e}}cal, Jean-S{\'{e}}bastien},
+      title = {Quick Training of Probabilistic Neural Nets by Importance Sampling},
+  booktitle = {Proceedings of the conference on Artificial Intelligence and Statistics (AISTATS)},
+       year = {2003},
+   abstract = {Our previous work on statistical language modeling introduced the use of probabilistic feedforward neural networks to help dealing with the curse of dimensionality. Training this model by maximum likelihood however requires for each example to perform as many network passes as there are words in the vocabulary.  Inspired by the contrastive divergence model, we propose and evaluate sampling-based methods which require network passes only for the observed ``positive example'' and a few sampled negative example words. A very significant speed-up is obtained with an adaptive importance sampling.}
+}
+
+@ARTICLE{Bengio+Senecal-2008,
+    author = {Bengio, Yoshua and S{\'{e}}n{\'{e}}cal, Jean-S{\'{e}}bastien},
+  keywords = {Energy-based models, fast training, importance sampling, language modeling, Monte Carlo methods, probabilistic neural networks},
+     title = {Adaptive Importance Sampling to Accelerate Training of a Neural Probabilistic Language Model},
+   journal = {IEEE Transactions on Neural Networks},
+    volume = {19},
+    number = {4},
+      year = {2008},
+     pages = {713--722},
+  abstract = {Previous work on statistical language modeling has shown that it is possible to train a feedforward neural network to approximate probabilities over sequences of words, resulting in significant error reduction when compared to standard baseline models based on n-grams. However, training the neural network model with the maximum-likelihood criterion requires computations proportional to the number of words in the vocabulary. In this paper, we introduce adaptive importance sampling as a way to accelerate training of the model. The idea is to use an adaptive n-gram model to track the conditional distributions produced by the neural network. We show that a very significant speedup can be obtained on standard problems.}
+}
+
+@INCOLLECTION{Bengio-2007,
+     author = {Bengio, Yoshua},
+     editor = {Cisek, Paul and Kalaska, John and Drew, Trevor},
+      title = {On the Challenge of Learning Complex Functions},
+  booktitle = {Computational Neuroscience: Theoretical Insights into Brain Function},
+     series = {Progress in Brain Research},
+       year = {2007},
+  publisher = {Elsevier},
+        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/PBR_chapter.pdf},
+   abstract = {A common goal of computational neuroscience and of artificial intelligence
+research based on statistical learning algorithms is the discovery and
+understanding of computational principles that could explain what we
+consider adaptive intelligence, in animals as well as in machines. This
+chapter focuses on what is required for the learning of complex behaviors. We
+believe it involves the learning of highly varying functions, in a
+mathematical sense. We bring forward two types of arguments which convey
+the message that many currently popular machine learning approaches to
+learning flexible functions have fundamental limitations that render them 
+inappropriate for learning highly varying functions. The first issue
+concerns the representation of such functions with what we call shallow model
+architectures.  We discuss limitations of shallow architectures, such as
+so-called kernel machines, boosting algorithms, and one-hidden-layer artificial neural
+networks.  The second issue is more focused and concerns kernel machines
+with a local kernel (the type used most often in practice),
+that act like a collection of template matching units. We present
+mathematical results on such computational architectures showing that they
+have a limitation similar to those already proved for older non-parametric
+methods, and connected to the so-called curse of dimensionality. Though it has long
+been believed that efficient learning in deep architectures is difficult,
+recently proposed computational principles for learning in deep architectures
+may offer a breakthrough.}
+}
+
+@ARTICLE{Bengio-2009,
+    author = {Bengio, Yoshua},
+     title = {Learning deep architectures for {AI}},
+   journal = {Foundations and Trends in Machine Learning},
+    volume = {2},
+    number = {1},
+      year = {2009},
+     pages = {1--127},
+      note = {Also published as a book. Now Publishers, 2009.},
+  abstract = {Theoretical results suggest that in order to learn the kind of
+complicated functions that can represent high-level abstractions (e.g. in
+vision, language, and other AI-level tasks), one may need {\em deep
+architectures}. Deep architectures are composed of multiple levels of non-linear
+operations, such as in neural nets with many hidden layers or in complicated
+propositional formulae re-using many sub-formulae. Searching the
+parameter space of deep architectures is a difficult task, but
+learning algorithms such as those for Deep Belief Networks have recently been proposed
+to tackle this problem with notable success, beating the state-of-the-art
+in certain areas. This paper discusses the motivations and principles regarding 
+learning algorithms for deep architectures,  in particular those exploiting as
+building blocks unsupervised learning of single-layer models such as Restricted {Boltzmann} Machines,
+used to construct deeper models such as Deep Belief Networks.}
+}
+
+@TECHREPORT{Bengio-96-TR,
+       author = {Bengio, Yoshua},
+        title = {Using a Financial Training Criterion Rather than a Prediction Criterion},
+       number = {1019},
+         year = {1996},
+  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengioy_TR1019.pdf},
+     abstract = {The application of this work is to decision taking with financial time-series, using learning algorithms. The traditional approach is to train a model using a prediction criterion, such as minimizing the squared error between predictions and actual values of a dependent variable, or maximizing the likelihood of a conditional model of the dependent variable. We find here with noisy time-series that better results can be obtained when the model is directly trained in order to optimize the financial criterion of interest. Experiments were performed on portfolio selection with 35 Canadian stocks.},
+topics={Finance,Discriminant},cat={T},
+}
+
+@BOOK{bengio-book96,
+     author = {Bengio, Yoshua},
+      title = {Neural Networks for Speech and Sequence Recognition},
+       year = {1996},
+  publisher = {International Thomson Computer Press},
+topics={Speech},cat={B},
+}
+
+@TECHREPORT{Bengio-convex-05,
+       author = {Bengio, Yoshua and Le Roux, Nicolas and Vincent, Pascal and Delalleau, Olivier and Marcotte, Patrice},
+        title = {Convex neural networks},
+       number = {1263},
+         year = {2005},
+  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TR1263.pdf},
+     abstract = {Convexity has recently received a lot of attention in the machine learning community, and the lack of convexity has been seen as a major disadvantage of many learning algorithms, such as multi-layer artificial neural networks. We show that training multi-layer neural networks in which the number of hidden units is learned can be viewed as a convex optimization problem. This problem involves an infinite number of variables, but can be solved by incrementally inserting a hidden unit at a time, each time finding a linear classifier that minimizes a weighted sum of errors.},
+topics={Boosting},cat={T},
+}
+
+@ARTICLE{Bengio-decision-trees10,
+    author = {Bengio, Yoshua and Delalleau, Olivier and Simard, Clarence},
+     title = {Decision Trees do not Generalize to New Variations},
+   journal = {Computational Intelligence},
+      year = {2010},
+      note = {To appear}
+}
+
+@ARTICLE{bengio-demori89,
+    author = {Bengio, Yoshua and De Mori, Renato},
+     title = {Use of multilayer networks for the recognition of phonetic features and phonemes},
+   journal = {Computational Intelligence},
+    volume = {5},
+      year = {1989},
+     pages = {134--141},
+topics={Speech},cat={J},
+}
+
+@ARTICLE{Bengio-eigen-NC2004,
+    author = {Bengio, Yoshua and Delalleau, Olivier and Le Roux, Nicolas and Paiement, Jean-Fran{\c c}ois and Vincent, Pascal and Ouimet, Marie},
+     title = {Learning eigenfunctions links spectral embedding and kernel {PCA}},
+   journal = {Neural Computation},
+    volume = {16},
+    number = {10},
+      year = {2004},
+     pages = {2197--2219},
+  abstract = {In this paper, we show a direct relation between spectral embedding methods and kernel {PCA}, and how both are special cases of a more general learning problem, that of learning the principal eigenfunctions of an operator defined from a kernel and the unknown data generating density. Whereas spectral embedding methods only provided coordinates for the training points, the analysis justifies a simple extension to out-of-sample examples (the Nystr{\"{o}}m formula) for Multi-Dimensional Scaling, spectral clustering, Laplacian eigenmaps, Locally Linear Embedding ({LLE}) and Isomap. The analysis provides, for all such spectral embedding methods, the definition of a loss function, whose empirical average is minimized by the traditional algorithms. The asymptotic expected value of that loss defines a generalization performance and clarifies what these algorithms are trying to learn. Experiments with {LLE}, Isomap, spectral clustering and {MDS} show that this out-of-sample embedding formula generalizes well, with a level of error comparable to the effect of small perturbations of the training set on the embedding.},
+topics={HighDimensional,Kernel,Unsupervised},cat={J},
+}
+
+@INPROCEEDINGS{Bengio-Gingras-nips8,
+    author = {Bengio, Yoshua and Gingras, Fran{\c c}ois},
+     title = {Recurrent Neural Networks for Missing or Asynchronous Data},
+      year = {1996},
+     pages = {395--401},
+  crossref = {NIPS8-shorter},
+  abstract = {In this paper we propose recurrent neural networks with feedback into the input units for handling two types of data analysis problems. On the one hand, this scheme can be used for static data when some of the input variables are missing. On the other hand, it can also be used for sequential data, when some of the input variables are missing or are available at different frequencies. Unlike in the case of probabilistic models (e.g. Gaussian) of the missing variables, the network does not attempt to model the distribution of the missing variables given the observed variables. Instead it is a more discriminant approach that fills in the missing variables for the sole purpose of minimizing a learning criterion (e.g., to minimize an output error).},
+topics={Finance,Missing},cat={C},
+}
+
+@ARTICLE{Bengio-Grandvalet-JMLR-04,
+    author = {Bengio, Yoshua and Grandvalet, Yves},
+     title = {No Unbiased Estimator of the Variance of K-Fold Cross-Validation},
+    volume = {5},
+      year = {2004},
+     pages = {1089--1105},
+  crossref = {JMLR-shorter},
+  abstract = {Most machine learning researchers perform quantitative experiments to estimate generalization error and compare the performance of different algorithms (in particular, their proposed algorithm). In order to be able to draw statistically convincing conclusions, it is important to estimate the uncertainty of such estimates. This paper studies the very commonly used K-fold cross-validation estimator of generalization performance. The main theorem shows that there exists no universal (valid under all distributions) unbiased estimator of the variance of K-fold cross-validation. The analysis that accompanies this result is based on the eigen-decomposition of the covariance matrix of errors, which has only three different eigenvalues corresponding to three degrees of freedom of the matrix and three components of the total variance. This analysis helps to better understand the nature of the problem and how it can make naive estimators (that don't take into account the error correlations due to the overlap between training and test sets) grossly underestimate variance. This is confirmed by numerical experiments in which the three components of the variance are compared when the difficulty of the learning problem and the number of folds are varied.},
+topics={Comparative},cat={J},
+}
+
+@TECHREPORT{bengio-hyper-TR99,
+       author = {Bengio, Yoshua},
+        title = {Continuous Optimization of Hyper-Parameters},
+       number = {1144},
+         year = {1999},
+  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/hyperTR.pdf},
+     abstract = {Many machine learning algorithms can be formulated as the minimization of a training criterion which involves (1) ``training errors'' on each training example and (2) some hyper-parameters, which are kept fixed during this minimization. When there is only a single hyper-parameter one can easily explore how its value affects a model selection criterion (that is not the same as the training criterion, and is used to select hyper-parameters). In this paper we present a methodology to select many hyper-parameters that is based on the computation of the gradient of a model selection criterion with respect to the hyper-parameters. We first consider the case of a training criterion that is quadratic in the parameters. In that case, the gradient of the selection criterion with respect to the hyper-parameters is efficiently computed by back-propagating through a Cholesky decomposition. In the more general case, we show that the implicit function theorem can be used to derive a formula for the hyper-parameter gradient, but this formula requires the computation of second derivatives of the training criterion.},
+topics={ModelSelection},cat={T},
+}
+
+@INPROCEEDINGS{Bengio-icnn93,
+     author = {Bengio, Yoshua and Frasconi, Paolo and Simard, Patrice},
+      title = {The problem of learning long-term dependencies in recurrent networks},
+  booktitle = {IEEE International Conference on Neural Networks},
+       year = {1993},
+      pages = {1183--1195},
+  publisher = {IEEE Press},
+       note = {(invited paper)},
+topics={LongTerm},cat={C},
+}
+
+@ARTICLE{Bengio-ijprai93,
+    author = {Bengio, Yoshua},
+     title = {A Connectionist Approach to Speech Recognition},
+   journal = {International Journal on Pattern Recognition and Artificial Intelligence},
+    volume = {7},
+    number = {4},
+      year = {1993},
+     pages = {647--668},
+  abstract = {The task discussed in this paper is that of learning to map input sequences to output sequences. In particular, problems of phoneme recognition in continuous speech are considered, but most of the discussed techniques could be applied to other tasks, such as the recognition of sequences of handwritten characters. The systems considered in this paper are based on connectionist models, or artificial neural networks, sometimes combined with statistical techniques for recognition of sequences of patterns, stressing the integration of prior knowledge and learning. Different architectures for sequence and speech recognition are reviewed, including recurrent networks as well as hybrid systems involving hidden {Markov} models.},
+topics={PriorKnowledge,Speech},cat={J},
+}
+
+@TECHREPORT{Bengio-iohmms-TR99,
+       author = {Bengio, Yoshua and Lauzon, Vincent-Philippe and Ducharme, R{\'{e}}jean},
+        title = {Experiments on the Application of {IOHMM}s to Model Financial Returns Series},
+       number = {1146},
+         year = {1999},
+  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/iohmms-returnsTR.pdf},
+     abstract = {Input/Output Hidden {Markov} Models ({IOHMM}s) are conditional hidden {Markov} models in which the emission (and possibly the transition) probabilities can be conditioned on an input sequence. For example, these conditional distributions can be linear, logistic, or non-linear (using for example multi-layer neural networks). We compare the generalization performance of several models which are special cases of Input/Output Hidden {Markov} Models on financial time-series prediction tasks: an unconditional Gaussian, a conditional linear Gaussian, a mixture of Gaussians, a mixture of conditional linear Gaussians, a hidden {Markov} model, and various {IOHMM}s. The experiments are performed on modeling the returns of market and sector indices. Note that the unconditional Gaussian estimates the first moment with the historical average. The results show that, although for the first moment the historical average gives the best results, for the higher moments, the {IOHMM}s yielded significantly better performance, as measured by the out-of-sample likelihood.},
+topics={Markov},cat={T},
+}
+
+@ARTICLE{bengio-lauzon-ducharme:2000,
+    author = {Bengio, Yoshua and Lauzon, Vincent-Philippe and Ducharme, R{\'{e}}jean},
+     title = {Experiments on the Application of {IOHMM}s to Model Financial Returns Series},
+   journal = {IEEE Transactions on Neural Networks},
+    volume = {12},
+    number = {1},
+      year = {2001},
+     pages = {113--123},
+  abstract = {Input/Output Hidden {Markov} Models ({IOHMM}s) are conditional hidden {Markov} models in which the emission (and possibly the transition) probabilities can be conditioned on an input sequence. For example, these conditional distributions can be logistic, or non-linear (using for example multi-layer neural networks). We compare generalization performance of several models which are special cases of Input/Output Hidden {Markov} Models on financial time-series prediction tasks: an unconditional Gaussian, a conditional linear Gaussian, a mixture of Gaussians, a mixture of conditional linear Gaussians, a hidden {Markov} model, and various {IOHMM}s. The experiments compare these models on predicting the conditional density of returns of market sector indices. Note that the unconditional Gaussian estimates the first moment with the historical average. The results show that, although for the first moment the historical average gives the best results, for the higher moments, the {IOHMM}s yielded significantly better performance, as estimated by the out-of-sample likelihood.},
+topics={Markov,Finance},cat={J},
+}
+
+@INPROCEEDINGS{bengio-lecun-94,
+     author = {Bengio, Yoshua and {LeCun}, Yann},
+      title = {Word normalization for on-line handwritten word recognition},
+  booktitle = {Proc. of the International Conference on Pattern Recognition},
+     volume = {II},
+       year = {1994},
+      pages = {409--413},
+  publisher = {IEEE},
+        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/icpr-norm.ps},
+   abstract = {We introduce a new approach to normalizing words written with an electronic stylus that applies to all styles of handwriting (upper case, lower case, printed, cursive, or mixed). A geometrical model of the word spatial structure is fitted to the pen trajectory using the {EM} algorithm. The fitting process maximizes the likelihood of the trajectory given the model and a set of priors on its parameters. The method was evaluated and integrated to a recognition system that combines neural networks and hidden {Markov} models.},
+topics={PriorKnowledge,Speech},cat={C},
+}
+
+@TECHREPORT{Bengio-localfailure-TR-2005,
+       author = {Bengio, Yoshua and Delalleau, Olivier and Le Roux, Nicolas},
+        title = {The Curse of Dimensionality for Local Kernel Machines},
+       number = {1258},
+         year = {2005},
+  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/tr1258.pdf},
+     abstract = {We present a series of theoretical arguments supporting the claim that a large class of modern learning algorithms based on local kernels are sensitive to the curse of dimensionality. These include local manifold learning algorithms such as Isomap and {LLE}, support vector classifiers with Gaussian or other local kernels, and graph-based semisupervised learning algorithms using a local similarity function. These algorithms are shown to be local in the sense that crucial properties of the learned function at x depend mostly on the neighbors of x in the training set. This makes them sensitive to the curse of dimensionality, well studied for classical non-parametric statistical learning. There
+is a large class of data distributions for which non-local solutions could be expressed compactly and potentially be learned with few examples, but which will require a large number of local bases and therefore a large number of training examples when using a local learning algorithm.},
+topics={HighDimensional,Kernel,Unsupervised},cat={T},
+}
+
+@INPROCEEDINGS{Bengio-nips-2006,
+    author = {Bengio, Yoshua and Lamblin, Pascal and Popovici, Dan and Larochelle, Hugo},
+     title = {Greedy Layer-Wise Training of Deep Networks},
+      year = {2007},
+     pages = {153--160},
+  crossref = {NIPS19-shorter},
+  abstract = {Complexity theory of circuits strongly suggests that deep architectures can be
+much more efficient (sometimes exponentially) than shallow architectures,
+in terms of computational elements required to represent some functions.
+Deep multi-layer neural networks have many levels of non-linearities
+allowing them to compactly represent highly non-linear and
+highly-varying functions. However, until recently it was not clear how
+to train such deep networks, since gradient-based
+optimization starting from random initialization appears to often get stuck
+in poor solutions. Hinton et al. recently introduced
+a greedy layer-wise unsupervised learning algorithm for Deep Belief
+Networks (DBN), a generative model with many layers of hidden causal
+variables. In the context of the above optimization problem,
+we study this algorithm empirically and explore variants to
+better understand its success and extend it to cases where the inputs are
+continuous or where the structure of the input distribution is not
+revealing enough about the variable to be predicted in a supervised task.
+Our experiments also confirm the hypothesis that the greedy
+layer-wise unsupervised training strategy mostly helps the
+optimization, by initializing weights in a region near a
+good local minimum, giving rise to internal distributed representations
+that are high-level abstractions of the input, bringing better generalization.}
+}
+
+@INPROCEEDINGS{Bengio-nips10,
+    author = {Bengio, Yoshua and Bengio, Samy and Isabelle, Jean-Fran{\c c}ois and Singer, Yoram},
+     title = {Shared Context Probabilistic Transducers},
+      year = {1998},
+  crossref = {NIPS10-shorter},
+  abstract = {Recently, a model for supervised learning of probabilistic transducers represented by suffix trees was introduced. However, this algorithm tends to build very large trees, requiring very large amounts of computer memory. In this paper, we propose a new, more compact, transducer model in which one shares the parameters of distributions associated to contexts yielding similar conditional output distributions. We illustrate the advantages of the proposed algorithm with comparative experiments on inducing a noun phrase recognizer.},
+topics={HighDimensional},cat={C},
+}
+
+@TECHREPORT{Bengio-NLMP-TR-2005,
+       author = {Bengio, Yoshua and Larochelle, Hugo},
+        title = {Non-Local Manifold Parzen Windows},
+       number = {1264},
+         year = {2005},
+  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/NLMP-techreport.pdf},
+     abstract = {In order to escape from the curse of dimensionality, we claim that one can learn non-local functions, in the sense that the value and shape of the learned function at x must be inferred using examples that may be far from x. With this objective, we present a non-local non-parametric density estimator. It builds upon previously proposed Gaussian mixture models with regularized covariance matrices to take into account the local shape of the manifold. It also builds upon recent work on non-local estimators of the tangent plane of a manifold, which are able to generalize in places with little training data, unlike traditional, local, non-parametric models.},
+topics={HighDimensional,Kernel,Unsupervised},cat={T},
+}
+
+@INPROCEEDINGS{Bengio-nncm96,
+     author = {Bengio, Yoshua},
+     editor = {Weigend, A. S. and Abu-Mostafa, Y. S. and Refenes, A.-P. N.},
+      title = {Training A Neural Network with a Financial Criterion Rather than a Prediction Criterion},
+  booktitle = {Proceedings of the Fourth International Conference on Neural Networks in the Capital Markets ({NNCM}-96)},
+       year = {1997},
+      pages = {433--443},
+  publisher = {World Scientific},
+        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/nncm.pdf},
+   abstract = {A common approach to quantitative decision taking with financial time-series is to train a model using a prediction criterion (e.g., squared error). We find on a portfolio selection problem that better results can be obtained when the model is directly trained in order to optimize the financial criterion of interest, with a differentiable decision module.},
+topics={Finance,PriorKnowledge,Discriminant},cat={C},
+}
+
+@TECHREPORT{Bengio-NonStat-Hyper-TR,
+       author = {Bengio, Yoshua and Dugas, Charles},
+        title = {Learning Simple Non-Stationarities with Hyper-Parameters},
+       number = {1145},
+         year = {1999},
+  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/nonstatTR.pdf},
+     abstract = {We consider sequential data that is sampled from an unknown process, so that the data are not necessarily i.i.d. Most approaches to machine learning assume that data points are i.i.d. Instead we consider a measure of generalization that does not make this assumption, and we consider in this context a recently proposed approach to optimizing hyper-parameters, based on the computation of the gradient of a model selection criterion with respect to hyper-parameters. Here we use hyper-parameters that control a function that gives different weights to different time steps in the historical data sequence. The approach is successfully applied to modeling the volatility of stock returns one month ahead. Comparative experiments with more traditional methods are presented.},
+topics={ModelSelection,Finance},cat={T},
+}
+
+@ARTICLE{Bengio-scholarpedia-2007,
+    author = {Bengio, Yoshua},
+     title = {Neural net language models},
+   journal = {Scholarpedia},
+    volume = {3},
+    number = {1},
+      year = {2008},
+     pages = {3881},
+  abstract = {A language model is a function, or an algorithm for learning such a function, that captures the salient statistical characteristics of the distribution of sequences of words in a natural language, typically allowing one to make probabilistic predictions of the next word given preceding ones.
+
+A neural network language model is a language model based on Neural Networks, exploiting their ability to learn distributed representations to reduce the impact of the curse of dimensionality.
+
+In the context of learning algorithms, the curse of dimensionality refers to the need for huge numbers of training examples when learning highly complex functions. When the number of input variables increases, the number of required examples can grow exponentially. The curse of dimensionality arises when a huge number of different combinations of values of the input variables must be discriminated from each other, and the learning algorithm needs at least one example per relevant combination of values. In the context of language models, the problem comes from the huge number of possible sequences of words, e.g., with a sequence of 10 words taken from a vocabulary of 100,000 there are $10^{50}$ possible sequences...
+
+A distributed representation of a symbol is a tuple (or vector) of features which characterize the meaning of the symbol, and are not mutually exclusive. If a human were to choose the features of a word, he might pick grammatical features like gender or plurality, as well as semantic features like ``animate'' or ``invisible''. With a neural network language model, one relies on the learning algorithm to discover these features, and the features are continuous-valued (making the optimization problem involved in learning much simpler).
+
+The basic idea is to learn to associate each word in the dictionary with a continuous-valued vector representation. Each word corresponds to a point in a feature space. One can imagine that each dimension of that space corresponds to a semantic or grammatical characteristic of words. The hope is that functionally similar words get to be closer to each other in that space, at least along some directions. A sequence of words can thus be transformed into a sequence of these learned feature vectors. The neural network learns to map that sequence of feature vectors to a prediction of interest, such as the probability distribution over the next word in the sequence. What pushes the learned word features to correspond to a form of semantic and grammatical similarity is that when two words are functionally similar, they can be replaced by one another in the same context, helping the neural network to compactly represent a function that makes good predictions on the training set, the set of word sequences used to train the model.
+
+The advantage of this distributed representation approach is that it allows the model to generalize well to sequences that are not in the set of training word sequences, but that are similar in terms of their features, i.e., their distributed representation. Because neural networks tend to map nearby inputs to nearby outputs, the predictions corresponding to word sequences with similar features are mapped to similar predictions. Because many different combinations of feature values are possible, a very large set of possible meanings can be represented compactly, allowing a model with a comparatively small number of parameters to fit a large training set.}
+}
+
+@TECHREPORT{Bengio-TR1312,
+       author = {Bengio, Yoshua},
+        title = {Learning deep architectures for {AI}},
+       number = {1312},
+         year = {2007},
+  institution = {Dept. IRO, Universit{\'{e}} de Montr{\'{e}}al},
+         note = {Preliminary version of journal article with the same title appearing in Foundations and Trends in Machine Learning (2009)},
+          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TR1312.pdf},
+     abstract = {Theoretical results strongly suggest that in order to learn the kind of
+complicated functions that can represent high-level abstractions (e.g. in
+vision, language, and other AI-level tasks), one may need deep
+architectures. Deep architectures are composed of multiple levels of non-linear
+operations, such as in neural nets with many hidden layers. Searching the
+parameter space of deep architectures is a difficult optimization task, but
+learning algorithms such as those for Deep Belief Networks have recently been proposed
+to tackle this problem with notable success, beating the state-of-the-art
+in certain areas. This paper discusses the motivations and principles regarding 
+learning algorithms for deep architectures and in particular for those based
+on unsupervised learning such as Deep Belief Networks, using as building
+blocks single-layer models such as Restricted {Boltzmann} Machines.}
+}
+
+@ARTICLE{Bengio-trnn94,
+    author = {Bengio, Yoshua and Simard, Patrice and Frasconi, Paolo},
+     title = {Learning Long-Term Dependencies with Gradient Descent is Difficult},
+   journal = {IEEE Transactions on Neural Networks},
+    volume = {5},
+    number = {2},
+      year = {1994},
+     pages = {157--166},
+  abstract = {Recurrent neural networks can be used to map input sequences to output sequences, such as for recognition, production or prediction problems. However, practical difficulties have been reported in training recurrent neural networks to perform tasks in which the temporal contingencies present in the input/output sequences span long intervals. We show why gradient based learning algorithms face an increasingly difficult problem as the duration of the dependencies to be captured increases. These results expose a trade-off between efficient learning by gradient descent and latching on information for long periods. Based on an understanding of this problem, alternatives to standard gradient descent are considered.},
+optnote={(Special Issue on Recurrent Neural Networks)},topics={LongTerm},cat={J},
+}
+
+@INPROCEEDINGS{Bengio-wirn93,
+     author = {Bengio, Yoshua and Frasconi, Paolo and Gori, Marco and Soda, G.},
+     editor = {Caianello, E.},
+      title = {Recurrent Neural Networks for Adaptive Temporal Processing},
+  booktitle = {Proc. of the 6th Italian Workshop on Neural Networks, WIRN-93},
+       year = {1993},
+      pages = {1183--1195},
+  publisher = {World Scientific Publ.},
+        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/rnn_review93.ps},
+topics={LongTerm},cat={C},
+}
+
+@ARTICLE{Bengio2000c,
+    author = {Bengio, Yoshua},
+     title = {Gradient-Based Optimization of Hyperparameters},
+   journal = {Neural Computation},
+    volume = {12},
+    number = {8},
+      year = {2000},
+     pages = {1889--1900},
+  abstract = {Many machine learning algorithms can be formulated as the minimization of a training criterion which involves a hyper-parameter. This hyper-parameter is usually chosen by trial and error with a model selection criterion. In this paper we present a methodology to optimize several hyper-parameters, based on the computation of the gradient of a model selection criterion with respect to the hyper-parameters. In the case of a quadratic training criterion, the gradient of the selection criterion with respect to the hyper-parameters is efficiently computed by back-propagating through a Cholesky decomposition. In the more general case, we show that the implicit function theorem can be used to derive a formula for the hyper-parameter gradient involving second derivatives of the training criterion.},
+topics={ModelSelection},cat={J},
+}
+
+@ARTICLE{Bengio89a,
+    author = {Bengio, Yoshua and Cardin, Regis and De Mori, Renato and Merlo, Ettore},
+     title = {Programmable execution of multi-layered networks for automatic speech recognition},
+   journal = {Communications of the Association for Computing Machinery},
+    volume = {32},
+    number = {2},
+      year = {1989},
+     pages = {195--199},
+topics={Speech},cat={J},
+}
+
+@INPROCEEDINGS{Bengio89c,
+    author = {Bengio, Yoshua and Cosi, Piero and Cardin, Regis and De Mori, Renato},
+     title = {Use of multi-layered networks for coding speech with phonetic features},
+      year = {1989},
+     pages = {224--231},
+  crossref = {NIPS1-shorter},
+  abstract = {Preliminary results on speaker-independent speech recognition are reported. A method that combines expertise on neural networks with expertise on speech recognition is used to build the recognition systems. For transient sounds, event-driven property extractors with variable resolution in the time and frequency domains are used. For sonorant speech, a model of the human auditory system is preferred to FFT as a front-end module.},
+topics={Speech},cat={C},
+}
+
+@INPROCEEDINGS{Bengio89d,
+     author = {De Mori, Renato and Bengio, Yoshua and Cosi, Piero},
+      title = {On the generalization capability of multilayered networks in the extraction of speech properties},
+  booktitle = {Proceedings of the International Joint Conference on Artificial Intelligence},
+       year = {1989},
+      pages = {1531--1536},
+  publisher = {IEEE},
+topics={Speech},cat={C},
+}
+
+@INPROCEEDINGS{Bengio90,
+    author = {Bengio, Yoshua and Cardin, Regis and De Mori, Renato},
+     title = {Speaker Independent Speech Recognition with Neural Networks and Speech Knowledge},
+      year = {1990},
+     pages = {218--225},
+  crossref = {NIPS2-shorter},
+  abstract = {We attempt to combine neural networks with knowledge from speech science to build a speaker independent speech recognition system. This knowledge is utilized in designing the preprocessing, input coding, output coding, output supervision and architectural constraints. To handle the temporal aspect of speech we combine delays, copies of activations of hidden and output units at the input level, and Back-Propagation for Sequences (BPS), a learning algorithm for networks with local self-loops. This strategy is demonstrated in several experiments, in particular a nasal discrimination task for which the application of a speech theory hypothesis dramatically improved generalization.},
+topics={PriorKnowledge,Speech},cat={C},
+}
+
+@INCOLLECTION{Bengio90b,
+     author = {Bengio, Yoshua},
+      title = {Radial Basis Functions for speech recognition},
+  booktitle = {Speech Recognition and Understanding: Recent Advances, Trends and Applications},
+       year = {1990},
+      pages = {293--298},
+  publisher = {NATO Advanced Study Institute Series F: Computer and Systems Sciences},
+topics={Kernel,Speech},cat={B},
+}
+
+@INCOLLECTION{Bengio90c,
+     author = {Bengio, Yoshua and De Mori, Renato},
+     editor = {{Fogelman Soulie}, F. and Herault, J.},
+      title = {Speech coding with multilayer networks},
+  booktitle = {Neurocomputing: Algorithms, Architectures and Applications},
+       year = {1990},
+      pages = {207--216},
+  publisher = {NATO Advanced Study Institute Series F: Computer and Systems Sciences},
+topics={Speech},cat={B},
+}
+
+@INPROCEEDINGS{Bengio90e,
+    author = {Bengio, Yoshua and Pouliot, Yannick and Bengio, Samy and Agin, Patrick},
+     title = {A neural network to detect homologies in proteins},
+      year = {1990},
+     pages = {423--430},
+  crossref = {NIPS2-shorter},
+  abstract = {In order to detect the presence and location of immunoglobulin (Ig) domains from amino acid sequences we built a system based on a neural network with one hidden layer trained with back propagation. The program was designed to efficiently identify proteins exhibiting such domains, characterized by a few localized conserved regions and a low overall homology. When the National Biomedical Research Foundation (NBRF) NEW protein sequence database was scanned to evaluate the program's performance, we obtained very low rates of false negatives coupled with a moderate rate of false positives.},
+topics={Bioinformatic,PriorKnowledge},cat={C},
+}
+
+@INPROCEEDINGS{Bengio90z,
+     author = {Bengio, Yoshua and De Mori, Renato and Gori, Marco},
+     editor = {Caianello, E.},
+      title = {Experiments on automatic speech recognition using BPS},
+  booktitle = {Parallel Architectures and Neural Networks},
+       year = {1990},
+      pages = {223--232},
+  publisher = {World Scientific Publ.},
+topics={Speech},cat={C},
+}
+
+@INPROCEEDINGS{Bengio91a,
+     author = {Bengio, Yoshua and De Mori, Renato and Flammia, Giovanni and Kompe, Ralf},
+      title = {A comparative study of hybrid acoustic phonetic decoders based on artificial neural networks},
+  booktitle = {Proceedings of EuroSpeech'91},
+       year = {1991},
+topics={PriorKnowledge,Speech},cat={C},
+}
+
+@INPROCEEDINGS{Bengio91b,
+     author = {Bengio, Yoshua and De Mori, Renato and Flammia, Giovanni and Kompe, Ralf},
+      title = {Global Optimization of a Neural Network - Hidden {Markov} Model Hybrid},
+  booktitle = {Proceedings of EuroSpeech'91},
+       year = {1991},
+topics={Markov},cat={C},
+}
+
+@INPROCEEDINGS{Bengio91z,
+     author = {Bengio, Yoshua and De Mori, Renato and Flammia, Giovanni and Kompe, Ralf},
+      title = {Phonetically motivated acoustic parameters for continuous speech recognition using artificial neural networks},
+  booktitle = {Proceedings of EuroSpeech'91},
+       year = {1991},
+cat={C},
+}
+
+@ARTICLE{Bengio92b,
+    author = {Bengio, Yoshua and De Mori, Renato and Flammia, Giovanni and Kompe, Ralf},
+     title = {Phonetically motivated acoustic parameters for continuous speech recognition using artificial neural networks},
+   journal = {Speech Communication},
+    volume = {11},
+    number = {2--3},
+      year = {1992},
+     pages = {261--271},
+      note = {Special issue on neurospeech},
+topics={PriorKnowledge,Speech},cat={J},
+}
+
+@INPROCEEDINGS{Bengio92c,
+    author = {Bengio, Yoshua and De Mori, Renato and Flammia, Giovanni and Kompe, Ralf},
+     title = {Neural Network - Gaussian Mixture Hybrid for Speech Recognition or Density Estimation},
+      year = {1992},
+     pages = {175--182},
+  crossref = {NIPS4-shorter},
+  abstract = {The subject of this paper is the integration of multi-layered Artificial Neural Networks ({ANN}) with probability density functions such as Gaussian mixtures found in continuous density hidden {Markov} Models ({HMM}). In the first part of this paper we present an {ANN}/{HMM} hybrid in which all the parameters of the system are simultaneously optimized with respect to a single criterion. In the second part of this paper, we study the relationship between the density of the inputs of the network and the density of the outputs of the networks. A few experiments are presented to explore how to perform density estimation with {ANN}s.},
+topics={Speech},cat={C},
+}
+
+@INPROCEEDINGS{Bengio94d,
+     author = {Frasconi, Paolo and Bengio, Yoshua},
+      title = {An {EM} Approach to Grammatical Inference: Input/Output {HMMs}},
+  booktitle = {International Conference on Pattern Recognition (ICPR'94)},
+       year = {1994},
+      pages = {289--294},
+topics={Markov,LongTerm},cat={C},
+}
+
+@ARTICLE{Bengio96,
+    author = {Bengio, Yoshua and Frasconi, Paolo},
+     title = {Input/{Output} {HMM}s for Sequence Processing},
+   journal = {IEEE Transactions on Neural Networks},
+    volume = {7},
+    number = {5},
+      year = {1996},
+     pages = {1231--1249},
+  abstract = {We consider problems of sequence processing and propose a solution based on a discrete state model in order to represent past context. We introduce a recurrent connectionist architecture having a modular structure that associates a subnetwork to each state. The model has a statistical interpretation we call Input/Output Hidden {Markov} Model ({IOHMM}). It can be trained by the {EM} or {GEM} algorithms, considering state trajectories as missing data, which decouples temporal credit assignment and actual parameter estimation.
+The model presents similarities to hidden {Markov} models ({HMM}s), but allows us to map input sequences to output sequences, using the same processing style as recurrent neural networks. {IOHMM}s are trained using a more discriminant learning paradigm than {HMM}s, while potentially taking advantage of the {EM} algorithm.
+We demonstrate that {IOHMM}s are well suited for solving grammatical inference problems on a benchmark problem. Experimental results are presented for the seven Tomita grammars, showing that these adaptive models can attain excellent generalization.},
+topics={Markov},cat={J},
+}
+
+@TECHREPORT{Bengio96-hmmsTR,
+       author = {Bengio, Yoshua},
+        title = {Markovian Models for Sequential Data},
+       number = {1049},
+         year = {1996},
+  institution = {D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/hmmsTR.pdf},
+     abstract = {Hidden {Markov} Models ({HMM}s) are statistical models of sequential data that have been used successfully in many applications, especially for speech recognition. We first summarize the basics of {HMM}s, and then review several recent related learning algorithms and extensions of {HMM}s, including hybrids of {HMM}s with artificial neural networks, Input-Output {HMM}s, weighted transducers, variable-length {Markov} models and {Markov} switching state-space models. Finally, we discuss some of the challenges of future research in this area.},
+topics={Markov},cat={T},
+}
+
+@ARTICLE{Bengio97,
+    author = {Bengio, Yoshua},
+     title = {Using a Financial Training Criterion Rather than a Prediction Criterion},
+   journal = {International Journal of Neural Systems},
+    volume = {8},
+    number = {4},
+      year = {1997},
+     pages = {433--443},
+      note = {Special issue on noisy time-series},
+  abstract = {The application of this work is to decision taking with financial time-series, using learning algorithms. The traditional approach is to train a model using a prediction criterion, such as minimizing the squared error between predictions and actual values of a dependent variable, or maximizing the likelihood of a conditional model of the dependent variable. We find here with noisy time-series that better results can be obtained when the model is directly trained in order to maximize the financial criterion of interest, here gains and losses (including those due to transactions) incurred during trading. Experiments were performed on portfolio selection with 35 Canadian stocks.},
+topics={Finance,PriorKnowledge,Discriminant},cat={J},
+}
+
+@ARTICLE{Bengio99a,
+    author = {Bengio, Yoshua},
+     title = {Markovian Models for Sequential Data},
+   journal = {Neural Computing Surveys},
+    volume = {2},
+      year = {1999},
+     pages = {129--162},
+  abstract = {Hidden {Markov} Models ({HMM}s) are statistical models of sequential data that have been used successfully in many machine learning applications, especially for speech recognition. Furthermore, in the last few years, many new and promising probabilistic models related to {HMM}s have been proposed. We first summarize the basics of {HMM}s, and then review several recent related learning algorithms and extensions of {HMM}s, including in particular hybrids of {HMM}s with artificial neural networks, Input-Output {HMM}s (which are conditional {HMM}s using neural networks to compute probabilities), weighted transducers, variable-length {Markov} models and {Markov} switching state-space models. Finally, we discuss some of the challenges of future research in this very active area.},
+topics={Markov},cat={J},
+}
+
+@ARTICLE{Bengio99b,
+    author = {Bengio, Samy and Bengio, Yoshua and Robert, Jacques and B{\'{e}}langer, Gilles},
+     title = {Stochastic Learning of Strategic Equilibria for Auctions},
+   journal = {Neural Computation},
+    volume = {11},
+    number = {5},
+      year = {1999},
+     pages = {1199--1209},
+  abstract = {This paper presents a new application of stochastic adaptive learning algorithms to the computation of strategic equilibria in auctions. The proposed approach addresses the problems of tracking a moving target and balancing exploration (of action space) versus exploitation (of better modeled regions of action space). Neural networks are used to represent a stochastic decision model for each bidder. Experiments confirm the correctness and usefulness of the approach.},
+topics={Auction},cat={J},
+}
+
+@TECHREPORT{bengio:1990,
+       author = {Bengio, Yoshua},
+        title = {Learning a Synaptic Learning Rule},
+       number = {751},
+         year = {1990},
+  institution = {D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+topics={BioRules},cat={T},
+}
+
+@INPROCEEDINGS{bengio:1990:snowbird,
+     author = {Bengio, Yoshua and De Mori, Renato},
+      title = {Recurrent networks with Radial Basis Functions for speech recognition},
+  booktitle = {1990 Neural Networks for Computing Conference},
+       year = {1990},
+topics={Speech},cat={C},
+}
+
+@INPROCEEDINGS{bengio:1991:ijcnn,
+     author = {Bengio, Yoshua and Bengio, Samy and Cloutier, Jocelyn},
+      title = {Learning a Synaptic Learning Rule},
+  booktitle = {Proceedings of the International Joint Conference on Neural Networks},
+       year = {1991},
+      pages = {II--A969},
+        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio_1991_ijcnn.ps},
+   abstract = {This paper presents an original approach to neural modeling based on the idea of searching, with learning methods, for a synaptic learning rule which is biologically plausible, and yields networks that are able to learn to perform difficult tasks. The proposed method of automatically finding the learning rule relies on the idea of considering the synaptic modification rule as a parametric function. This function has local inputs and is the same in many neurons. The parameters that define this function can be estimated with known learning methods. For this optimization, we give particular attention to gradient descent and genetic algorithms. In both cases, estimation of this function consists of a joint global optimization of (a) the synaptic modification function, and (b) the networks that are learning to perform some tasks. The proposed methodology can be used as a tool to explore the missing pieces of the puzzle of neural networks learning. Both network architecture, and the learning function can be designed within constraints derived from biological knowledge.},
+addressfr={Seattle, USA},topics={BioRules},cat={C},
+}
+
+@INPROCEEDINGS{bengio:1991:nnc,
+     author = {Bengio, Yoshua and Bengio, Samy and Cloutier, Jocelyn},
+      title = {Learning Synaptic Learning Rules},
+  booktitle = {Neural Networks for Computing},
+       year = {1991},
+addressfr={Snowbird, Utah, USA},topics={BioRules},cat={C},
+}
+
+@INPROCEEDINGS{bengio:1991:snowbird,
+     author = {Bengio, Yoshua and Bengio, Samy and Cloutier, Jocelyn},
+      title = {Learning a Synaptic Learning Rule},
+  booktitle = {1991 Neural Networks for Computing Conference},
+       year = {1991},
+topics={BioRules},cat={C},
+}
+
+@INPROCEEDINGS{bengio:1992:nn,
+     author = {Bengio, Samy and Bengio, Yoshua and Cloutier, Jocelyn and Gecsei, Jan},
+      title = {Aspects th{\'{e}}oriques de l'optimisation d'une r{\`{e}}gle d'apprentissage},
+  booktitle = {Actes de la conf{\'{e}}rence Neuro-N{\^{\i}}mes 1992},
+       year = {1992},
+        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio_1992_nn.ps},
+   abstract = {Ayant expos{\'{e}} dans de pr{\'{e}}c{\'{e}}dentes publications (voir [Beng90, Beng92] notamment) l'id{\'{e}}e que l'on pouvait optimiser des r{\`{e}}gles d'apprentissage param{\'{e}}triques pour r{\'{e}}seaux de neurones, nous montrons dans cet article comment d{\'{e}}velopper, par la m{\'{e}}thode du Lagrangien, le gradient n{\'{e}}cessaire {\`{a}} l'optimisation d'une r{\`{e}}gle d'apprentissage par descente du gradient. Nous pr{\'{e}}sentons aussi les bases th{\'{e}}oriques qui permettent d'{\'{e}}tudier la g{\'{e}}n{\'{e}}ralisation {\`{a}} de nouvelles t{\^{a}}ches d'une r{\`{e}}gle d'apprentissage dont les param{\`{e}}tres ont {\'{e}}t{\'{e}} estim{\'{e}}s {\`{a}} partir d'un certain ensemble de t{\^{a}}ches. Enfin, nous exposons bri{\`{e}}vement les r{\'{e}}sultats d'une exp{\'{e}}rience consistant {\`{a}} trouver, par descente du gradient, une r{\`{e}}gle d'apprentissage pouvant r{\'{e}}soudre plusieurs t{\^{a}}ches bool{\'{e}}ennes lin{\'{e}}airement et non lin{\'{e}}airement s{\'{e}}parables.},
+addressfr={N{\^{\i}}mes, France},topics={BioRules},cat={C},
+}
+
+@INPROCEEDINGS{bengio:1992:oban,
+     author = {Bengio, Samy and Bengio, Yoshua and Cloutier, Jocelyn and Gecsei, Jan},
+      title = {On the Optimization of a Synaptic Learning rule},
+  booktitle = {Conference on Optimality in Biological and Artificial Networks},
+       year = {1992},
+        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio_1992_oban.ps},
+   abstract = {This paper presents a new approach to neural modeling based on the idea of using an automated method to optimize the parameters of a synaptic learning rule. The synaptic modification rule is considered as a parametric function. This function has local inputs and is the same in many neurons. We can use standard optimization methods to select appropriate parameters for a given type of task. We also present a theoretical analysis permitting to study the generalization property of such parametric learning rules. By generalization, we mean the possibility for the learning rule to learn to solve new tasks. Experiments were performed on three types of problems: a biologically inspired circuit (for conditioning in Aplysia), Boolean functions (linearly separable as well as non linearly separable) and classification tasks. The neural network architecture as well as the form and initial parameter values of the synaptic learning function can be designed using a priori knowledge.},
+addressfr={Dallas, USA},topics={BioRules},cat={C},
+}
+
+@INPROCEEDINGS{bengio:1992:snowbird,
+     author = {Bengio, Yoshua},
+      title = {Representations Based on Articulatory Dynamics for Speech Recognition},
+  booktitle = {1992 Neural Networks for Computing Conference},
+       year = {1992},
+topics={PriorKnowledge,Speech},cat={C},
+}
+
+@INPROCEEDINGS{bengio:1993:icann,
+     author = {Bengio, Samy and Bengio, Yoshua and Cloutier, Jocelyn and Gecsei, Jan},
+     editor = {Gielen, S. and Kappen, B.},
+      title = {Generalization of a Parametric Learning Rule},
+  booktitle = {{ICANN} '93: Proceedings of the International Conference on Artificial Neural Networks},
+       year = {1993},
+      pages = {502},
+  publisher = {Springer-Verlag},
+        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio_1993_icann.ps},
+   abstract = {In previous work ([4,2,1]), we discussed the subject of parametric learning rules for neural networks. In this article, we present a theoretical basis permitting to study the generalization property of a learning rule whose parameters are estimated from a set of learning tasks. By generalization, we mean the possibility of using the learning rule to learn to solve new tasks. Finally, we describe simple experiments on two-dimensional categorization tasks and show how they corroborate the theoretical results.},
+addressfr={Amsterdam, Pays-Bas},topics={BioRules},cat={C},
+}
+
+@INPROCEEDINGS{bengio:1993:snowbird,
+     author = {Bengio, Yoshua and Simard, Patrice and Frasconi, Paolo},
+      title = {The Problem of Learning Long-Term Dependencies in Recurrent Networks},
+  booktitle = {1993 Neural Networks for Computing Conference},
+       year = {1993},
+topics={LongTerm},cat={C},
+}
+
+@TECHREPORT{bengio:1994,
+       author = {Bengio, Yoshua and Frasconi, Paolo},
+        title = {An {EM} Approach to Learning Sequential Behavior},
+       number = {DSI 11-94},
+         year = {1994},
+  institution = {Universit{\`{a}} di Firenze, Dipartimento di Sistemi e Informatica},
+topics={LongTerm},cat={T},
+}
+
+@INPROCEEDINGS{bengio:1994:acfas,
+     author = {Bengio, Samy and Bengio, Yoshua and Cloutier, Jocelyn and Gecsei, Jan},
+      title = {Optimisation d'une r{\`{e}}gle d'apprentissage pour r{\'{e}}seaux de neurones artificiels},
+  booktitle = {Actes du soixante-deuxi{\`{e}}me congr{\`{e}}s de l'Association Canadienne Fran{\c c}aise pour l'Avancement des Sciences, colloque sur l'apprentissage et les r{\'{e}}seaux de neurones artificiels},
+       year = {1994},
+topics={BioRules},cat={C},
+}
+
+@INPROCEEDINGS{bengio:1994:snowbird,
+     author = {Bengio, Yoshua and Frasconi, Paolo},
+      title = {An {EM} Algorithm for Target Propagation},
+  booktitle = {1994 Neural Networks for Computing Conference},
+       year = {1994},
+topics={LongTerm},cat={C},
+}
+
+@INPROCEEDINGS{bengio:1994:wcci,
+     author = {Bengio, Samy and Bengio, Yoshua and Cloutier, Jocelyn},
+      title = {Use of Genetic Programming for the Search of a New Learning Rule for Neural Networks},
+  booktitle = {Proceedings of the First Conference on Evolutionary Computation, {IEEE} World Congress on Computational Intelligence},
+       year = {1994},
+      pages = {324--327},
+        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio_1994_wcci.ps},
+   abstract = {In previous work ([1,2,3]), we explained how to use standard optimization methods such as simulated annealing, gradient descent and genetic algorithms to optimize a parametric function which could be used as a learning rule for neural networks. To use these methods, we had to choose a fixed number of parameters and a rigid form for the learning rule. In this article, we propose to use genetic programming to find not only the values of rule parameters but also the optimal number of parameters and the form of the rule. Experiments on classification tasks suggest genetic programming finds better learning rules than other optimization methods. Furthermore, the best rule found with genetic programming outperformed the well-known backpropagation algorithm for a given set of tasks.},
+topics={BioRules},cat={C},
+}
+
+@INPROCEEDINGS{bengio:1994b:acfas,
+     author = {Bengio, Yoshua and Frasconi, Paolo},
+      title = {R{\'{e}}seaux de neurones {M}arkoviens pour l'inf{\'{e}}rence grammaticale},
+  booktitle = {Actes du soixante-deuxi{\`{e}}me congr{\`{e}}s de l'Association Canadienne Fran{\c c}aise pour l'Avancement des Sciences, colloque sur l'apprentissage et les r{\'{e}}seaux de neurones artificiels},
+       year = {1994},
+topics={Markov,Language},cat={C},
+}
+
+@ARTICLE{bengio:1995:npl,
+    author = {Bengio, Samy and Bengio, Yoshua and Cloutier, Jocelyn},
+     title = {On the Search for New Learning Rules for {ANN}s},
+   journal = {Neural Processing Letters},
+    volume = {2},
+    number = {4},
+      year = {1995},
+     pages = {26--30},
+  abstract = {In this paper, we present a framework where a learning rule can be optimized within a parametric learning rule space. We define what we call parametric learning rules and present a theoretical study of their generalization properties when estimated from a set of learning tasks and tested over another set of tasks. We corroborate the results of this study with practical experiments.},
+topics={BioRules},cat={J},
+}
+
+@INCOLLECTION{bengio:1995:oban,
+     author = {Bengio, Samy and Bengio, Yoshua and Cloutier, Jocelyn and Gecsei, Jan},
+     editor = {Levine, D. S. and Elsberry, W. R.},
+      title = {{O}n the Optimization of a Synaptic Learning Rule},
+  booktitle = {Optimality in Biological and Artificial Networks},
+       year = {1995},
+  publisher = {Lawrence Erlbaum},
+        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio_1995_oban.pdf},
+   abstract = {This paper presents a new approach to neural modeling based on the idea of using an automated method to optimize the parameters of a synaptic learning rule. The synaptic modification rule is considered as a parametric function. This function has local inputs and is the same in many neurons. We can use standard optimization methods to select appropriate parameters for a given type of task. We also present a theoretical analysis permitting to study the generalization property of such parametric learning rules. By generalization, we mean the possibility for the learning rule to learn to solve new tasks. Experiments were performed on three types of problems: a biologically inspired circuit (for conditioning in Aplysia), Boolean functions (linearly separable as well as non linearly separable) and classification tasks. The neural network architecture as well as the form and initial parameter values of the synaptic learning function can be designed using a priori knowledge.},
+topics={BioRules},cat={B},
+}
+
+@TECHREPORT{bengio:1996:udem,
+       author = {Bengio, Yoshua and Bengio, Samy},
+        title = {Training Asynchronous Input/Output Hidden {M}arkov Models},
+       number = {1013},
+         year = {1996},
+  institution = {D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio_1996_udem.ps},
+topics={Markov},cat={T},
+}
+
+@INPROCEEDINGS{bengio:1997:snowbird,
+     author = {Bengio, Yoshua and Bengio, Samy and Singer, Yoram and Isabelle, Jean-Fran{\c c}ois},
+      title = {On the Clusterization of Probabilistic Transducers},
+  booktitle = {1997 Neural Networks for Computing Conference},
+       year = {1997},
+topics={HighDimensional},cat={C},
+}
+
+@INPROCEEDINGS{bengio:1998:snowbird,
+     author = {Bengio, Samy and Bengio, Yoshua and Robert, Jacques and B{\'{e}}langer, Gilles},
+      title = {Stochastic Learning of Strategic Equilibria for Auctions},
+  booktitle = {Learning Conference},
+       year = {1998},
+topics={Auction},cat={C},
+}
+
+@TECHREPORT{bengio:1998:udem,
+       author = {Bengio, Samy and Bengio, Yoshua and Robert, Jacques and B{\'{e}}langer, Gilles},
+        title = {Stochastic Learning of Strategic Equilibria for Auctions},
+       number = {1119},
+         year = {1998},
+  institution = {D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio_1998_udem.pdf},
+     abstract = {This paper presents a new application of stochastic adaptive learning algorithms to the computation of strategic equilibria in auctions. The proposed approach addresses the problems of tracking a moving target and balancing exploration (of action space) versus exploitation (of better modeled regions of action space). Neural networks are used to represent a stochastic decision model for each bidder. Experiments confirm the correctness and usefulness of the approach.},
+topics={Auction},cat={T},
+}
+
+@INPROCEEDINGS{bengio:1999:snowbird,
+     author = {Bengio, Yoshua and Latendresse, Simon and Dugas, Charles},
+      title = {Gradient-Based Learning of Hyper-Parameters},
+  booktitle = {Learning Conference},
+       year = {1999},
+topics={ModelSelection},cat={C},
+}
+
+@INPROCEEDINGS{bengio:1999:titration,
+     author = {Bengio, Yoshua and Brault, J-J. and Major, Fran{\c c}ois and Neal, R. and Pigeon, Steven},
+      title = {Learning Algorithms for Sorting Compounds from Titration Curves},
+  booktitle = {Symposium on New Perspectives for Computer-Aided Drug Design},
+       year = {1999},
+topics={Speech},cat={C},
+}
+
+@ARTICLE{bengio:2000:ieeetrnn,
+    author = {Bengio, Samy and Bengio, Yoshua},
+     title = {Taking on the Curse of Dimensionality in Joint Distributions Using Neural Networks},
+   journal = {IEEE Transactions on Neural Networks special issue on data mining and knowledge discovery},
+    volume = {11},
+    number = {3},
+      year = {2000},
+     pages = {550--557},
+  abstract = {The curse of dimensionality is severe when modeling high-dimensional discrete data: the number of possible combinations of the variables explodes exponentially. In this paper we propose a new architecture for modeling high-dimensional data that requires resources (parameters and computations) that grow at most as the square of the number of variables, using a multi-layer neural network to represent the joint distribution of the variables as the product of conditional distributions. The neural network can be interpreted as a graphical model without hidden random variables, but in which the conditional distributions are tied through the hidden units. The connectivity of the neural network can be pruned by using dependency tests between the variables (thus reducing significantly the number of parameters). Experiments on modeling the distribution of several discrete data sets show statistically significant improvements over other methods such as naive Bayes and comparable Bayesian networks, and show that significant improvements can be obtained by pruning the network.},
+topics={HighDimensional,Unsupervised,Mining},cat={J},
+}
+
+@INPROCEEDINGS{bengio:2000:nips,
+    author = {Bengio, Yoshua and Bengio, Samy},
+     title = {Modeling High-Dimensional Discrete Data with Multi-Layer Neural Networks},
+      year = {2000},
+     pages = {400--406},
+  crossref = {NIPS12-shorter},
+  abstract = {The curse of dimensionality is severe when modeling high-dimensional discrete data: the number of possible combinations of the variables explodes exponentially. In this paper we propose a new architecture for modeling high-dimensional data that requires resources (parameters and computations) that grow only at most as the square of the number of variables, using a multi-layer neural network to represent the joint distribution of the variables as the product of conditional distributions. The neural network can be interpreted as a graphical model without hidden random variables, but in which the conditional distributions are tied through the hidden units. The connectivity of the neural network can be pruned by using dependency tests between the variables. Experiments on modeling the distribution of several discrete data sets show statistically significant improvements over other methods such as naive Bayes and comparable Bayesian networks, and show that significant improvements can be obtained by pruning the network.},
+topics={HighDimensional,Unsupervised},cat={C},
+}
+
+@ARTICLE{bengio:2003,
+    author = {Bengio, Yoshua and Ducharme, R{\'{e}}jean and Vincent, Pascal and Jauvin, Christian},
+     title = {A Neural Probabilistic Language Model},
+    volume = {3},
+      year = {2003},
+     pages = {1137--1155},
+  crossref = {JMLR-shorter},
+  abstract = {A goal of statistical language modeling is to learn the joint probability function of sequences of words in a language. This is intrinsically difficult because of the curse of dimensionality: a word sequence on which the model will be tested is likely to be different from all the word sequences seen during training. Traditional but very successful approaches based on n-grams obtain generalization by concatenating very short overlapping sequences seen in the training set. We propose to fight the curse of dimensionality by learning a distributed representation for words which allows each training sentence to inform the model about an exponential number of semantically neighboring sentences. The model learns simultaneously (1) a distributed representation for each word along with (2) the probability function for word sequences, expressed in terms of these representations. Generalization is obtained because a sequence of words that has never been seen before gets high probability if it is made of words that are similar (in the sense of having a nearby representation) to words forming an already seen sentence. Training such large models (with millions of parameters) within a reasonable time is itself a significant challenge. We report on experiments using neural networks for the probability function, showing on two text corpora that the proposed approach significantly improves on state-of-the-art n-gram models, and that the proposed approach allows to take advantage of longer contexts.},
+topics={Markov,Unsupervised,Language},cat={J},
+}
+
+@TECHREPORT{bengio:socs-1990,
+       author = {Bengio, Yoshua and De Mori, Renato and Flammia, Giovanni and Kompe, Ralf},
+        title = {Global Optimization of a Neural Network - Hidden {M}arkov Model Hybrid},
+       number = {TR-SOCS-90.22},
+         year = {1990},
+  institution = {School of Computer Science, McGill University},
+topics={Markov},cat={T},
+}
+
+@INPROCEEDINGS{bengioc:1994:acfas,
+     author = {Bengio, Yoshua and {LeCun}, Yann},
+      title = {Reconnaissance de mots manuscrits avec r{\'{e}}seaux de neurones et mod{\`{e}}les de {M}arkov},
+  booktitle = {Actes du soixante-deuxi{\`{e}}me congr{\`{e}}s de l'Association Canadienne Fran{\c c}aise pour l'Avancement des Sciences, colloque sur l'apprentissage et les r{\'{e}}seaux de neurones artificiels},
+       year = {1994},
+topics={Markov,Speech},cat={C},
+}
+
+@TECHREPORT{Bengio_Bottou92,
+       author = {Bengio, Yoshua and Bottou, L{\'{e}}on},
+        title = {A New Approach to Estimating Probability Density Functions with Artificial Neural Networks},
+       number = {TR-92.02},
+         year = {1992},
+  institution = {Massachusetts Institute of Technology, Dept. Brain and Cognitive Sciences},
+topics={HighDimensional},cat={T},
+}
+
+@INCOLLECTION{bengio_extension_nips_2003,
+    author = {Bengio, Yoshua and Paiement, Jean-Fran{\c c}ois and Vincent, Pascal and Delalleau, Olivier and Le Roux, Nicolas and Ouimet, Marie},
+  keywords = {dimensionality reduction, eigenfunctions learning, Isomap, kernel {PCA}, locally linear embedding, Nystrom formula, spectral methods},
+     title = {Out-of-Sample Extensions for {LLE}, Isomap, {MDS}, Eigenmaps, and Spectral Clustering},
+      year = {2004},
+  crossref = {NIPS16-shorter},
+  abstract = {Several unsupervised learning algorithms based on an eigendecomposition provide either an embedding or a clustering only for given training points, with no straightforward extension for out-of-sample examples short of recomputing eigenvectors. This paper provides a unified framework for extending Local Linear Embedding ({LLE}), Isomap, Laplacian Eigenmaps, Multi-Dimensional Scaling (for dimensionality reduction) as well as for Spectral Clustering. This framework is based on seeing these algorithms as learning eigenfunctions of a data-dependent kernel. Numerical experiments show that the generalizations performed have a level of error comparable to the variability of the embedding algorithms due to the choice of training data.},
+topics={HighDimensional,Kernel,Unsupervised},cat={C},
+}
+
+@ARTICLE{Bengio_Gingras98a,
+    author = {Bengio, Yoshua and Gingras, Fran{\c c}ois and Goulard, Bernard and Lina, Jean-Marc},
+     title = {Gaussian Mixture Densities for Classification of Nuclear Power Plant Data},
+   journal = {Computers and Artificial Intelligence},
+    volume = {17},
+    number = {2--3},
+      year = {1998},
+     pages = {189--209},
+  abstract = {In this paper we are concerned with the application of learning algorithms to the classification of reactor states in nuclear plants. Two aspects must be considered, (1) some types of events (e.g., abnormal or rare) will not appear in the data set, but the system should be able to detect them, (2) not only classification of signals but also their interpretation are important for nuclear plant monitoring. We address both issues with a mixture of mixtures of Gaussians in which some parameters are shared to reflect the similar signals observed in different states of the reactor. An {EM} algorithm for these shared Gaussian mixtures is presented. Experimental results on nuclear plant data demonstrate the advantages of the proposed approach with respect to the above two points.},
+topics={Mining},cat={J},
+}
+
+@ARTICLE{Bengio_Gingras98b,
+    author = {Gingras, Fran{\c c}ois and Bengio, Yoshua},
+     title = {Handling Asynchronous or Missing Financial Data with Recurrent Networks},
+   journal = {International Journal of Computational Intelligence and Organizations},
+    volume = {1},
+    number = {3},
+      year = {1998},
+     pages = {154--163},
+  abstract = {An important issue with many sequential data analysis problems, such as those encountered in financial data sets, is that different variables are known at different frequencies, at different times (asynchronicity), or are sometimes missing. To address this issue we propose to use recurrent networks with feedback into the input units, based on two fundamental ideas. The first motivation is that the ``filled-in'' value of the missing variable may not only depend in complicated ways on the value of this variable in the past of the sequence but also on the current and past values of other variables. The second motivation is that, for the purpose of making predictions or taking decisions, it is not always necessary to fill in the best possible value of the missing variables. In fact, it is sufficient to fill in a value which helps the system make better predictions or decisions. The advantages of this approach are demonstrated through experiments on several tasks.},
+topics={Finance,Missing},cat={J},
+}
+
+@INPROCEEDINGS{Bengio_icassp90,
+     author = {Bengio, Yoshua and Cardin, Regis and De Mori, Renato and Normandin, Yves},
+      title = {A Hybrid Coder for Hidden {M}arkov Models Using a Recurrent Neural Network},
+  booktitle = {International Conference on Acoustics, Speech and Signal Processing},
+       year = {1990},
+      pages = {537--540},
+topics={Markov,Speech},cat={C},
+}
+
+@INPROCEEDINGS{Bengio_LeCun94,
+    author = {Bengio, Yoshua and {LeCun}, Yann and Henderson, Donnie},
+     title = {Globally Trained Handwritten Word Recognizer using Spatial Representation, Space Displacement Neural Networks and Hidden {M}arkov Models},
+      year = {1994},
+     pages = {937--944},
+  crossref = {NIPS6-shorter},
+  abstract = {We introduce a new approach for on-line recognition of handwritten words written in unconstrained mixed style. The preprocessor performs a word-level normalization by fitting a model of the word structure using the {EM} algorithm. Words are then coded into low resolution ``annotated images'' where each pixel contains information about trajectory direction and curvature. The recognizer is a convolution network which can be spatially replicated. From the network output, a hidden {Markov} model produces word scores. The entire system is globally trained to minimize word-level errors.},
+topics={Speech},cat={C},
+}
+
+@ARTICLE{Bengio_LeCun95,
+    author = {Bengio, Yoshua and {LeCun}, Yann and Nohl, Craig and Burges, Chris},
+     title = {{LeRec}: A {NN}/{HMM} Hybrid for On-Line Handwriting Recognition},
+   journal = {Neural Computation},
+    volume = {7},
+    number = {6},
+      year = {1995},
+     pages = {1289--1303},
+  abstract = {We introduce a new approach for on-line recognition of handwritten words written in unconstrained mixed style. The preprocessor performs a word-level normalization by fitting a model of the word structure using the {EM} algorithm. Words are then coded into low resolution ``annotated images'' where each pixel contains information about trajectory direction and curvature. The recognizer is a convolution network which can be spatially replicated. From the network output, a hidden {Markov} model produces word scores. The entire system is globally trained to minimize word-level errors.},
+topics={PriorKnowledge,Speech},cat={J},
+}
+
+@ARTICLE{Bengio_prel92,
+    author = {Bengio, Yoshua and Gori, Marco and De Mori, Renato},
+     title = {Learning the Dynamic Nature of Speech with Back-propagation for Sequences},
+   journal = {Pattern Recognition Letters},
+    volume = {13},
+    number = {5},
+      year = {1992},
+     pages = {375--385},
+      note = {(Special issue on Artificial Neural Networks)},
+topics={Speech},cat={J},
+}
+
+@ARTICLE{Bengio_trnn92,
+    author = {Bengio, Yoshua and De Mori, Renato and Flammia, Giovanni and Kompe, Ralf},
+     title = {Global Optimization of a Neural Network-Hidden {M}arkov Model Hybrid},
+   journal = {IEEE Transactions on Neural Networks},
+    volume = {3},
+    number = {2},
+      year = {1992},
+     pages = {252--259},
+topics={Markov},cat={J},
+}
+
+@TECHREPORT{Bergstra+2009,
+       author = {Bergstra, James and Desjardins, Guillaume and Lamblin, Pascal and Bengio, Yoshua},
+        title = {Quadratic Polynomials Learn Better Image Features},
+       number = {1337},
+         year = {2009},
+  institution = {D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+     abstract = {The affine-sigmoidal hidden unit (of the form $\sigma(ax+b)$) 
+    is a crude predictor of neuron response in visual area V1.
+    More descriptive models of V1 have been advanced that are no more computationally expensive,
+    yet artificial neural network research continues to focus on networks of affine-sigmoidal models.
+    This paper identifies two qualitative differences between the affine-sigmoidal hidden unit
+    and a particular recent model of V1 response:
+    a) the presence of a low-rank quadratic term in the argument to $\sigma$, 
+    and b) the use of a gentler non-linearity than the $\tanh$ or logistic sigmoid.
+    We evaluate these model ingredients by training single-layer 
+    neural networks to solve three image classification tasks.
+    We experimented with fully-connected hidden units,
+    as well as locally-connected units and convolutional units
+    that more closely mimic the function and connectivity of the visual system.
+    On all three tasks, both the quadratic interactions and the gentler non-linearity
+    lead to significantly better generalization.
+    The advantage of quadratic units was strongest in conjunction with sparse and convolutional hidden units.}
+}
+
+@MISC{bergstra+al:2010-scipy,
+        author = {Bergstra, James},
+         title = {Optimized Symbolic Expressions and {GPU} Metaprogramming with Theano},
+          year = {2010},
+  howpublished = {{SciPy}},
+          note = {Oral}
+}
+
+@MISC{bergstra+al:2010-sharcnet,
+        author = {Bergstra, James and Bengio, Yoshua},
+         title = {{GPU} Programming with Theano},
+          year = {2010},
+  howpublished = {{SHARCNET} Research Day},
+          note = {Oral}
+}
+
+@MISC{bergstra+al:2010snowbird,
+        author = {Bergstra, James and Breuleux, Olivier and Bastien, Fr{\'{e}}d{\'{e}}ric and Lamblin, Pascal and Turian, Joseph and Desjardins, Guillaume and Pascanu, Razvan and Erhan, Dumitru and Delalleau, Olivier and Bengio, Yoshua},
+         title = {Deep Learning on {GPU}s with Theano},
+  howpublished = {The Learning Workshop},
+          year = {2010},
+          note = {Oral}
+}
+
+@INPROCEEDINGS{Bergstra+Bengio-2009,
+    author = {Bergstra, James and Bengio, Yoshua},
+     title = {Slow, Decorrelated Features for Pretraining Complex Cell-like Networks},
+      year = {2009},
+  crossref = {NIPS22}
+}
+
+@ARTICLE{bergstra+casagrande+erhan+eck+kegl:2006,
+    author = {Bergstra, James and Casagrande, Norman and Erhan, Dumitru and Eck, Douglas and K{\'{e}}gl, Bal{\'{a}}zs},
+     title = {Aggregate Features and {AdaBoost} for Music Classification},
+   journal = {Machine Learning},
+    volume = {65},
+      year = {2006},
+     pages = {473--484},
+      issn = {0885-6125},
+  abstract = {We present an algorithm that predicts musical genre and artist from an audio waveform. Our method uses the ensemble learner ADABOOST to select from a set of audio features that have been extracted from segmented audio and then aggregated. Our classifier proved to be the most effective method for genre classification at the recent MIREX 2005 international contests in music information extraction, and the second-best method for recognizing artists. This paper describes our method in detail, from feature extraction to song classification, and presents an evaluation of our method on three genre databases and two artist-recognition databases. Furthermore, we present evidence collected from a variety of popular features and classifiers that the technique of classifying features aggregated over segments of audio is better than classifying either entire songs or individual short-timescale features.},
+PDF = {papers/2006_ml_draft.pdf},
+  SOURCE = {OwnPublication},
+}
+
+@INPROCEEDINGS{bergstra+lacoste+eck:2006,
+     author = {Bergstra, James and Lacoste, Alexandre and Eck, Douglas},
+      title = {Predicting Genre Labels for Artists using {FreeDB}},
+  booktitle = {Proc. 7th International Conference on Music Information Retrieval (ISMIR)},
+       year = {2006},
+SOURCE = {OwnPublication},
+  PDF = {papers/2006_ismir_freedb.pdf},
+}
+
+@INPROCEEDINGS{bergstra+mandel+eck:2010,
+     author = {Bergstra, James and Mandel, Michael and Eck, Douglas},
+      title = {Scalable Genre and Tag Prediction with Spectral Covariance},
+  booktitle = {{ISMIR}},
+       year = {2010},
+       note = {accepted}
+}
+
+@MASTERSTHESIS{Bergstra-Msc-2006,
+    author = {Bergstra, James},
+  keywords = {apprentissage statistique, classification de musique par genre, extraction de caract{\'{e}}ristiques sonores, recherche d'information musicale},
+     title = {Algorithms for Classifying Recorded Music by Genre},
+      year = {2006},
+    school = {Universit{\'{e}} de Montr{\'{e}}al},
+  abstract = {Ce m{\'{e}}moire traite le probl{\`{e}}me de la classification automatique de signaux musicaux par genre. Dans un premier temps, je pr{\'{e}}sente une technique utilisant l'apprentissage machine pour classifier des statistiques extraites sur des segments du signal sonore. Malgr{\'{e}} le fait que cette technique a d{\'{e}}j{\`{a}} {\'{e}}t{\'{e}} explor{\'{e}}e, mon m{\'{e}}moire est le premier {\`{a}} investiguer l'influence de la longueur et de la quantit{\'{e}} de ces segments sur le taux de classification. J'explore {\'{e}}galement l'importance d'avoir des segments contigus dans le temps. Les segments d'une {\`{a}} trois secondes apportent une meilleure performance, mais pour ce faire, ils doivent {\^{e}}tre suffisamment nombreux. Il peut m{\^{e}}me {\^{e}}tre utile d'augmenter la quantit{\'{e}} de segments jusqu'{\`{a}} ce qu'ils se chevauchent. Dans les m{\^{e}}mes exp{\'{e}}riences, je pr{\'{e}}sente une formulation alternative des descripteurs d'audio nomm{\'{e}}e Melfrequency Cepstral Coefficient (MFCC) qui am{\`{e}}ne un taux de classification de 81 \% sur un jeux de donn{\'{e}}es pour lequel la meilleure performance publi{\'{e}}e est de 71 \%. Cette m{\'{e}}thode de segmentation des chansons, ainsi que cette formulation alternative, ont pour but d'am{\'{e}}liorer l'algorithme gagnant du concours de classification de genre de MIREX 2005, d{\'{e}}velopp{\'{e}} par Norman Casagrande et moi. Ces exp{\'{e}}riences sont un approfondissement du travail entam{\'{e}} par Bergstra et al. [2006a], qui d{\'{e}}crit l'algorithme gagnant de ce concours.
+Dans un deuxi{\`{e}}me temps, je pr{\'{e}}sent une m{\'{e}}thode qui utilise FreeDB, une base de donn{\'{e}}es d'information sur les albums, pour attribuer {\`{a}} un artiste une distribution de probabilit{\'{e}} sur son genre. Avec une petite base de donn{\'{e}}es, faite {\`{a}} la main, je montre qu'il y a une haute corr{\'{e}}lation entre cette distribution et l'{\'{e}}tiquette de genre traditionnel. Bien qu'il reste {\`{a}} d{\'{e}}montrer que cette m{\'{e}}thode est utile pour organiser une collection de musique, ce r{\'{e}}sultat sugg{\`{e}}re qu'on peut maintenant {\'{e}}tiqueter de grandes bases de musique automatiquement {\`{a}} un faible co{\^{u}}t, et par cons{\'{e}}quent de poursuivre plus facilement la recherche en classification {\`{a}} grande {\'{e}}chelle. Ce travail sera publi{\'{e}} comme Bergstra et al. [2006b] {\`{a}} ISMIR 2006.}
+}
+
+@INPROCEEDINGS{bergstra:2010cosyne,
+     author = {Bergstra, James and Bengio, Yoshua and Lamblin, Pascal and Desjardins, Guillaume and Louradour, Jerome},
+      title = {Image classification with complex cell neural networks},
+  booktitle = {Computational and systems neuroscience (COSYNE)},
+       year = {2010},
+       note = {Poster},
+        url = {http://www.frontiersin.org/conferences/individual_abstract_listing.php?conferid=770&pap=3626&ind_abs=1&pg=335},
+        doi = {10.3389/conf.fnins.2010.03.00334}
+}
+
+@INPROCEEDINGS{biaslearn:2000:ijcnn,
+     author = {Ghosn, Joumana and Bengio, Yoshua},
+      title = {Bias Learning, Knowledge Sharing},
+  booktitle = {International Joint Conference on Neural Networks 2000},
+     volume = {I},
+       year = {2000},
+      pages = {9--14},
+        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/ijcnn_manifold.pdf},
+   abstract = {Biasing the hypothesis space of a learner has been shown to improve generalisation performances. Methods for achieving this goal have been proposed, that range from deriving and introducing a bias into a learner to automatically learning the bias. In the latter case, most methods learn the bias by simultaneously training several related tasks derived from the same domain and imposing constraints on their parameters. We extend some of the ideas presented in this field and describe a new model that parameterizes the parameters of each task as a function of an affine manifold defined in parameter space and a point lying on the manifold. An analysis of variance on a class of learning tasks is performed that shows some significantly improved performances when using the model.},
+topics={MultiTask},cat={C},
+}
+
+@ARTICLE{biaslearn:2003:tnn,
+    author = {Ghosn, Joumana and Bengio, Yoshua},
+     title = {Bias Learning, Knowledge Sharing},
+   journal = {IEEE Transactions on Neural Networks},
+    volume = {14},
+    number = {4},
+      year = {2003},
+     pages = {748--765},
+  abstract = {Biasing properly the hypothesis space of a learner has been shown to improve generalization performance. Methods for achieving this goal have been proposed, that range from designing and introducing a bias into a learner to automatically learning the bias. Multitask learning methods fall into the latter category. When several related tasks derived from the same domain are available, these methods use the domain-related knowledge coded in the training examples of all the tasks as a source of bias. We extend some of the ideas presented in this field and describe a new approach that identifies a family of hypotheses, represented by a manifold in hypothesis space, that embodies domain-related knowledge. This family is learned using training examples sampled from a group of related tasks. Learning models trained on these tasks are only allowed to select hypotheses that belong to the family. We show that the new approach encompasses a large variety of families which can be learned. A statistical analysis on a class of related tasks is performed that shows significantly improved performances when using this approach.},
+topics={MultiTask},cat={J},
+}
+
+@MASTERSTHESIS{Boisvert-Mcs-2005,
+    author = {Boisvert, Maryse},
+  keywords = {Algorithme {EM} , D{\'{e}}composition en valeurs singuli{\`{e}}res , D{\'{e}}sambigu{\"{\i}}sation s{\'{e}}mantique , Mod{\`{e}}les graphiques, WordNet },
+     title = {R{\'{e}}duction de dimension pour mod{\`{e}}les graphiques probabilistes appliqu{\'{e}}s {\`{a}} la d{\'{e}}sambiguisation s{\'{e}}mantique},
+      year = {2005},
+    school = {Universit{\'{e}} de Montr{\'{e}}al}
+}
+
+@INPROCEEDINGS{bonneville98,
+     author = {Bonneville, Martin and Meunier, Jean and Bengio, Yoshua and Soucy, Jean-Paul},
+      title = {Support Vector Machines for Improving the Classification of Brain {PET} Images},
+  booktitle = {SPIE Medical Imaging},
+       year = {1998},
+topics={Kernel},cat={C},
+}
+
+@INPROCEEDINGS{Bottou+Bengio95,
+    author = {Bottou, {L{\'{e}}on} and Bengio, Yoshua},
+     title = {Convergence Properties of the {K}-Means Algorithm},
+      year = {1995},
+     pages = {585--592},
+  crossref = {NIPS7-shorter},
+  abstract = {This paper studies the convergence properties of the well known K-Means clustering algorithm. The K-Means algorithm can be described either as a gradient descent algorithm or by slightly extending the mathematics of the {EM} algorithm to this hard threshold case. We show that the K-Means algorithm actually minimizes the quantization error using the very fast Newton algorithm.},
+topics={Unsupervised},cat={C},
+}
+
+@ARTICLE{bottou-98,
+    author = {Bottou, {L{\'{e}}on} and Haffner, Patrick and Howard, Paul G. and Simard, Patrice and Bengio, Yoshua and {LeCun}, Yann},
+     title = {High Quality Document Image Compression with {DjVu}},
+   journal = {Journal of Electronic Imaging},
+    volume = {7},
+    number = {3},
+      year = {1998},
+     pages = {410--425},
+topics={Compression},cat={J},
+}
+
+@INPROCEEDINGS{Bottou-dcc98,
+     author = {Bottou, {L{\'{e}}on} and Howard, Paul G. and Bengio, Yoshua},
+     editor = {{IEEE Computer Society}},
+      title = {The Z-Coder Adaptive Binary Coder},
+  booktitle = {Data Compression Conference},
+       year = {1998},
+        url = {http://leon.bottou.org/papers/bottou-howard-bengio-98},
+topics={Compression},cat={C},
+}
+
+@INPROCEEDINGS{bottou-lecun-bengio-97,
+     author = {Bottou, {L{\'{e}}on} and {LeCun}, Yann and Bengio, Yoshua},
+      title = {Global Training of Document Processing Systems using Graph Transformer Networks},
+  booktitle = {Proc. of Computer Vision and Pattern Recognition},
+       year = {1997},
+      pages = {490--494},
+  publisher = {IEEE},
+        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bottou-lecun-bengio-97.ps.gz},
+topics={PriorKnowledge,Speech},cat={C},
+}
+
+@TECHREPORT{bottou96TR,
+       author = {Bottou, {L{\'{e}}on} and Bengio, Yoshua and {LeCun}, Yann},
+        title = {Document analysis with transducers},
+       number = {Technical Memorandum HA615600-960701-01TM},
+         year = {1996},
+  institution = {AT\&T Labs},
+          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/transducer-tm.ps.gz},
+topics={HighDimensional},cat={T},
+}
+
+@TECHREPORT{bottou97TR,
+       author = {Bottou, {L{\'{e}}on} and Bengio, Yoshua and Howard, Paul G.},
+        title = {Z-Coder: A Fast Adaptive Binary Arithmetic Coder},
+       number = {Technical Memorandum HA615600-970721-02TM},
+         year = {1997},
+  institution = {AT\&T Labs},
+          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/zcoder-tm.ps.gz},
+topics={Compression},cat={T},
+}
+
+@MASTERSTHESIS{Bouchard-Msc-2007,
+    author = {Bouchard, Lysiane},
+  keywords = {auditory cortex, fMRI, linear classifier, logistic regression, na{\"{\i}}ve bayesian gaussian model, neuroimaging, spectro-temporal modulation, support vectors machine},
+     title = {Analyse par apprentissage automatique des r{\'{e}}ponses fMRI du cortex auditif {\`{a}} des modulations spectro-temporelles.},
+      year = {2009},
+    school = {Universit{\'{e}} de Montr{\'{e}}al},
+  abstract = {The application of linear machine learning classifiers to the analysis of brain imaging data (fMRI) has led to several interesting breakthroughs in recent years. These classifiers combine the responses of the voxels to detect and categorize different brain states. They allow a more agnostic analysis than conventional fMRI analysis that systematically treats weak and distributed patterns as unwanted noise. In this project, we use such classifiers to validate an hypothesis concerning the encoding of sounds in the human brain. More precisely, we attempt to locate neurons tuned to spectral and temporal modulations in sound. We use fMRI recordings of brain responses of subjects listening to 49 different spectro-temporal modulations. The analysis of fMRI data through linear classifiers is not yet a standard procedure in this field. Thus, an important objective of this project, in the long term, is the development of new machine learning algorithms specialized for neuroimaging data. For these reasons, an important part of the experiments is dedicated to studying the behaviour of the classifiers. We are mainly interested in 3 standard linear classifiers, namely the support vectors machine algorithm (linear), the logistic regression algorithm (regularized) and the na{\"{\i}}ve bayesian gaussian model (shared variances).}
+}
+
+@PHDTHESIS{Boufaden-Phd-2005,
+    author = {Boufaden, Narj{\`{e}}s},
+     title = {Extraction d'information {\`{a}} partir de transcriptions de conversations t{\'{e}}l{\'{e}}phoniques sp{\'{e}}cialis{\'{e}}es},
+      year = {2005},
+    school = {Universit{\'{e}} de Montr{\'{e}}al, D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle}
+}
+
+@INPROCEEDINGS{Carreau+Bengio-2007,
+     author = {Carreau, Julie and Bengio, Yoshua},
+      title = {A Hybrid {Pareto} Model for Conditional Density Estimation of Asymmetric Fat-Tail Data},
+  booktitle = {Proceedings of the Eleventh International Conference on Artificial Intelligence and Statistics (AISTATS'07)},
+       year = {2007},
+  publisher = {Omnipress},
+   abstract = {We propose an estimator for the conditional density p(Y|X) that can adapt for asymmetric heavy tails which might depend on X. Such estimators have important applications in finance and insurance. We draw from Extreme Value Theory the tools to build a hybrid unimodal density having a parameter controlling the heaviness of the upper tail. This hybrid is a Gaussian whose upper tail has been replaced by a generalized {Pareto} tail.  We use this hybrid in a multi-modal mixture in order to obtain a nonparametric density estimator that can easily adapt for heavy tailed data. To obtain a conditional density estimator, the parameters of the mixture estimator can be seen as
+functions of X and these functions learned. We show experimentally that this approach better models the conditional density in terms of likelihood than compared competing algorithms : conditional mixture models with other types of components and multivariate nonparametric models.},
+date={21-24}
+}
+
+@ARTICLE{Carreau+Bengio-2009,
+    author = {Carreau, Julie and Bengio, Yoshua},
+     title = {A Hybrid {Pareto} Mixture for Conditional Asymmetric Fat-Tailed Distributions},
+   journal = {IEEE Transactions on Neural Networks},
+    volume = {20},
+    number = {7},
+      year = {2009},
+     pages = {1087--1101},
+      issn = {1045-9227},
+  abstract = {In many cases, we observe some variables X that contain predictive information over a scalar variable of interest Y, with (X,Y) pairs observed in a training set. We can take advantage of this information to estimate the conditional density P(Y|X = x). In this paper, we propose a conditional mixture model with hybrid {Pareto} components to estimate P(Y|X = x). The hybrid {Pareto} is a Gaussian whose upper tail has been replaced by a generalized {Pareto} tail. A third parameter, in addition to the location and spread parameters of the Gaussian, controls the heaviness of the upper tail. Using the hybrid {Pareto} in a mixture model results in a nonparametric estimator that can adapt to multimodality, asymmetry, and heavy tails. A conditional density estimator is built by modeling the parameters of the mixture estimator as functions of X. We use a neural network to implement these functions. Such conditional density estimators have important applications in many domains such as finance and insurance. We show experimentally that this novel approach better models the conditional density in terms of likelihood, compared to competing algorithms: conditional mixture models with other types of components and a classical kernel-based nonparametric model.}
+}
+
+@ARTICLE{Carreau+Bengio-extreme-2009,
+    author = {Carreau, Julie and Bengio, Yoshua},
+     title = {A Hybrid {Pareto} Model for Asymmetric Fat-Tailed Data: the univariate case},
+   journal = {Extremes},
+    volume = {12},
+    number = {1},
+      year = {2009},
+     pages = {53--76},
+  abstract = {Density estimators that can adapt to asymmetric heavy tails are required in many applications such as finance and insurance. Extreme Value Theory (EVT) has developed principled methods based on asymptotic results to estimate the tails of most distributions. However, the finite sample approximation might introduce a severe bias in many cases. Moreover, the full range of the distribution is often needed, not only the tail area. On the other hand, non-parametric methods, while being powerful where data are abundant, fail to extrapolate properly in the tail area. We put forward a non-parametric density estimator that brings together the strengths of non-parametric density estimation and of EVT.  A hybrid {Pareto} distribution that can be used in a mixture model is proposed to extend the generalized {Pareto} (GP) to the whole real axis. Experiments on simulated data show the following. On one hand, the mixture of hybrid {Pareto}s converges faster in terms of log-likelihood and provides good estimates of the tail of the distributions when compared with other density estimators including the GP distribution. On the other hand, the mixture of hybrid {Pareto}s offers an alternate way to estimate the tail index which is comparable to the one estimated with the standard GP methodology. The mixture of hybrids is also evaluated on the Danish fire insurance data set.}
+}
+
+@PHDTHESIS{Carreau-PhD-2007,
+    author = {Carreau, Julie},
+  keywords = {density estimation, extreme values, generalized {Pareto} distribution, heavy-tailed distribution, mixture of distributions, neural networks},
+     title = {Mod{\`{e}}les {Pareto} hybrides pour distributions asym{\'{e}}triques et {\`{a}} queues lourdes},
+      year = {2007},
+    school = {Universit{\'{e}} de Montr{\'{e}}al},
+  abstract = {We put forward a class of density estimators that can adapt to asymmetric, multi-modal and heavy-tailed distributions. Such distributions occur in many application domains such as finance and insurance. Mixtures of Gaussians are flexible non-parametric density estimators that have good approximation properties when the number of components is well chosen with respect to the training set size. However, those models are performing poorly on heavy-tailed data because few observations occur in the tail area. To solve this problem, we resort to extreme value theory where methods based on sound parametric assumptions have been developed to enable extrapolation beyond the range of the observations. More precisely, we build on the PoT method that was developed in hydrology where PoT stands for ``Peaks-over-Threshold''. The observations exceeding a given threshold are modeled by the generalized {Pareto} distribution. This distribution can approximate arbitrarily well the tail of most distributions. We build a new distribution, the hybrid {Pareto}, by stitching together a truncated Normal and a generalized {Pareto} distribution. We impose continuity constraints at the junction point. The hybrid {Pareto} is thus a smooth distribution that can be used in a mixture model.  The behavior of the upper tail of the hybrid is similar to the behavior of the generalized {Pareto} tail. Moreover, the threshold inherent in the PoT methodology can now be defined implicitly as the junction point of the component with the heaviest tail. This component also determines the tail index of the mixture. Hence, the hybrid {Pareto} mixture offers an alternate way to estimate the tail index associated with heavy-tailed data. In several applications, information that has predictive power on the variable of interest is available. In that case, we want to model the conditional density of Y given X, the vector containing predictive information.
+When the distribution of Y given X is asymmetric, multi-modal and heavy-tailed, we propose to use a mixture of hybrid {Pareto}s whose parameters are functions of X. Those functions are implemented by means of a neural network with one hidden layer. Neural networks are non-parametric models that can, in principle, approximate any continuous function.  Experiments on artificial and real data sets show that the hybrid {Pareto} mixture, unconditional and conditional, outperforms other density estimators in terms of log-likelihood.}
+}
+
+@INPROCEEDINGS{casagrande+eck+kegl:icmc2005,
+     author = {Casagrande, Norman and Eck, Douglas and K{\'{e}}gl, Bal{\'{a}}zs},
+      title = {Geometry in Sound: A Speech/Music Audio Classifier Inspired by an Image Classifier},
+  booktitle = {{Proceedings of the International Computer Music Conference (ICMC)}},
+       year = {2005},
+      pages = {207--210},
+        url = {http://www.iro.umontreal.ca/~eckdoug/papers/2005_icmc_casagrande.pdf},
+source={OwnPublication},
+sourcetype={Conference},
+}
+
+@INPROCEEDINGS{casagrande+eck+kegl:ismir2005,
+     author = {Casagrande, Norman and Eck, Douglas and K{\'{e}}gl, Bal{\'{a}}zs},
+      title = {Frame-Level Audio Feature Extraction using {A}da{B}oost},
+  booktitle = {{Proceedings of the 6th International Conference on Music Information Retrieval ({ISMIR} 2005)}},
+       year = {2005},
+      pages = {345--350},
+        url = {http://www.iro.umontreal.ca/~eckdoug/papers/2005_ismir_casagrande.pdf},
+source={OwnPublication},
+sourcetype={Conference},
+}
+
+@PROCEEDINGS{ccai2006,
+     editor = {Lamontagne, Luc and Marchand, Mario},
+      title = {Advances in Artificial Intelligence, 19th Conference of the Canadian Society for Computational Studies of Intelligence, Canadian AI 2006, Qu{\'{e}}bec City, Qu{\'{e}}bec, Canada, June 7-9, 2006, Proceedings},
+  booktitle = {Canadian Conference on AI},
+     series = {Lecture Notes in Computer Science},
+     volume = {4013},
+       year = {2006},
+  publisher = {Springer}
+}
+
+@INPROCEEDINGS{Chapados+Bengio-2006,
+     author = {Chapados, Nicolas and Bengio, Yoshua},
+      title = {The K Best-Paths Approach to Approximate Dynamic Programming with Application to Portfolio Optimization},
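+  booktitle = {Advances in Artificial Intelligence, 19th Conference of the Canadian Society for Computational Studies of Intelligence (Canadian {AI} 2006)},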
+       year = {2006},
+      pages = {491--502}
+}
+
+@INPROCEEDINGS{Chapados+Bengio-2007,
+     author = {Chapados, Nicolas and Bengio, Yoshua},
+      title = {Forecasting Commodity Contract Spreads with Gaussian Process},
+  booktitle = {13th International Conference on Computing in Economics and Finance},
+       year = {2007},
+   abstract = {We introduce a functional representation of time series which allows forecasts to be performed over an unspecified horizon with progressively-revealed information sets. By virtue of using Gaussian processes, a complete covariance matrix between forecasts at several time-steps is available. This information is put to use in an application to actively trade price spreads between commodity futures contracts. The approach delivers impressive out-of-sample risk-adjusted returns after transaction costs on a portfolio of 30 spreads.}
+}
+
+@ARTICLE{Chapados+Bengio-2008-JOC,
+    author = {Chapados, Nicolas and Bengio, Yoshua},
+     title = {Noisy K Best-Paths for Approximate Dynamic Programming with Application to Portfolio Optimization},
+   journal = {Journal of Computers},
+    volume = {2},
+    number = {1},
+      year = {2007},
+     pages = {12--19},
+  abstract = {We describe a general method to transform a non-Markovian sequential decision problem into a supervised learning problem using a K-bestpaths algorithm. We consider an application in financial portfolio management where we can train a controller to directly optimize a Sharpe Ratio (or other risk-averse non-additive) utility function. We illustrate the approach by demonstrating experimental results using a kernel-based controller architecture that would not normally be considered in traditional
+reinforcement learning or approximate dynamic programming. We further show that using a non-additive criterion (incremental Sharpe Ratio) yields a noisy K-best-paths extraction problem, that can give substantially improved performance.}
+}
+
+@MASTERSTHESIS{Chapados-Msc-2000,
+    author = {Chapados, Nicolas},
+     title = {Crit{\`{e}}res d'optimisation d'algorithmes d'apprentissage en gestion de portefeuille},
+      year = {2000},
+    school = {Universit{\'{e}} de Montr{\'{e}}al}
+}
+
+@INPROCEEDINGS{chapados2000,
+     author = {Chapados, Nicolas and Bengio, Yoshua},
+      title = {Cost Functions and Model Combination for {VaR}-Based Asset Allocation Using Neural Networks},
+  booktitle = {Computational Finance 2000},
+       year = {2000},
+        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/compfin2000_final.pdf},
+   abstract = {We introduce an asset-allocation framework based on the active control of the value-at-risk of the portfolio. Within this framework, we compare two paradigms for making the allocation using neural networks. The first one uses the network to make a forecast of asset behavior, in conjunction with a traditional mean-variance allocator for constructing the portfolio. The second paradigm uses the network to directly make the portfolio allocation decisions. We consider a method for performing soft input variable selection, and show its considerable utility. We use model combination (committee) methods to systematize the choice of hyperparameters during training. We show that committees using both paradigms are significantly outperforming the benchmark market performance.},
+topics={Finance},cat={C},
+}
+
+@ARTICLE{chapados:2001,
+    author = {Chapados, Nicolas and Bengio, Yoshua},
+     title = {Cost Functions and Model Combination for {VaR}-Based Asset Allocation using Neural Networks},
+   journal = {IEEE Transactions on Neural Networks},
+    volume = {12},
+    number = {4},
+      year = {2001},
+     pages = {890--906},
+  abstract = {We introduce an asset-allocation framework based on the active control of the value-at-risk of the portfolio. Within this framework, we
+compare two paradigms for making the allocation using neural networks. The first one uses the network to make a forecast of asset behavior, in conjunction with a traditional mean-variance allocator for constructing the portfolio. The second paradigm uses the network to directly make the portfolio allocation decisions. We consider a method for performing soft input variable selection, and show its considerable utility. We use model combination (committee) methods to systematize the choice of hyperparameters during training. We show that committees
+using both paradigms are significantly outperforming the benchmark market performance.},
+topics={Finance},cat={J},
+}
+
+@ARTICLE{chapados:2003,
+  author   = {Bengio, Yoshua and Chapados, Nicolas},
+  title    = {Extensions to Metric-Based Model Selection},
+  year     = {2003},
+  crossref = {JMLR-shorter},
+  abstract = {Metric-based methods have recently been introduced for model selection and regularization, often yielding very significant improvements over the alternatives tried (including cross-validation). All these methods require unlabeled data over which to compare functions and detect gross differences in behavior away from the training points. We introduce three new extensions of the metric model selection methods and apply them to feature selection. The first extension takes advantage of the particular case of time-series data in which the task involves prediction with a horizon h. The idea is to use at t the h unlabeled examples that precede t for model selection. The second extension takes advantage of the different error distributions of cross-validation and the metric methods: cross-validation tends to have a larger variance and is unbiased. A hybrid combining the two model selection methods is rarely beaten by any of the two methods. The third extension deals with the case when unlabeled data is not available at all, using an estimated input density. Experiments are described to study these extensions in the context of capacity control and feature subset selection.},
+  topics   = {ModelSelection,Finance}, cat = {J},
+}
+
+@ARTICLE{chapelle:2001,
+  author   = {Chapelle, Olivier and Vapnik, Vladimir and Bengio, Yoshua},
+  title    = {Model Selection for Small Sample Regression},
+  journal  = {Machine Learning},
+  year     = {2001},
+  abstract = {Model selection is an important ingredient of many machine learning algorithms, in particular when the sample size is small, in order to strike the right trade-off between overfitting and underfitting. Previous classical results for linear regression are based on an asymptotic analysis. We present a new penalization method for performing model selection for regression that is appropriate even for small samples. Our penalization is based on an accurate estimator of the ratio of the expected training error and the expected generalization error, in terms of the expected eigenvalues of the input covariance matrix.},
+  topics   = {ModelSelection}, cat = {J},
+}
+
+@INCOLLECTION{chapter-eval-longterm-2001,
+  author    = {Schmidhuber, Juergen and Hochreiter, Sepp and Bengio, Yoshua},
+  editor    = {Kolen, J. and Kremer, S.},
+  title     = {Evaluating Benchmark Problems by Random Guessing},
+  booktitle = {Field Guide to Dynamical Recurrent Networks},
+  year      = {2001},
+  publisher = {IEEE Press},
+  topics    = {LongTerm}, cat = {B},
+}
+
+@INCOLLECTION{chapter-gradient-document-2001,
+  author    = {{LeCun}, Yann and Bottou, {L{\'{e}}on} and Bengio, Yoshua and Haffner, Patrick},
+  editor    = {Haykin, S. and Kosko, B.},
+  title     = {Gradient-Based Learning Applied to Document Recognition},
+  booktitle = {Intelligent Signal Processing},
+  year      = {2001},
+  pages     = {306--351},
+  publisher = {IEEE Press},
+  url       = {http://www.iro.umontreal.ca/~lisa/pointeurs/lecun-01a.pdf},
+  abstract  = {Multilayer Neural Networks trained with a backpropagation algorithm constitute the best example of a successful Gradient-Based Learning technique. Given an appropriate network architecture, Gradient-Based Learning algorithms can be used to synthesize a complex decision surface that can classify high-dimensional patterns such as handwritten characters, with minimal preprocessing. This paper reviews various methods applied to handwritten character recognition and compares them on a standard handwritten digit recognition task. Convolutional Neural Networks, that are specifically designed to deal with the variability of 2D shapes, are shown to outperform all other techniques.
+Real-life document recognition systems are composed of multiple modules including field extraction, segmentation, recognition, and language modeling. A new learning paradigm, called Graph Transformer Networks (GTN), allows such multi-module systems to be trained globally using Gradient-Based methods so as to minimize an overall performance measure.
+Two systems for on-line handwriting recognition are described. Experiments demonstrate the advantage of global training, and the flexibility of Graph Transformer Networks.
+A Graph Transformer Network for reading a bank check is also described. It uses Convolutional Neural Network character recognizers combined with a global training technique to provide record accuracy on business and personal checks. It is deployed commercially and reads several million checks per day.},
+  topics    = {PriorKnowledge,Speech}, cat = {B},
+}
+
+@INCOLLECTION{chapter-gradient-flow-2001,
+  author    = {Hochreiter, Sepp and Bengio, Yoshua and Frasconi, Paolo},
+  editor    = {Kolen, J. and Kremer, S.},
+  title     = {Gradient Flow in Recurrent Nets: the Difficulty of Learning Long-Term Dependencies},
+  booktitle = {Field Guide to Dynamical Recurrent Networks},
+  year      = {2001},
+  publisher = {IEEE Press},
+  topics    = {LongTerm}, cat = {B},
+}
+
+@INPROCEEDINGS{chemero+eck:1999,
+  author     = {Chemero, T. and Eck, Douglas},
+  title      = {An Exploration of Representational Complexity via Coupled Oscillators},
+  booktitle  = {{Proceedings of the Tenth Midwest Artificial Intelligence and Cognitive Science Society}},
+  year       = {1999},
+  publisher  = {MIT Press},
+  url        = {http://www.iro.umontreal.ca/~eckdoug/papers/1999_chemero.pdf},
+  abstract   = {We note some inconsistencies in a view of representation which takes \textit{decoupling} to be of key importance. We explore these inconsistencies using examples of representational vehicles taken from coupled oscillator theory and suggest a new way to reconcile \textit{coupling} with \textit{absence}. Finally, we tie these views to a teleological definition of representation.},
+  source     = {OwnPublication},
+  sourcetype = {Conference},
+}
+
+@ARTICLE{ChemInfModel2006,
+  author   = {Erhan, Dumitru and {L'Heureux}, Pierre-Jean and Yue, Shi Yi and Bengio, Yoshua},
+  title    = {Collaborative Filtering on a Family of Biological Targets},
+  journal  = {Journal of Chemical Information and Modeling},
+  volume   = {46},
+  number   = {2},
+  year     = {2006},
+  pages    = {626--635},
+  abstract = {Building a QSAR model of a new biological target for which few screening data are available is a statistical
+challenge. However, the new target may be part of a bigger family, for which we have more screening data.
+Collaborative filtering or, more generally, multi-task learning, is a machine learning approach that improves
+the generalization performance of an algorithm by using information from related tasks as an inductive
+bias. We use collaborative filtering techniques for building predictive models that link multiple targets to
+multiple examples. The more commonalities between the targets, the better the multi-target model that can
+be built. We show an example of a multi-target neural network that can use family information to produce
+a predictive model of an undersampled target. We evaluate JRank, a kernel-based method designed for
+collaborative filtering. We show their performance on compound prioritization for an HTS campaign and
+the underlying shared representation between targets. JRank outperformed the neural network both in the
+single- and multi-target models.},
+  topics   = {Bioinformatic,MultiTask}, cat = {J},
+}
+
+@TECHREPORT{collobert:2001:rr01-12,
+  author      = {Collobert, Ronan and Bengio, Samy and Bengio, Yoshua},
+  title       = {A Parallel Mixture of {SVM}s for Very Large Scale Problems},
+  number      = {IDIAP-RR 01-12},
+  year        = {2001},
+  institution = {IDIAP},
+  url         = {http://www.iro.umontreal.ca/~lisa/pointeurs/IDIAP-RR-01-12.ps},
+  abstract    = {Support Vector Machines ({SVM}s) are currently the state-of-the-art models for many classification problems but they suffer from the complexity of their training algorithm which is at least quadratic with respect to the number of examples. Hence, it is hopeless to try to solve real-life problems having more than a few hundreds of thousands examples with {SVM}s. The present paper proposes a new mixture of {SVM}s that can be easily implemented in parallel and where each {SVM} is trained on a small subset of the whole dataset. Experiments on a large benchmark dataset (Forest) yielded significant time improvement (time complexity appears empirically to locally grow linearly with the number of examples). In addition, and that is a surprise, a significant improvement in generalization was observed.},
+  topics      = {Kernel}, cat = {T},
+}
+
+@ARTICLE{collobert:2002,
+  author   = {Collobert, Ronan and Bengio, Samy and Bengio, Yoshua},
+  title    = {A Parallel Mixture of {SVM}s for Very Large Scale Problems},
+  journal  = {Neural Computation},
+  year     = {2002},
+  abstract = {Support Vector Machines ({SVM}s) are currently the state-of-the-art models for many classification problems but they suffer from the complexity of their training algorithm which is at least quadratic with respect to the number of examples. Hence, it is hopeless to try to solve real-life problems having more than a few hundreds of thousands examples with {SVM}s. The present paper proposes a new mixture of {SVM}s that can be easily implemented in parallel and where each {SVM} is trained on a small subset of the whole dataset. Experiments on a large benchmark dataset (Forest) yielded significant time improvement (time complexity appears empirically to locally grow linearly with the number of examples). In addition, and that is a surprise, a significant improvement in generalization was observed.},
+  topics   = {HighDimensional,Kernel}, cat = {J},
+}
+
+@INCOLLECTION{collobert:2002:book,
+  author    = {Collobert, Ronan and Bengio, Yoshua and Bengio, Samy},
+  editor    = {Lee, S. W. and Verri, A.},
+  title     = {Scaling Large Learning Problems with Hard Parallel Mixtures},
+  booktitle = {Pattern Recognition with Support Vector Machines},
+  series    = {Lecture Notes in Computer Science},
+  volume    = {2388},
+  year      = {2002},
+  publisher = {Springer-Verlag},
+  url       = {http://www.iro.umontreal.ca/~lisa/pointeurs/2002_mixtures_svm.pdf},
+  abstract  = {A challenge for statistical learning is to deal with large data sets, e.g. in data mining. Popular learning algorithms such as Support Vector Machines have training time at least quadratic in the number of examples: they are hopeless to solve problems with a million examples. We propose a "hard parallelizable mixture" methodology which yields significantly reduced training time through modularization and parallelization: the training data is iteratively partitioned by a "gater" model in such a way that it becomes easy to learn an "expert" model separately in each region of the partition. A probabilistic extension and the use of a set of generative models allows representing a gater so that all pieces of the model are locally trained. For {SVM}s, time complexity appears empirically to locally grow linearly with the number of examples, while generalization performance can be enhanced. For the probabilistic version of the algorithm, the iterative algorithm provably goes down in a cost function that is an upper bound on the negative log-likelihood.},
+  topics    = {Kernel}, cat = {B},
+}
+
+@MISC{copyright-CTAI,
+  author       = {Bengio, Yoshua and Ducharme, R{\'{e}}jean and Dorion, Christian},
+  title        = {Commodity Trading Advisor Index},
+  year         = {2004--2009},
+  howpublished = {copyright, and commercialized software license}
+}
+
+@MISC{copyright-PLearn,
+  author       = {Vincent, Pascal and Bengio, Yoshua},
+  title        = {{PLearn}, a {C++} Machine Learning Library},
+  year         = {1998--2009},
+  howpublished = {copyright, public domain license},
+  url          = {http://www.plearn.org}
+}
+
+@ARTICLE{Cosi90,
+  author  = {Cosi, Piero and Bengio, Yoshua and De Mori, Renato},
+  title   = {Phonetically-based multi-layered networks for acoustic property extraction and automatic speech recognition},
+  journal = {Speech Communication},
+  volume  = {9},
+  number  = {1},
+  year    = {1990},
+  pages   = {15--30},
+  topics  = {PriorKnowledge,Speech}, cat = {J},
+}
+
+@INPROCEEDINGS{courville+eck+bengio:nips2009,
+  author     = {Courville, Aaron and Eck, Douglas and Bengio, Yoshua},
+  editor     = {},
+  title      = {An Infinite Factor Model Hierarchy Via a Noisy-Or Mechanism},
+  booktitle  = {Neural Information Processing Systems Conference (NIPS) 22},
+  year       = {2009},
+  pages      = {405--413},
+  publisher  = {},
+  url        = {http://books.nips.cc/papers/files/nips22/NIPS2009_1100.pdf},
+  source     = {OwnPublication},
+  sourcetype = {Conference},
+  pdf        = {""},
+}
+
+@INPROCEEDINGS{davies+plumbley+eck:waspaa2009,
+  author       = {Davies, M. and Plumbley, M. and Eck, Douglas},
+  title        = {Towards a musical beat emphasis function},
+  booktitle    = {Proceedings of IEEE WASPAA},
+  year         = {2009},
+  organization = {IEEE Workshop on Applications of Signal Processing to Audio and Acoustics},
+  source       = {OwnPublication},
+  sourcetype   = {Conference},
+}
+
+@INPROCEEDINGS{Delalleau+al-2005,
+  author    = {Delalleau, Olivier and Bengio, Yoshua and Le Roux, Nicolas},
+  editor    = {Cowell, Robert G. and Ghahramani, Zoubin},
+  title     = {Efficient Non-Parametric Function Induction in Semi-Supervised Learning},
+  booktitle = {Proceedings of the Tenth International Workshop on Artificial Intelligence and Statistics (AISTATS'05)},
+  year      = {2005},
+  pages     = {96--103},
+  publisher = {Society for Artificial Intelligence and Statistics},
+  url       = {http://www.iro.umontreal.ca/~lisa/pointeurs/semisup_aistats2005.pdf},
+  abstract  = {There has been an increase of interest for semi-supervised learning recently, because of the many datasets with large amounts of unlabeled examples and only a few labeled ones. This paper follows up on proposed nonparametric algorithms which provide an estimated continuous label for the given unlabeled examples. First, it extends them to function induction algorithms that minimize a regularization criterion applied to an out-of-sample example, and happen to have the form of Parzen windows regressors. This allows to predict test labels without solving again a linear system of dimension n (the number of unlabeled and labeled training examples), which can cost $O(n^3)$. Second, this function induction procedure gives rise to an efficient approximation of the training process, reducing the linear system to be solved to $m \ll n$ unknowns, using only a subset of m examples. An improvement of $O(n^2/m^2)$ in time can thus be obtained. Comparative experiments are presented, showing the good performance of the induction formula and approximation algorithm.},
+  topics    = {Unsupervised}, cat = {C},
+}
+
+@INCOLLECTION{Delalleau+al-ssl-2006,
+  author    = {Delalleau, Olivier and Bengio, Yoshua and Le Roux, Nicolas},
+  editor    = {Chapelle, Olivier and {Sch{\"{o}}lkopf}, Bernhard and Zien, Alexander},
+  title     = {Large-Scale Algorithms},
+  booktitle = {Semi-Supervised Learning},
+  year      = {2006},
+  pages     = {333--341},
+  publisher = {{MIT} Press},
+  url       = {http://www.iro.umontreal.ca/~lisa/pointeurs/delalleau_ssl.pdf},
+  abstract  = {In Chapter 11, it is shown how a number of graph-based semi-supervised learning
+algorithms can be seen as the minimization of a specific cost function, leading to a
+linear system with n equations and unknowns (with n the total number of labeled
+and unlabeled examples). Solving such a linear system will in general require on the
+order of $O(kn^2)$ time and $O(kn)$ memory (for a sparse graph where each data point
+has k neighbors), which can be prohibitive on large datasets (especially if k = n,
+i.e. the graph is dense). We present in this chapter a subset selection method that
+can be used to reduce the original system to one of size $m \ll n$. The idea is to solve
+for the labels of a subset S of X of only m points, while still retaining information
+from the rest of the data by approximating their label with a linear combination of
+the labels in S (using the induction formula presented in Chapter 11). This leads
+to an algorithm whose computational requirements scale as $O(m^2 n)$ and memory
+requirements as $O(m^2)$, thus allowing one to take advantage of significantly bigger
+unlabeled datasets than with the original algorithms.},
+  cat       = {B}, topics = {Unsupervised},
+}
+
+@INCOLLECTION{DeMori90a,
+  author    = {De Mori, Renato and Bengio, Yoshua and Cosi, Piero},
+  editor    = {Mohr, R. and Pavlidis, T. and Sanfeliu, A.},
+  title     = {On the use of an ear model and multi-layer networks for automatic speech recognition},
+  booktitle = {Structural Pattern Analysis},
+  year      = {1990},
+  publisher = {World Scientific},
+  topics    = {PriorKnowledge,Speech}, cat = {B},
+}
+
+@INPROCEEDINGS{Desjardins+al-2010,
+  author    = {Desjardins, Guillaume and Courville, Aaron and Bengio, Yoshua},
+  title     = {Tempered {Markov} Chain Monte Carlo for training of Restricted {Boltzmann} Machines},
+  booktitle = {Proceedings of AISTATS 2010},
+  volume    = {9},
+  year      = {2010},
+  pages     = {145--152},
+  abstract  = {Alternating Gibbs sampling is the most common scheme used for sampling from Restricted {Boltzmann} Machines (RBM), a crucial component in deep architectures such as Deep Belief Networks. However, we find that it often does a very poor job of rendering the diversity of modes captured by the trained model. We suspect that this hinders the advantage that could in principle be brought by training algorithms relying on Gibbs sampling for uncovering spurious modes, such as the Persistent Contrastive Divergence algorithm. To alleviate this problem, we explore the use of tempered {Markov} Chain Monte-Carlo for sampling in RBMs. We find both through visualization of samples and measures of likelihood on a toy dataset that it helps both sampling and learning.}
+}
+
+@TECHREPORT{Desjardins-2008,
+  author      = {Desjardins, Guillaume and Bengio, Yoshua},
+  keywords    = {Convolutional Architectures, Deep Networks, RBM, Vision},
+  title       = {Empirical Evaluation of Convolutional {RBM}s for Vision},
+  number      = {1327},
+  year        = {2008},
+  institution = {D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+  abstract    = {Convolutional Neural Networks ({CNN}) have had great success in machine learning tasks involving vision and represent one of the early successes of deep networks. Local receptive fields and weight
+sharing make their architecture ideally suited for vision tasks by helping to enforce a prior based on our knowledge of natural images. This same prior could also be applied to recent developments in the field of deep networks, in order to tailor these new architectures for artificial vision. In this context, we show how the Restricted {Boltzmann} Machine (RBM), the building block of Deep Belief Networks (DBN), can be adapted to operate in a convolutional manner. We compare their performance to standard fully-connected RBMs on a simple visual learning task and show that the convolutional RBMs (CRBMs) converge to smaller values of the negative likelihood function. Our experiments also indicate that CRBMs are more efficient than standard RBMs trained on small image patches, with the CRBMs having faster convergence.}
+}
+
+@TECHREPORT{Desjardins-tech-2009,
+  author      = {Desjardins, Guillaume and Courville, Aaron and Bengio, Yoshua and Vincent, Pascal and Delalleau, Olivier},
+  keywords    = {CD, PCD, RBM, simulated tempering, tempered MCMC, unsupervised learning},
+  title       = {Tempered {Markov} Chain Monte Carlo for training of Restricted {Boltzmann} Machines},
+  number      = {1345},
+  year        = {2009},
+  institution = {D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+  abstract    = {Alternating Gibbs sampling is the most common scheme used for sampling from Restricted {Boltzmann} Machines (RBM), a crucial component in deep architectures such as Deep Belief Networks. However, we find that it often does a very poor job of rendering the diversity of modes captured by the trained model. We suspect that this hinders the advantage that could in principle be brought by training algorithms relying on Gibbs sampling for uncovering spurious modes, such as the Persistent Contrastive Divergence algorithm. To alleviate this problem, we
+explore the use of tempered {Markov} Chain Monte-Carlo for sampling in RBMs. We find both through visualization of samples and measures of likelihood that it helps both sampling and learning.}
+}
+
+@ARTICLE{Dugas+Bengio-2009,
+  author   = {Dugas, Charles and Bengio, Yoshua and B{\'{e}}lisle, Fran{\c{c}}ois and Nadeau, Claude and Garcia, Ren{\'{e}}},
+  title    = {Incorporating Functional Knowledge in Neural Networks},
+  journal  = {Journal of Machine Learning Research},
+  volume   = {10},
+  year     = {2009},
+  pages    = {1239--1262},
+  abstract = {Incorporating prior knowledge of a particular task into the architecture of a learning algorithm can greatly improve generalization performance. We study here a case where we know that the function to be learned is non-decreasing in its two arguments and convex in one of them. For this purpose we propose a class of functions similar to multi-layer neural networks but (1) that has those properties, (2) is a universal approximator of Lipschitz functions with these and other properties. We apply this new class of functions to the task of modelling the price of call options. Experiments show improvements on regressing the price of call options using the new types of function classes that incorporate the a priori constraints.}
+}
+
+@PHDTHESIS{Dugas-Phd-2003,
+  author = {Dugas, Charles},
+  title  = {Les algorithmes d'apprentissage appliqu{\'{e}}s aux risques financiers},
+  year   = {2003},
+  school = {Universit{\'{e}} de Montr{\'{e}}al}
+}
+
+@ARTICLE{dugas:2003,
+  author   = {Dugas, Charles and Bengio, Yoshua and Chapados, Nicolas and Vincent, Pascal and Denoncourt, Germain and Fournier, Christian},
+  title    = {Statistical Learning Algorithms Applied to Automobile Insurance Ratemaking},
+  journal  = {CAS Forum},
+  volume   = {1},
+  number   = {1},
+  year     = {2003},
+  pages    = {179--214},
+  abstract = {We recently conducted a research project for a large North American automobile insurer. This study was the most exhaustive ever undertaken by this particular insurer and lasted over an entire year. We analyzed the discriminating power of each variable used for ratemaking. We analyzed the performance of several models within five broad categories: linear regressions, generalized linear models, decision trees, neural networks and support vector machines. In this paper, we present the main results of this study. We qualitatively compare models and show how neural networks can represent high-order nonlinear dependencies with a small number of parameters, each of which is estimated on a large proportion of the data, thus yielding low variance. We thoroughly explain the purpose of the nonlinear sigmoidal transforms which are at the very heart of neural networks' performances. The main numerical result is a statistically significant reduction in the out-of-sample mean-squared error using the neural network model and our ability to substantially reduce the median premium by charging more to the highest risks. This in turn can translate into substantial savings and financial benefits for an insurer. We hope this paper goes a long way towards convincing actuaries to include neural networks within their set of modeling tools for ratemaking.},
+  topics   = {Finance,Mining}, cat = {J},
+}
+
+@INPROCEEDINGS{eck+bertinmahieux+lamere+green:nips2007,
+  author   = {Eck, Douglas and Lamere, Paul and Bertin-Mahieux, Thierry and Green, Stephen},
+  editor   = {Platt, John and Koller, Daphne and Singer, Yoram and Roweis, S.},
+  title    = {Automatic Generation of Social Tags for Music Recommendation},
+  year     = {2008},
+  crossref = {NIPS20-shorter},
+  source   = {OwnPublication}
+}
+
+@INPROCEEDINGS{eck+bertinmahieux+lamere:ismir2007,
+  author    = {Eck, Douglas and Bertin-Mahieux, Thierry and Lamere, Paul},
+  title     = {Autotagging music using supervised machine learning},
+  booktitle = {{Proceedings of the 8th International Conference on Music Information Retrieval ({ISMIR} 2007)}},
+  year      = {2007},
+  source    = {OwnPublication},
+}
+
+@INPROCEEDINGS{eck+casagrande:ismir2005,
+  author     = {Eck, Douglas and Casagrande, Norman},
+  title      = {Finding Meter in Music Using an Autocorrelation Phase Matrix and {Shannon} Entropy},
+  booktitle  = {{Proceedings of the 6th International Conference on Music Information Retrieval ({ISMIR} 2005)}},
+  year       = {2005},
+  pages      = {504--509},
+  url        = {http://www.iro.umontreal.ca/~eckdoug/papers/2005_ismir.pdf},
+  source     = {OwnPublication},
+  sourcetype = {Conference},
+}
+
+@INCOLLECTION{eck+gasser+port:2000,
+  author     = {Eck, Douglas and Gasser, M. and Port, Robert},
+  editor     = {Desain, P. and Windsor, L.},
+  title      = {Dynamics and Embodiment in Beat Induction},
+  booktitle  = {{Rhythm Perception and Production}},
+  year       = {2000},
+  pages      = {157--170},
+  publisher  = {Swets and Zeitlinger},
+  url        = {http://www.iro.umontreal.ca/~eckdoug/papers/2000_rppw.pdf},
+  abstract   = {We provide an argument for using dynamical systems theory in the domain of beat induction. We motivate the study of beat induction and to relate beat induction to the more general study of human rhythm cognition. In doing so we compare a dynamical, embodied approach to a symbolic (traditional AI) one, paying particular attention to how the modeling approach brings with it tacit assumptions about what is being modeled. Please note that this is a philosophy paper about research that was, at the time of writing, very much in progress.},
+  source     = {OwnPublication},
+  sourcetype = {Chapter},
+}
+
+@INPROCEEDINGS{eck+gasser:1996,
+  author     = {Eck, Douglas and Gasser, M.},
+  editor     = {},
+  title      = {Perception of Simple Rhythmic Patterns in a Network of Oscillators},
+  booktitle  = {{The Proceedings of the Eighteenth Annual Conference of the Cognitive Science Society}},
+  year       = {1996},
+  publisher  = {Lawrence Erlbaum Associates},
+  abstract   = {This paper is concerned with the complex capacity to recognize and reproduce rhythmic patterns. While this capacity has not been well investigated, in broad qualitative terms it is clear that people can learn to identify and produce recurring patterns defined in terms of sequences of beats of varying intensity and rests: the rhythms behind waltzes, reels, sambas, etc. Our short term goal is a model which is "hard-wired" with knowledge of a set of such patterns. Presented with a portion of one of the patterns or a label for a pattern, the model should reproduce the pattern and continue to do so when the input is turned off. Our long-term goal is a model which can learn to adjust the connection strengths which implement particular patterns as it is exposed to input patterns.},
+  source     = {OwnPublication},
+  sourcetype = {Conference},
+}
+
+@TECHREPORT{eck+graves+schmidhuber:tr-speech2003,
+  author      = {Eck, Douglas and Graves, A. and Schmidhuber, Juergen},
+  title       = {A New Approach to Continuous Speech Recognition Using {LSTM} Recurrent Neural Networks},
+  number      = {IDSIA-14-03},
+  year        = {2003},
+  institution = {IDSIA},
+  abstract    = {This paper presents an algorithm for continuous speech recognition built from two Long Short-Term Memory ({LSTM}) recurrent neural networks. A first {LSTM} network performs frame-level phone probability estimation. A second network maps these phone predictions onto words. In contrast to {HMM}s, this allows greater exploitation of long-timescale correlations. Simulation results are presented for a hand-segmented subset of the "Numbers-95" database. These results include isolated phone prediction, continuous frame-level phone prediction and continuous word prediction. We conclude that despite its early stage of development, our new model is already competitive with existing approaches on certain aspects of speech recognition and promising on others, warranting further research.},
+  source      = {OwnPublication},
+  sourcetype  = {TechReport},
+}
+
+@TECHREPORT{eck+lapalme:2008,
+  author      = {Eck, Douglas and Lapalme, J.},
+  title       = {Learning Musical Structure Directly from Sequences of Music},
+  number      = {1300},
+  year        = {2008},
+  institution = {Universit{\'{e}} de Montr{\'{e}}al DIRO},
+  url         = {http://www.iro.umontreal.ca/~eckdoug/papers/tr1300.pdf},
+  source      = {OwnPublication},
+  sourcetype  = {TechReport},
+}
+
+@INPROCEEDINGS{eck+schmidhuber:icann2002,
+  author     = {Eck, Douglas and Schmidhuber, Juergen},
+  editor     = {Dorronsoro, J.},
+  title      = {Learning The Long-Term Structure of the Blues},
+  booktitle  = {{Artificial Neural Networks -- ICANN 2002 (Proceedings)}},
+  volume     = {},
+  year       = {2002},
+  pages      = {284--289},
+  publisher  = {Springer},
+  url        = {http://www.iro.umontreal.ca/~eckdoug/papers/2002_icannMusic.pdf},
+  abstract   = {In general music composed by recurrent neural networks ({RNN}s) suffers from a lack of global structure. Though networks can learn note-by-note transition probabilities and even reproduce phrases, they have been unable to learn an entire musical form and use that knowledge to guide composition. In this study, we describe model details and present experimental results showing that {LSTM} successfully learns a form of blues music and is able to compose novel (and some listeners believe pleasing) melodies in that style. Remarkably, once the network has found the relevant structure it does not drift from it: {LSTM} is able to play the blues with good timing and proper structure as long as one is willing to listen.},
+  source     = {OwnPublication},
+  sourcetype = {Conference},
+}
+
+@INPROCEEDINGS{eck+schmidhuber:ieee2002,
+  author     = {Eck, Douglas and Schmidhuber, Juergen},
+  editor     = {Bourlard, H.},
+  title      = {Finding Temporal Structure in Music: Blues Improvisation with {LSTM} Recurrent Networks},
+  booktitle  = {Neural Networks for Signal Processing XII, Proceedings of the 2002 IEEE Workshop},
+  year       = {2002},
+  pages      = {747--756},
+  publisher  = {IEEE},
+  url        = {http://www.iro.umontreal.ca/~eckdoug/papers/2002_ieee.pdf},
+  abstract   = {Few types of signal streams are as ubiquitous as music. Here we consider the problem of extracting essential ingredients of music signals, such as well-defined global temporal structure in the form of nested periodicities (or {\em meter}). Can we construct an adaptive signal processing device that learns by example how to generate new instances of a given musical style? Because recurrent neural networks can in principle learn the temporal structure of a signal, they are good candidates for such a task. Unfortunately, music composed by standard recurrent neural networks ({RNN}s) often lacks global coherence. The reason for this failure seems to be that {RNN}s cannot keep track of temporally distant events that indicate global music structure. Long Short-Term Memory ({LSTM}) has succeeded in similar domains where other {RNN}s have failed, such as timing \& counting and learning of context sensitive languages. In the current study we show that {LSTM} is also a good mechanism for learning to compose music. We present experimental results showing that {LSTM} successfully learns a form of blues music and is able to compose novel (and we believe pleasing) melodies in that style. Remarkably, once the network has found the relevant structure it does not drift from it: {LSTM} is able to play the blues with good timing and proper structure as long as one is willing to listen.},
+  source     = {OwnPublication},
+  sourcetype = {Conference},
+}
+
+@ARTICLE{eck+scott:2005,
+  author     = {Eck, Douglas and Scott, S. K.},
+  title      = {Editorial: New Research in Rhythm Perception and Production},
+  journal    = {Music Perception},
+  volume     = {22},
+  number     = {3},
+  year       = {2005},
+  pages      = {371--388},
+  source     = {OwnPublication},
+  sourcetype = {Other},
+}
+
+@MISC{eck+scott:editor2005,
+  author     = {Eck, Douglas and Scott, S. K.},
+  title      = {Music Perception},
+  year       = {2005},
+  note       = {Guest Editor, Special Issue on Rhythm Perception and Production, 22(3)},
+  source     = {OwnPublication},
+  sourcetype = {Other},
+}
+
+@INPROCEEDINGS{eck:1999,
+  author     = {Eck, Douglas},
+  editor     = {},
+  title      = {Learning Simple Metrical Preferences in a Network of {Fitzhugh-Nagumo} Oscillators},
+  booktitle  = {{The Proceedings of the Twenty-First Annual Conference of the Cognitive Science Society}},
+  year       = {1999},
+  publisher  = {Lawrence Erlbaum Associates},
+  abstract   = {Hebbian learning is used to train a network of oscillators to prefer periodic signals of pulses over aperiodic signals. Target signals consisted of metronome-like voltage pulses with varying amounts of inter-onset noise injected. (with 0\% noise yielding a periodic signal and more noise yielding more and more aperiodic signals.) The oscillators---piecewise-linear approximations (Abbott, 1990) to Fitzhugh-Nagumo oscillators---are trained using mean phase coherence as an objective function. Before training a network is shown to readily synchronize with signals having wide range of noise. After training on a series of noise-free signals, a network is shown to only synchronize with signals having little or no noise. This represents a bias towards periodicity and is explained by strong positive coupling connections between oscillators having harmonically-related periods.},
+  source     = {OwnPublication},
+  sourcetype = {Conference},
+}
+
+@unpublished{eck:bramsworkshop2004,
+  author     = {Eck, Douglas},
+  title      = {Challenges for Machine Learning in the Domain of Music},
+  year       = {2004},
+  note       = {BRAMS Workshop on Brain and Music, Montreal Neurological Institute},
+  abstract   = {Slides and musical examples available on request.},
+  source     = {OwnPublication},
+  sourcetype = {Workshop},
+  optkey     = {""},
+  optmonth   = {""},
+  optannote  = {""},
+}
+
+@phdthesis{eck:diss,
+  author     = {Eck, Douglas},
+  title      = {{Meter Through Synchrony: Processing Rhythmical Patterns with Relaxation Oscillators}},
+  year       = {2000},
+  school     = {Indiana University, Bloomington, IN, www.idsia.ch/\-\~{}doug/\-publications.html},
+  abstract   = {This dissertation uses a network of relaxation oscillators to beat along with temporal signals. Relaxation oscillators exhibit interspersed slow-fast movement and model a wide array of biological oscillations. The model is built up gradually: first a single relaxation oscillator is exposed to rhythms and shown to be good at finding downbeats in them. Then large networks of oscillators are mutually coupled in an exploration of their internal synchronization behavior. It is demonstrated that appropriate weights on coupling connections cause a network to form multiple pools of oscillators having stable phase relationships. This is a promising first step towards networks that can recreate a rhythmical pattern from memory. In the full model, a coupled network of relaxation oscillators is exposed to rhythmical patterns. It is shown that the network finds downbeats in patterns while continuing to exhibit good internal stability. A novel non-dynamical model of downbeat induction called the Normalized Positive (NP) clock model is proposed, analyzed, and used to generate comparison predictions for the oscillator model. The oscillator model compares favorably to other dynamical approaches to beat induction such as adaptive oscillators. However, the relaxation oscillator model takes advantage of intrinsic synchronization stability to allow the creation of large coupled networks. This research lays the groundwork for a long-term research goal, a robotic arm that responds to rhythmical signals by tapping along. It also opens the door to future work in connectionist learning of long rhythmical patterns.},
+  source     = {OwnPublication},
+  sourcetype = {Thesis},
+}
+
+@inproceedings{eck:icann2001,
+  author     = {Eck, Douglas},
+  editor     = {Dorffner, Georg},
+  title      = {A Network of Relaxation Oscillators that Finds Downbeats in Rhythms},
+  booktitle  = {{Artificial Neural Networks -- ICANN 2001 (Proceedings)}},
+  volume     = {},
+  year       = {2001},
+  pages      = {1239--1247},
+  publisher  = {Springer},
+  url        = {http://www.iro.umontreal.ca/~eckdoug/papers/2001_icann.pdf},
+  abstract   = {A network of relaxation oscillators is used to find downbeats in rhythmical patterns. In this study, a novel model is described in detail. Its behavior is tested by exposing it to patterns having various levels of rhythmic complexity. We analyze the performance of the model and relate its success to previous work dealing with fast synchrony in coupled oscillators.},
+  source     = {OwnPublication},
+  sourcetype = {Conference},
+}
+
+@inproceedings{eck:icassp2007,
+  author     = {Eck, Douglas},
+  editor     = {},
+  title      = {Beat Tracking Using an Autocorrelation Phase Matrix},
+  booktitle  = {{Proceedings of the 2007 International Conference on Acoustics, Speech and Signal Processing (ICASSP)}},
+  year       = {2007},
+  pages      = {1313--1316},
+  publisher  = {IEEE Signal Processing Society},
+  url        = {http://www.iro.umontreal.ca/~eckdoug/papers/2007_icassp.pdf},
+  source     = {OwnPublication},
+  sourcetype = {Conference},
+}
+
+@INPROCEEDINGS{eck:icmpc2004,
+     author = {Eck, Douglas},
+     editor = {Lipscomb, S. D. and Ashley, R. and Gjerdingen, R. O. and Webster, P.},
+      title = {A Machine-Learning Approach to Musical Sequence Induction That Uses Autocorrelation to Bridge Long Timelags},
+  booktitle = {{The Proceedings of the Eighth International Conference on Music Perception and Cognition ({ICMPC}8)}},
+       year = {2004},
+      pages = {542--543},
+  publisher = {Causal Productions},
+   abstract = {One major challenge in using statistical sequence learning methods in the domain of music lies in bridging the long timelags that separate important musical events. Consider, for example, the chord changes that convey the basic structure of a pop song. A sequence learner that cannot predict chord changes will almost certainly not be able to generate new examples in a musical style or to categorize songs by style. Yet, it is surprisingly difficult for a sequence learner to bridge the long timelags necessary to identify when a chord change will occur and what its new value will be. This is the case because chord changes can be separated by dozens or hundreds of intervening notes. One could solve this problem by treating chords as being special (as did Mozer, NIPS 1991). But this is impractical---it requires chords to be labeled specially in the dataset, limiting the applicability of the model to non-labeled examples---and furthermore does not address the general issue of nested temporal structure in music. I will briefly describe this temporal structure (known commonly as ``meter'') and present a model that uses to its advantage an assumption that sequences are metrical. The model consists of an autocorrelation-based filtration that estimates online the most likely metrical tree (i.e. the frequency and phase of beat, measure, phrase, etc.) and uses that to generate a series of sequences varying at different rates. These sequences correspond to each level in the hierarchy. Multiple learners can be used to treat each series separately and their predictions can be combined to perform composition and categorization. I will present preliminary results that demonstrate the usefulness of this approach. Time permitting I will also compare the model to alternate approaches.},
+source={OwnPublication},
+sourcetype={Conference},
+}
+
+@INPROCEEDINGS{eck:icmpc2006,
+     author = {Eck, Douglas},
+     editor = {Baroni, M. and Addessi, A. R. and Caterina, R. and Costa, M.},
+      title = {Beat Induction Using an Autocorrelation Phase Matrix},
+  booktitle = {The Proceedings of the 9th International Conference on Music Perception and Cognition ({ICMPC9})},
+       year = {2006},
+      pages = {931--932},
+  publisher = {Causal Productions},
+source={OwnPublication},
+sourcetype={Conference},
+}
+
+@unpublished{eck:irisworkshop2004,
+  author     = {Eck, Douglas},
+  title      = {Using Autocorrelation to Bridge Long Timelags when Learning Sequences of Music},
+  year       = {2004},
+  note       = {IRIS 2004 Machine Learning Workshop, Ottawa, Canada},
+  abstract   = {Slides and musical examples available on request.},
+  source     = {OwnPublication},
+  sourcetype = {Workshop},
+  optkey     = {""},
+  optmonth   = {""},
+  optannote  = {""},
+}
+
+@article{eck:jnmr2001,
+  author     = {Eck, Douglas},
+  title      = {A Positive-Evidence Model for Rhythmical Beat Induction},
+  journal    = {Journal of New Music Research},
+  volume     = {30},
+  number     = {2},
+  year       = {2001},
+  pages      = {187--200},
+  abstract   = {The Normalized Positive (NPOS) model is a rule-based model that predicts downbeat location and pattern complexity in rhythmical patterns. Though derived from several existing models, the NPOS model is particularly effective at making correct predictions while at the same time having low complexity. In this paper, the details of the model are explored and a comparison is made to existing models. Several datasets are used to examine the complexity predictions of the model. Special attention is paid to the model's ability to account for the effects of musical experience on beat induction.},
+  source     = {OwnPublication},
+  sourcetype = {Journal},
+}
+
+@unpublished{eck:mipsworkshop2004,
+  author     = {Eck, Douglas},
+  title      = {Bridging Long Timelags in Music},
+  year       = {2004},
+  note       = {NIPS 2004 Workshop on Music and Machine Learning (MIPS), Whistler, British Columbia},
+  abstract   = {Slides and musical examples available on request.},
+  source     = {OwnPublication},
+  sourcetype = {Workshop},
+  optkey     = {""},
+  optmonth   = {""},
+  optannote  = {""},
+}
+
+@article{eck:mp2006,
+  author     = {Eck, Douglas},
+  title      = {Finding Long-Timescale Musical Structure with an Autocorrelation Phase Matrix},
+  journal    = {Music Perception},
+  volume     = {24},
+  number     = {2},
+  year       = {2006},
+  pages      = {167--176},
+  source     = {OwnPublication},
+  sourcetype = {Journal},
+}
+
+@unpublished{eck:nipsworkshop2003,
+  author     = {Eck, Douglas},
+  title      = {Time-warped hierarchical structure in music and speech: A sequence prediction challenge},
+  year       = {2003},
+  note       = {NIPS 2003 Workshop on Recurrent Neural Networks, Whistler, British Columbia},
+  abstract   = {Slides and musical examples available on request.},
+  source     = {OwnPublication},
+  sourcetype = {Workshop},
+  optkey     = {""},
+  optmonth   = {""},
+  optannote  = {""},
+}
+
+@unpublished{eck:nipsworkshop2006,
+  author     = {Eck, Douglas},
+  title      = {Generating music sequences with an echo state network},
+  year       = {2006},
+  note       = {NIPS 2006 Workshop on Echo State Networks and Liquid State Machines},
+  abstract   = {Slides and musical examples available on request.},
+  source     = {OwnPublication},
+  sourcetype = {Workshop},
+  optkey     = {""},
+  optmonth   = {""},
+  optannote  = {""},
+}
+
+@unpublished{eck:nipsworkshop2007,
+  author     = {Eck, Douglas},
+  title      = {Measuring and modeling musical expression},
+  year       = {2007},
+  note       = {NIPS 2007 Workshop on Music, Brain and Cognition},
+  source     = {OwnPublication},
+  sourcetype = {Workshop},
+  optkey     = {""},
+  optmonth   = {""},
+  optannote  = {""},
+}
+
+@ARTICLE{eck:psyres2002,
+    author = {Eck, Douglas},
+     title = {Finding Downbeats with a Relaxation Oscillator},
+   journal = {Psychological Research},
+    volume = {66},
+    number = {1},
+      year = {2002},
+     pages = {18--25},
+  abstract = {A relaxation oscillator model of neural spiking dynamics is applied to the task of finding downbeats in rhythmical patterns. The importance of downbeat discovery or {\em beat induction} is discussed, and the relaxation oscillator model is compared to other oscillator models. In a set of computer simulations the model is tested on 35 rhythmical patterns from Povel \& Essens (1985). The model performs well, making good predictions in 34 of 35 cases. In an analysis we identify some shortcomings of the model and relate model behavior to dynamical properties of relaxation oscillators.},
+source={OwnPublication},
+sourcetype={Journal},
+}
+
+@unpublished{eck:rppw2005,
+  author     = {Eck, Douglas},
+  title      = {Meter and Autocorrelation},
+  year       = {2005},
+  note       = {{10th Rhythm Perception and Production Workshop (RPPW), Alden Biesen, Belgium}},
+  source     = {OwnPublication},
+  sourcetype = {Workshop},
+}
+
+@techreport{eck:tr-music2002,
+  author      = {Eck, Douglas and Schmidhuber, Juergen},
+  title       = {A First Look at Music Composition using {LSTM} Recurrent Neural Networks},
+  number      = {IDSIA-07-02},
+  year        = {2002},
+  institution = {IDSIA},
+  abstract    = {In general music composed by recurrent neural networks ({RNN}s) suffers from a lack of global structure. Though networks can learn note-by-note transition probabilities and even reproduce phrases, attempts at learning an entire musical form and using that knowledge to guide composition have been unsuccessful. The reason for this failure seems to be that {RNN}s cannot keep track of temporally distant events that indicate global music structure. Long Short-Term Memory ({LSTM}) has succeeded in similar domains where other {RNN}s have failed, such as timing \& counting and CSL learning. In the current study I show that {LSTM} is also a good mechanism for learning to compose music. I compare this approach to previous attempts, with particular focus on issues of data representation. I present experimental results showing that {LSTM} successfully learns a form of blues music and is able to compose novel (and I believe pleasing) melodies in that style. Remarkably, once the network has found the relevant structure it does not drift from it: {LSTM} is able to play the blues with good timing and proper structure as long as one is willing to listen. {\em Note: This is a more complete version of the 2002 ICANN submission Learning the Long-Term Structure of the Blues.}},
+  source      = {OwnPublication},
+  sourcetype  = {TechReport},
+}
+
+@TECHREPORT{eck:tr-npos2000,
+       author = {Eck, Douglas},
+        title = {A Positive-Evidence Model for Classifying Rhythmical Patterns},
+       number = {IDSIA-09-00},
+         year = {2000},
+  institution = {IDSIA},
+     abstract = {The Normalized Positive (NPOS) model is a novel matching model that predicts downbeat location and pattern complexity in rhythmical patterns. Though similar models report success, the NPOS model is particularly effective at making these predictions while at the same time being theoretically and mathematically simple. In this paper, the details of the model are explored and a comparison is made to existing models. Several datasets are used to examine the complexity predictions of the model. Special attention is paid to the model's ability to account for the effects of musical experience on rhythm perception.\\ {\em Note: See the 2001 Journal of New Music Research paper ``A Positive-Evidence Model for Rhythmical Beat Induction'' for a newer version of this paper.}},
+ps={ftp://ftp.idsia.ch/pub/techrep/IDSIA-09-00.ps.gz},
+source={OwnPublication},
+sourcetype={TechReport},
+}
+
+@techreport{eck:tr-oscnet2001,
+  author      = {Eck, Douglas},
+  title       = {A Network of Relaxation Oscillators that Finds Downbeats in Rhythms},
+  number      = {IDSIA-06-01},
+  year        = {2001},
+  institution = {IDSIA},
+  abstract    = {A network of relaxation oscillators is used to find downbeats in rhythmical patterns. In this study, a novel model is described in detail. Its behavior is tested by exposing it to patterns having various levels of rhythmic complexity. We analyze the performance of the model and relate its success to previous work dealing with fast synchrony in coupled oscillators. \\ {\em Note: See the 2001 ICANN conference proceeding by the same title for a newer version of this paper.}},
+  ps          = {ftp://ftp.idsia.ch/pub/techrep/IDSIA-06-01.ps.gz},
+  source      = {OwnPublication},
+  sourcetype  = {TechReport},
+}
+
+@TECHREPORT{eck:tr-tracking2000,
+       author = {Eck, Douglas},
+        title = {Tracking Rhythms with a Relaxation Oscillator},
+       number = {IDSIA-10-00},
+         year = {2000},
+  institution = {IDSIA},
+     abstract = {A number of biological and mechanical processes are typified by a continued slow accrual and fast release of energy. A nonlinear oscillator exhibiting this slow-fast behavior is called a relaxation oscillator and is used to model, for example, human heartbeat pacemaking and neural action potential. Similar limit cycle oscillators are used to model a wider range of behaviors including predator-prey relationships and synchrony in animal populations such as fireflies. Though nonlinear limit-cycle oscillators have been successfully applied to beat induction, relaxation oscillators have received less attention. In this work we offer a novel and effective relaxation oscillator model of beat induction. We outline the model in detail and provide a perturbation analysis of its response to external stimuli. In a series of simulations we expose the model to patterns from Experiment 1 of Povel \& Essens (1985). We then examine the beat assignments of the model. Although the overall performance of the model is very good, there are shortcomings. We believe that a network of mutually-coupled oscillators will address many of these shortcomings, and we suggest an appropriate course for future research.\\ {\em Note: See the 2001 {\em Psychological Research} article ``Finding Downbeats with a Relaxation Oscillator'' for a revised but less detailed version of this paper.}},
+ps={ftp://ftp.idsia.ch/pub/techrep/IDSIA-10-00.ps.gz},
+source={OwnPublication},
+sourcetype={TechReport},
+}
+
+@techreport{eck:tr-tracking2002,
+  author      = {Eck, Douglas},
+  title       = {Real-Time Musical Beat Induction with Spiking Neural Networks},
+  number      = {IDSIA-22-02},
+  year        = {2002},
+  institution = {IDSIA},
+  abstract    = {Beat induction is best described by analogy to the activities of hand clapping or foot tapping, and involves finding important metrical components in an auditory signal, usually music. Though beat induction is intuitively easy to understand it is difficult to define and still more difficult to perform automatically. We will present a model of beat induction that uses a spiking neural network as the underlying synchronization mechanism. This approach has some advantages over existing methods; it runs online, responds at many levels in the metrical hierarchy, and produces good results on performed music (Beatles piano performances encoded as MIDI). In this paper the model is described in some detail and simulation results are discussed.},
+  source      = {OwnPublication},
+  sourcetype  = {TechReport},
+}
+
+@UNPUBLISHED{eck:verita2002,
+    author = {Eck, Douglas},
+     title = {Real Time Beat Induction with Spiking Neurons},
+      year = {2002},
+      note = {{Music, Motor Control and the Mind: Symposium at Monte Verita, May}},
+  abstract = {Beat induction is best described by analogy to the activities of hand clapping or foot tapping, and involves finding important metrical components in an auditory signal, usually music. Though beat induction is intuitively easy to understand it is difficult to define and still more difficult to model. I will discuss an approach to beat induction that uses a network of spiking neurons to synchronize with periodic components in a signal at many timescales. Through a competitive process, groups of oscillators embodying a particular metrical interpretation (e.g. ``4/4'') are selected from the network and used to track the pattern. I will compare this model to other approaches including a traditional symbolic AI system (Dixon 2001), and one based on Bayesian statistics (Cemgil et al, 2001). Finally I will present performance results of the network on a set of MIDI-recorded piano performances of Beatles songs collected by the Music, Mind, Machine Group, NICI, University of Nijmegen (see Cemgil et al, 2001 for more details or http://www.nici.kun.nl/mmm).},
+source={OwnPublication},
+sourcetype={Workshop},
+}
+
+@INPROCEEDINGS{ElHihi+Bengio-nips8,
+    author = {El Hihi, Salah and Bengio, Yoshua},
+     title = {Hierarchical Recurrent Neural Networks for Long-Term Dependencies},
+      year = {1996},
+  crossref = {NIPS8-shorter},
+  abstract = {We have already shown that extracting long-term dependencies from sequential data is difficult, both for deterministic dynamical systems such as recurrent networks, and probabilistic models such as hidden {Markov} models ({HMM}s) or input/output hidden {Markov} models ({IOHMM}s). In practice, to avoid this problem, researchers have used domain specific a-priori knowledge to give meaning to the hidden or state variables representing past context. In this paper we propose to use a more general type of a-priori knowledge, namely that the temporal dependencies are structured hierarchically. This implies that long-term dependencies are represented by variables with a long time scale. This principle is applied to a recurrent network which includes delays and multiple time scales. Experiments confirm the advantages of such structures. A similar approach is proposed for {HMM}s and {IOHMM}s.},
+topics={LongTerm},cat={C},
+}
+
+@article{Erhan+al-2010,
+  author   = {Erhan, Dumitru and Bengio, Yoshua and Courville, Aaron and Manzagol, Pierre-Antoine and Vincent, Pascal and Bengio, Samy},
+  title    = {Why Does Unsupervised Pre-training Help Deep Learning?},
+  volume   = {11},
+  year     = {2010},
+  pages    = {625--660},
+  crossref = {JMLR-shorter},
+  abstract = {Much recent research has been devoted to learning algorithms for deep architectures such as Deep Belief Networks and stacks of auto-encoder variants, with impressive results obtained in several areas, mostly on vision and language datasets. The best results obtained on supervised learning tasks involve an unsupervised learning component, usually in an unsupervised pre-training phase. Even though these new algorithms have enabled training deep models, many questions remain as to the nature of this difficult learning problem. The main question investigated here is the following: why does unsupervised pre-training work and why does it work so well? Answering these questions is important if learning in deep architectures is to be further improved. We propose several explanatory hypotheses and test them through extensive simulations. We empirically show the influence of pre-training with respect to architecture depth, model capacity, and number of training examples. The experiments confirm and clarify the advantage of unsupervised pre-training. The results suggest that unsupervised pre-training guides the learning towards basins of attraction of minima that are better in terms of the underlying data distribution; the evidence from these results supports a regularization explanation for the effect of pre-training.}
+}
+
+@INPROCEEDINGS{Erhan-aistats-2010,
+     author = {Erhan, Dumitru and Courville, Aaron and Bengio, Yoshua and Vincent, Pascal},
+      title = {Why Does Unsupervised Pre-training Help Deep Learning?},
+  booktitle = {Proceedings of AISTATS 2010},
+     volume = {9},
+       year = {2010},
+      pages = {201--208},
+   abstract = {Much recent research has been devoted to learning algorithms for deep architectures such as Deep Belief Networks and stacks of auto-encoder variants with impressive results being obtained in several areas, mostly on vision and language datasets. The best results obtained on supervised learning tasks often involve an unsupervised learning component, usually in an unsupervised pre-training phase. The main question investigated here is the following: why does unsupervised pre-training work so well? Through extensive experimentation, we explore several possible explanations discussed in the literature including its action as a regularizer (Erhan et al. 2009) and as an aid to optimization (Bengio et al. 2007). Our results build on the work of Erhan et al. 2009, showing that unsupervised pre-training appears to play predominantly a regularization role in subsequent supervised training. However our results in an online setting, with a virtually unlimited data stream, point to a somewhat more nuanced interpretation of the roles of optimization and regularization in the unsupervised pre-training effect.}
+}
+
+@MASTERSTHESIS{Erhan-MSc,
+    author = {Erhan, Dumitru},
+  keywords = {Apprentissage multit{\^{a}}che, Filtrage collaboratif, M{\'{e}}thodes {\`{a}} noyaux, QSAR, R{\'{e}}seaux de neurones},
+     title = {Collaborative filtering techniques for drug discovery},
+      year = {2006},
+    school = {Universit{\'{e}} de Montr{\'{e}}al},
+  abstract = {Cette th{\`{e}}se examine le probl{\`{e}}me d'apprendre plusieurs t{\^{a}}ches simultan{\'{e}}ment,
+afin de transf{\'{e}}rer les connaissances apprises {\`{a}} une nouvelle t{\^{a}}che. Si
+on suppose que les t{\^{a}}ches partagent une repr{\'{e}}sentation et qu'il est possible de
+d{\'{e}}couvrir cette repr{\'{e}}sentation efficacement, cela peut nous servir {\`{a}} construire un
+meilleur mod{\`{e}}le de la nouvelle t{\^{a}}che. Il existe plusieurs variantes de
+cette m{\'{e}}thode: transfert inductif, apprentissage multit{\^{a}}che, filtrage
+collaboratif etc. Nous avons {\'{e}}valu{\'{e}} plusieurs algorithmes d'apprentissage
+supervis{\'{e}} pour d{\'{e}}couvrir des repr{\'{e}}sentations partag{\'{e}}es parmi les
+t{\^{a}}ches d{\'{e}}finies dans un probl{\`{e}}me de chimie computationnelle. Nous avons
+formul{\'{e}} le probl{\`{e}}me dans un cadre d'apprentissage automatique,
+fait l'analogie avec les algorithmes standards de filtrage collaboratif et construit les
+hypoth{\`{e}}ses g{\'{e}}n{\'{e}}rales qui devraient {\^{e}}tre test{\'{e}}es pour valider l'utilisation des
+algorithmes multit{\^{a}}che. Nous avons aussi {\'{e}}valu{\'{e}} la performance des algorithmes
+d'apprentissage utilis{\'{e}}s et d{\'{e}}montrons qu'il est, en effet, possible de trouver une
+repr{\'{e}}sentation partag{\'{e}}e pour le probl{\`{e}}me consid{\'{e}}r{\'{e}}. Du point de vue
+th{\'{e}}orique, notre apport est une modification d'un algorithme
+standard---les machines {\`{a}} vecteurs de support---qui produit des r{\'{e}}sultats
+comparables aux meilleurs algorithmes disponibles et qui utilise {\`{a}} fond les
+concepts de l'apprentissage multit{\^{a}}che. Du point de vue pratique, notre
+apport est l'utilisation de notre algorithme par les compagnies
+pharmaceutiques dans leur d{\'{e}}couverte de nouveaux m{\'{e}}dicaments.}
+}
+
+@inproceedings{Erhan2009,
+  author   = {Erhan, Dumitru and Manzagol, Pierre-Antoine and Bengio, Yoshua and Bengio, Samy and Vincent, Pascal},
+  keywords = {Deep Networks},
+  title    = {The Difficulty of Training Deep Architectures and the effect of Unsupervised Pre-Training},
+  year     = {2009},
+  pages    = {153--160},
+  crossref = {xAISTATS2009-shorter},
+  abstract = {Whereas theoretical work suggests that deep architectures might be more efficient at representing highly-varying functions, training deep architectures was unsuccessful until the recent advent of algorithms based on unsupervised pretraining. Even though these new algorithms have enabled training deep models, many questions remain as to the nature of this difficult learning problem. Answering these questions is important if learning in deep architectures is to be further improved. We attempt to shed some light on these questions through extensive simulations. The experiments confirm and clarify the advantage of unsupervised pre-training. They demonstrate the robustness of the training procedure with respect to the random initialization, the positive effect of pre-training in terms of optimization and its role as a regularizer. We empirically show the influence of pre-training with respect to architecture depth, model capacity, and number of training examples.}
+}
+
+@ARTICLE{gasser+eck+port:1999,
+    author = {Gasser, M. and Eck, Douglas and Port, Robert},
+     title = {Meter as Mechanism: A Neural Network Model that Learns Metrical Patterns},
+   journal = {Connection Science},
+    volume = {11},
+    number = {2},
+      year = {1999},
+     pages = {187--216},
+  abstract = {One kind of prosodic structure that apparently underlies both music and some examples of speech production is meter. Yet detailed measurements of the timing of both music and speech show that the nested periodicities that define metrical structure can be quite noisy in time. What kind of system could produce or perceive such variable metrical timing patterns? And what would it take to be able to store and reproduce particular metrical patterns from long-term memory? We have developed a network of coupled oscillators that both produces and perceives patterns of pulses that conform to particular meters. In addition, beginning with an initial state with no biases, it can learn to prefer the particular meter that it has been previously exposed to.},
+own={Have},
+source={OwnPublication},
+sourcetype={Journal},
+}
+
+@TECHREPORT{gasser+eck+port:tr-1996,
+       author = {Gasser, M. and Eck, Douglas and Port, Robert},
+        title = {Meter as Mechanism: A Neural Network that Learns Metrical Patterns},
+       number = {180},
+         year = {1996},
+  institution = {Indiana University Cognitive Science Program},
+source={OwnPublication},
+sourcetype={TechReport},
+}
+
+@INPROCEEDINGS{gasser+eck:1996,
+     author = {Gasser, M. and Eck, Douglas},
+     editor = {},
+      title = {Representing Rhythmic Patterns in a Network of Oscillators},
+  booktitle = {{The Proceedings of the International Conference on Music Perception and Cognition}},
+     number = {4},
+       year = {1996},
+      pages = {361--366},
+  publisher = {Lawrence Erlbaum Associates},
+        url = {http://www.iro.umontreal.ca/~eckdoug/papers/1996_gasser_icmpc.pdf},
+   abstract = {This paper describes an evolving computational model of the perception and production of simple rhythmic patterns. The model consists of a network of oscillators of different resting frequencies which couple with input patterns and with each other. Oscillators whose frequencies match periodicities in the input tend to become activated. Metrical structure is represented explicitly in the network in the form of clusters of oscillators whose frequencies and phase angles are constrained to maintain the harmonic relationships that characterize meter. Rests in rhythmic patterns are represented by explicit rest oscillators in the network, which become activated when an expected beat in the pattern fails to appear. The model makes predictions about the relative difficulty of patterns and the effect of deviations from periodicity in the input.},
+source={OwnPublication},
+sourcetype={Conference},
+}
+
+@INPROCEEDINGS{gers+eck+schmidhuber:icann2001,
+     author = {Gers, F. A. and Eck, Douglas and Schmidhuber, Juergen},
+     editor = {Dorffner, Georg},
+      title = {Applying {LSTM} to Time Series Predictable Through Time-Window Approaches},
+  booktitle = {{Artificial Neural Networks -- ICANN 2001 (Proceedings)}},
+       year = {2001},
+      pages = {669--676},
+  publisher = {Springer},
+        url = {http://www.iro.umontreal.ca/~eckdoug/papers/2001_gers_icann.pdf},
+   abstract = {Long Short-Term Memory ({LSTM}) is able to solve many time series tasks unsolvable by feed-forward networks using fixed size time windows. Here we find that {LSTM}'s superiority does {\em not} carry over to certain simpler time series tasks solvable by time window approaches: the Mackey-Glass series and the Santa Fe {FIR} laser emission series (Set A). This suggests to use {LSTM} only when simpler traditional approaches fail.},
+source={OwnPublication},
+sourcetype={Conference},
+}
+
+@TECHREPORT{gers+eck+schmidhuber:tr-2000,
+       author = {Gers, F. A. and Eck, Douglas and Schmidhuber, Juergen},
+        title = {Applying {LSTM} to Time Series Predictable Through Time-Window Approaches},
+       number = {IDSIA-22-00},
+         year = {2000},
+  institution = {IDSIA},
+     abstract = {Long Short-Term Memory ({LSTM}) is able to solve many time series tasks unsolvable by feed-forward networks using fixed size time windows. Here we find that {LSTM}'s superiority does {\em not} carry over to certain simpler time series tasks solvable by time window approaches: the Mackey-Glass series and the Santa Fe {FIR} laser emission series (Set A). This suggests to use {LSTM} only when simpler traditional approaches fail.\\ {\em Note: See the 2001 ICANN conference proceeding by the same title for a newer version of this paper.}},
+ps={ftp://ftp.idsia.ch/pub/techrep/IDSIA-22-00.ps.gz},
+source={OwnPublication},
+sourcetype={TechReport},
+}
+
+@inproceedings{gers+perez+eck+schmidhuber:esann2002,
+  author     = {Gers, F. A. and Perez-Ortiz, J. A. and Eck, Douglas and Schmidhuber, Juergen},
+  title      = {{DEKF-LSTM}},
+  booktitle  = {Proceedings of the 10th European Symposium on Artificial Neural Networks, ESANN 2002},
+  year       = {2002},
+  source     = {OwnPublication},
+  sourcetype = {Conference},
+}
+
+@INPROCEEDINGS{gers+perez+eck+schmidhuber:icannA2002,
+     author = {Gers, F. A. and Perez-Ortiz, J. A. and Eck, Douglas and Schmidhuber, Juergen},
+     editor = {Dorronsoro, J.},
+      title = {Learning Context Sensitive Languages with {LSTM} Trained with {Kalman} Filters},
+  booktitle = {{Artificial Neural Networks -- ICANN 2002 (Proceedings)}},
+       year = {2002},
+      pages = {655--660},
+  publisher = {Springer},
+   abstract = {Unlike traditional recurrent neural networks, the Long Short-Term Memory ({LSTM}) model generalizes well when presented with training sequences derived from regular and also simple nonregular languages. Our novel combination of {LSTM} and the decoupled extended Kalman filter, however, learns even faster and generalizes even better, requiring only the 10 shortest exemplars $n \leq 10$ of the context sensitive language $a^n b^n c^n$ to deal correctly with values of $n$ up to 1000 and more. Even when we consider the relatively high update complexity per timestep, in many cases the hybrid offers faster learning than {LSTM} by itself.},
+source={OwnPublication},
+sourcetype={Conference},
+}
+
+@phdthesis{Ghosn-Phd-2003,
+  author = {Ghosn, Joumana},
+  title  = {Apprentissage multi-t{\^{a}}ches et partage de connaissances},
+  year   = {2003},
+  school = {Universit{\'{e}} de Montr{\'{e}}al}
+}
+
+@inproceedings{ghosn97,
+  author    = {Ghosn, Joumana and Bengio, Yoshua},
+  title     = {Multi-Task Learning for Stock Selection},
+  year      = {1997},
+  pages     = {946--952},
+  publisher = {MIT Press, Cambridge, MA},
+  url       = {http://www.iro.umontreal.ca/~lisa/pointeurs/multitask-nips97.pdf},
+  crossref  = {NIPS9},
+  abstract  = {Artificial Neural Networks can be used to predict future returns of stocks in order to take financial decisions. Should one build a separate network for each stock or share the same network for all the stocks. In this paper we also explore other alternatives, in which some layers are shared and others are not shared. When the prediction of future returns for different stocks are viewed as different tasks, sharing some parameters across stocks is a form of multi-task learning. In a series of experiments with Canadian stocks, we obtain yearly returns that are more than 14\% above various benchmarks.},
+  topics    = {MultiTask,Finance}, cat = {C},
+}
+
+@techreport{Gingras-asynchronous-TR96,
+  author      = {Gingras, Fran{\c c}ois and Bengio, Yoshua},
+  title       = {Handling asynchronous or missing financial data with recurrent networks},
+  number      = {1020},
+  year        = {1996},
+  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+  topics      = {Finance,Missing}, cat = {T},
+}
+
+@techreport{Gingras-financial-TR99,
+  author      = {Gingras, Fran{\c c}ois and Bengio, Yoshua and Nadeau, Claude},
+  title       = {On Out-of-Sample Statistics for Financial Time-Series},
+  number      = {2585},
+  year        = {1999},
+  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+  topics      = {Comparative,Finance}, cat = {T},
+}
+
+@INPROCEEDINGS{gingras2000,
+     author = {Gingras, Fran{\c c}ois and Bengio, Yoshua and Nadeau, Claude},
+      title = {On Out-of-Sample Statistics for Time-Series},
+  booktitle = {Computational Finance 2000},
+       year = {2000},
+        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/out-err-cf2000.pdf},
+   abstract = {This paper studies an out-of-sample statistic for time-series prediction that is analogous to the widely used R2 in-sample statistic. We propose and study methods to estimate the variance of this out-of-sample statistic. We suggest that the out-of-sample statistic is more robust to distributional and asymptotic assumptions behind many tests for in-sample statistics. Furthermore we argue that it may be more important in some cases to choose a model that generalizes as well as possible rather than choose the parameters that are closest to the true parameters. Comparative experiments are performed on a financial time-series (daily and monthly returns of the TSE300 index). The experiments are performed on varying prediction horizons and we study the relation between predictability (out-of-sample R2), variability of the out-of-sample R2 statistic, and the prediction horizon.},
+topics={Comparative,Finance},cat={C},
+}
+
+@INPROCEEDINGS{GlorotAISTATS2010,
+     author = {Glorot, Xavier and Bengio, Yoshua},
+      title = {Understanding the difficulty of training deep feedforward neural networks},
+  booktitle = {Proceedings of AISTATS 2010},
+     volume = {9},
+       year = {2010},
+      pages = {249--256},
+   abstract = {Whereas before 2006 it appears that deep multi-layer neural networks were not successfully trained, since then several algorithms have been shown to successfully train them, with experimental results showing the superiority of deeper vs less deep architectures. All these experimental results were obtained with new initialization or training mechanisms. Our objective here is to understand better why standard gradient descent from random initialization is doing so poorly with deep neural networks, to better understand these recent relative successes and help design better algorithms in the future. We first observe the influence of the non-linear activations functions. We find that the logistic sigmoid activation is unsuited for deep networks with random initialization because of its mean value, which can drive especially the top hidden layer into saturation. Surprisingly, we find that saturated units can move out of saturation by themselves, albeit slowly, and explaining the plateaus sometimes seen when training neural networks. We find that a new non-linearity that saturates less can often be beneficial. Finally, we study how activations and gradients vary across layers and during training, with the idea that training may be more difficult when the singular values of the Jacobian associated with each layer are far from 1. Based on these considerations, we propose a new initialization scheme that brings substantially faster convergence.}
+}
+
+@INPROCEEDINGS{Gori89,
+     author = {Gori, Marco and Bengio, Yoshua and De Mori, Renato},
+      title = {{BPS}: a learning algorithm for capturing the dynamic nature of speech},
+  booktitle = {International Joint Conference on Neural Networks},
+     volume = {2},
+       year = {1989},
+      pages = {417--424},
+  publisher = {IEEE, New York},
+topics={Speech},cat={C},
+}
+
+@incollection{Grandvalet+Bengio-ssl-2006,
+  author    = {Grandvalet, Yves and Bengio, Yoshua},
+  editor    = {Chapelle, Olivier and {Sch{\"{o}}lkopf}, Bernhard and Zien, Alexander},
+  title     = {Entropy Regularization},
+  booktitle = {Semi-Supervised Learning},
+  year      = {2006},
+  pages     = {151--168},
+  publisher = {{MIT} Press},
+  url       = {http://www.iro.umontreal.ca/~lisa/pointeurs/entropy_regularization_2006.pdf},
+  abstract  = {The problem of semi-supervised induction consists in learning a decision rule from
+labeled and unlabeled data. This task can be undertaken by discriminative methods,
+provided that learning criteria are adapted consequently. In this chapter, we motivate the use of entropy regularization as a means to benefit from unlabeled data in
+the framework of maximum a posteriori estimation. The learning criterion is derived
+from clearly stated assumptions and can be applied to any smoothly parametrized
+model of posterior probabilities. The regularization scheme favors low density separation, without any modeling of the density of input features. The contribution
+of unlabeled data to the learning criterion induces local optima, but this problem
+can be alleviated by deterministic annealing. For well-behaved models of posterior
+probabilities, deterministic annealing {EM} provides a decomposition of the learning
+problem in a series of concave subproblems. Other approaches to the semi-supervised
+problem are shown to be close relatives or limiting cases of entropy regularization.
+A series of experiments illustrates the good behavior of the algorithm in terms of
+performance and robustness with respect to the violation of the postulated low density separation assumption. The minimum entropy solution benefits from unlabeled
+data and is able to challenge mixture models and manifold learning in a number of
+situations.},
+  cat       = {B}, topics = {Unsupervised},
+}
+
+@INPROCEEDINGS{graves+eck+schmidhuber:bio-adit2004,
+     author = {Graves, A. and Eck, Douglas and Beringer, N. and Schmidhuber, Juergen},
+      title = {Biologically Plausible Speech Recognition with {LSTM} Neural Nets},
+  booktitle = {Proceedings of the First Int'l Workshop on Biologically Inspired Approaches to Advanced Information Technology (Bio-ADIT)},
+       year = {2004},
+      pages = {127--136},
+        url = {http://www.iro.umontreal.ca/~eckdoug/papers/2004_bioadit.pdf},
+   abstract = {Long Short-Term Memory ({LSTM}) recurrent neural networks ({RNN}s) are local in space and time and closely related to a biological model of memory in the prefrontal cortex. Not only are they more biologically plausible than previous artificial {RNN}s, they also outperformed them on many artificially generated sequential processing tasks. This encouraged us to apply {LSTM} to more realistic problems, such as the recognition of spoken digits. Without any modification of the underlying algorithm, we achieved results comparable to state-of-the-art Hidden {Markov} Model ({HMM}) based recognisers on both the {TIDIGITS} and TI46 speech corpora. We conclude that {LSTM} should be further investigated as a biologically plausible basis for a bottom-up, neural net-based approach to speech recognition.},
+source={OwnPublication},
+sourcetype={Conference},
+}
+
+@TECHREPORT{graves+eck+schmidhuber:tr-digits2003,
+       author = {Graves, A. and Eck, Douglas and Schmidhuber, Juergen},
+        title = {Comparing {LSTM} Recurrent Networks and Spiking Recurrent Networks on the Recognition of Spoken Digits},
+       number = {IDSIA-13-03},
+         year = {2003},
+  institution = {IDSIA},
+     abstract = {One advantage of spiking recurrent neural networks ({SNN}s) is an ability to categorise data using a synchrony-based latching mechanism. This is particularly useful in problems where timewarping is encountered, such as speech recognition. Differentiable recurrent neural networks ({RNN}s) by contrast fail at tasks involving difficult timewarping, despite having sequence learning capabilities superior to {SNN}s. In this paper we demonstrate that Long Short-Term Memory ({LSTM}) is an {RNN} capable of robustly categorizing timewarped speech data, thus combining the most useful features of both paradigms. We compare its performance to {SNN}s on two variants of a spoken digit identification task, using data from an international competition. The first task (described in Nature (Nadis 2003)) required the categorisation of spoken digits with only a single training exemplar, and was specifically designed to test robustness to timewarping. Here {LSTM} performed better than all the {SNN}s in the competition. The second task was to predict spoken digits using a larger training set. Here {LSTM} greatly outperformed an {SNN}-like model found in the literature. These results suggest that {LSTM} has a place in domains that require the learning of large timewarped datasets, such as automatic speech recognition.},
+source={OwnPublication},
+sourcetype={TechReport},
+}
+
+@INPROCEEDINGS{haffner-98,
+     author = {Haffner, Patrick and Bottou, {L{\'{e}}on} and Howard, Paul G. and Simard, Patrice and Bengio, Yoshua and {LeCun}, Yann},
+      title = {Browsing through High Quality Document Images with {DjVu}},
+  booktitle = {Proc. of Advances in Digital Libraries 98},
+       year = {1998},
+      pages = {309--318},
+  publisher = {IEEE},
+        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/haffner-98.ps.gz},
+topics={HighDimensional},cat={C},
+}
+
+@inproceedings{Hamel+al-2009,
+  author    = {Hamel, Philippe and Wood, Sean and Eck, Douglas},
+  title     = {Automatic Identification of Instrument Classes in Polyphonic and Poly-Instrument Audio},
+  booktitle = {10th International Society for Music Information Retrieval Conference},
+  year      = {2009},
+  pages     = {399--404},
+  url       = {http://ismir2009.ismir.net/proceedings/PS3-2.pdf},
+  abstract  = {We present and compare several models for automatic identification of instrument classes in polyphonic and poly-instrument audio. The goal is to be able to identify which categories of instrument (Strings, Woodwind, Guitar, Piano, etc.) are present in a given audio example. We use a machine learning approach to solve this task. We constructed a system to generate a large database of musically relevant poly-instrument audio. Our database is generated from hundreds of instruments classified in 7 categories. Musical audio examples are generated by mixing multi-track MIDI files with thousands of instrument combinations. We compare three different classifiers : a Support Vector Machine ({SVM}), a Multilayer Perceptron (MLP) and a Deep Belief Network (DBN). We show that the DBN tends to outperform both the {SVM} and the MLP in most cases.}
+}
+
+@MISC{Hugo+al-snowbird-2007,
+        author = {Larochelle, Hugo and Bengio, Yoshua and Erhan, Dumitru},
+         title = {Generalization to a zero-data task: an empirical study},
+          year = {2007},
+  howpublished = {Talk and poster presented at the Learning Workshop (Snowbird), San Juan, Puerto Rico, 2007}
+}
+
+@inproceedings{hyper:2000:ijcnn,
+  author    = {Bengio, Yoshua},
+  title     = {Continuous Optimization of Hyper-Parameters},
+  booktitle = {International Joint Conference on Neural Networks 2000},
+  volume    = {I},
+  year      = {2000},
+  pages     = {305--310},
+  url       = {http://www.iro.umontreal.ca/~lisa/pointeurs/hyper-ijcnn2000.pdf},
+  abstract  = {Many machine learning algorithms can be formulated as the minimization of a training criterion which involves a hyper-parameter. This hyper-parameter is usually chosen by trial and error with a model selection criterion. In this paper we present a methodology to optimize several hyper-parameters, based on the computation of the gradient of a model selection criterion with respect to the hyper-parameters. In the case of a quadratic training criterion, the gradient of the selection criterion with respect to the hyper-parameters is efficiently computed by back-propagating through a Cholesky decomposition. In the more general case, we show that the implicit function theorem can be used to derive a formula for the hyper-parameter gradient involving second derivatives of the training criterion.},
+  topics    = {ModelSelection}, cat = {C},
+}
+
+@INPROCEEDINGS{ICML01,
+     editor = {Brodley, Carla E. and Danyluk, Andrea Pohoreckyj},
+      title = {Proceedings of the Eighteenth International Conference on Machine Learning (ICML'01)},
+  booktitle = {Proceedings of the Eighteenth International Conference on Machine Learning (ICML'01)},
+       year = {2001},
+  publisher = {Morgan Kaufmann}
+}
+
+@INPROCEEDINGS{ICML01-short,
+     editor = {Brodley, Carla E. and Danyluk, Andrea Pohoreckyj},
+      title = {Proceedings of the Eighteenth International Conference on Machine Learning (ICML'01)},
+  booktitle = {ICML'01},
+       year = {2001},
+  publisher = {Morgan Kaufmann}
+}
+
+
+@INPROCEEDINGS{ICML02,
+     editor = {Sammut, Claude and Hoffmann, Achim G.},
+      title = {Proceedings of the Nineteenth International Conference on Machine Learning (ICML'02)},
+  booktitle = {Proceedings of the Nineteenth International Conference on Machine Learning (ICML'02)},
+       year = {2002},
+  publisher = {Morgan Kaufmann}
+}
+
+@INPROCEEDINGS{ICML02-short,
+     editor = {Sammut, Claude and Hoffmann, Achim G.},
+      title = {Proceedings of the Nineteenth International Conference on Machine Learning (ICML'02)},
+  booktitle = {ICML'02},
+       year = {2002},
+  publisher = {Morgan Kaufmann}
+}
+
+
+@INPROCEEDINGS{ICML03,
+     editor = {Fawcett, Tom and Mishra, Nina},
+      title = {Proceedings of the Twentieth International Conference on Machine Learning (ICML'03)},
+  booktitle = {Proceedings of the Twentieth International Conference on Machine Learning (ICML'03)},
+       year = {2003},
+  publisher = {AAAI Press}
+}
+
+@INPROCEEDINGS{ICML03-short,
+     editor = {Fawcett, Tom and Mishra, Nina},
+      title = {Proceedings of the Twentieth International Conference on Machine Learning (ICML'03)},
+  booktitle = {ICML'03},
+       year = {2003},
+  publisher = {AAAI Press}
+}
+
+
+@INPROCEEDINGS{ICML04,
+     editor = {Brodley, Carla E.},
+      title = {Proceedings of the Twenty-first International Conference on Machine Learning (ICML'04)},
+  booktitle = {Proceedings of the Twenty-first International Conference on Machine Learning (ICML'04)},
+       year = {2004},
+  publisher = {ACM}
+}
+
+@INPROCEEDINGS{ICML04-short,
+     editor = {Brodley, Carla E.},
+      title = {Proceedings of the Twenty-first International Conference on Machine Learning (ICML'04)},
+  booktitle = {ICML'04},
+       year = {2004},
+  publisher = {ACM}
+}
+
+
+@INPROCEEDINGS{ICML05-short,
+     editor = {De Raedt, Luc and Wrobel, Stefan},
+      title = {Proceedings of the Twenty-second International Conference on Machine Learning (ICML'05)},
+  booktitle = {ICML'05},
+       year = {2005},
+  publisher = {ACM}
+}
+
+
+@INPROCEEDINGS{ICML06-short,
+     editor = {Cohen, William W. and Moore, Andrew},
+      title = {Proceedings of the Twenty-third International Conference on Machine Learning (ICML'06)},
+  booktitle = {ICML'06},
+       year = {2006},
+  publisher = {ACM}
+}
+
+
+@INPROCEEDINGS{ICML07-short,
+     editor = {Ghahramani, Zoubin},
+      title = {Proceedings of the 24th International Conference on Machine Learning (ICML'07)},
+  booktitle = {ICML'07},
+       year = {2007},
+  publisher = {ACM}
+}
+
+
+@INPROCEEDINGS{ICML08-short,
+     editor = {Cohen, William W. and McCallum, Andrew and Roweis, Sam T.},
+      title = {Proceedings of the Twenty-fifth International Conference on Machine Learning (ICML'08)},
+  booktitle = {ICML'08},
+       year = {2008},
+  publisher = {ACM}
+}
+
+
+@INPROCEEDINGS{ICML09-short,
+     editor = {Bottou, {L{\'{e}}on} and Littman, Michael},
+      title = {Proceedings of the Twenty-sixth International Conference on Machine Learning (ICML'09)},
+  booktitle = {ICML'09},
+       year = {2009},
+  publisher = {ACM}
+}
+
+
+@INPROCEEDINGS{ICML96,
+     editor = {Saitta, L.},
+      title = {Proceedings of the Thirteenth International Conference on Machine Learning (ICML'96)},
+  booktitle = {Proceedings of the Thirteenth International Conference on Machine Learning (ICML'96)},
+       year = {1996},
+  publisher = {Morgan Kaufmann}
+}
+
+@INPROCEEDINGS{ICML96-short,
+     editor = {Saitta, L.},
+      title = {Proceedings of the Thirteenth International Conference on Machine Learning (ICML'96)},
+  booktitle = {ICML'96},
+       year = {1996},
+  publisher = {Morgan Kaufmann}
+}
+
+
+@INPROCEEDINGS{ICML97,
+     editor = {Fisher, Douglas H.},
+      title = {Proceedings of the Fourteenth International Conference on Machine Learning (ICML'97)},
+  booktitle = {Proceedings of the Fourteenth International Conference on Machine Learning (ICML'97)},
+       year = {1997},
+  publisher = {Morgan Kaufmann}
+}
+
+@INPROCEEDINGS{ICML97-short,
+     editor = {Fisher, Douglas H.},
+      title = {Proceedings of the Fourteenth International Conference on Machine Learning (ICML'97)},
+  booktitle = {ICML'97},
+       year = {1997},
+  publisher = {Morgan Kaufmann}
+}
+
+
+@INPROCEEDINGS{ICML98,
+     editor = {Shavlik, Jude W.},
+      title = {Proceedings of the Fifteenth International Conference on Machine Learning (ICML'98)},
+  booktitle = {Proceedings of the Fifteenth International Conference on Machine Learning (ICML'98)},
+       year = {1998},
+  publisher = {Morgan Kaufmann}
+}
+
+@INPROCEEDINGS{ICML98-short,
+     editor = {Shavlik, Jude W.},
+      title = {Proceedings of the Fifteenth International Conference on Machine Learning (ICML'98)},
+  booktitle = {ICML'98},
+       year = {1998},
+  publisher = {Morgan Kaufmann}
+}
+
+
+@INPROCEEDINGS{ICML99,
+     editor = {Bratko, Ivan and Dzeroski, Saso},
+      title = {Proceedings of the Sixteenth International Conference on Machine Learning (ICML'99)},
+  booktitle = {Proceedings of the Sixteenth International Conference on Machine Learning (ICML'99)},
+       year = {1999},
+  publisher = {Morgan Kaufmann}
+}
+
+@INPROCEEDINGS{ICML99-short,
+     editor = {Bratko, Ivan and Dzeroski, Saso},
+      title = {Proceedings of the Sixteenth International Conference on Machine Learning (ICML'99)},
+  booktitle = {ICML'99},
+       year = {1999},
+  publisher = {Morgan Kaufmann}
+}
+
+
+@incollection{jaeger+eck:2007,
+  author     = {Jaeger, H. and Eck, Douglas},
+  title      = {Can't get you out of my head: {A} connectionist model of cyclic rehearsal},
+  booktitle  = {Modeling Communications with Robots and Virtual Humans},
+  series     = {{LNCS}},
+  year       = {2007},
+  publisher  = {Springer-Verlag},
+  url        = {http://www.iro.umontreal.ca/~eckdoug/papers/2007_jaeger_eck.pdf},
+  source     = {OwnPublication},
+  sourcetype = {Chapter},
+}
+
+@misc{James+al-snowbird-2008,
+  author       = {Bergstra, James and Bengio, Yoshua and Louradour, Jerome},
+  title        = {Image Classification using Higher-Order Neural Models},
+  year         = {2008},
+  howpublished = {The Learning Workshop (Snowbird, Utah)},
+  url          = {http://snowbird.djvuzone.org/2007/abstracts/161.pdf}
+}
+
+@article{JMLR-short,
+  journal = {JMLR},
+  year    = {-1}
+}
+
+
+@inproceedings{Kegl+Bertin+Eck-2008,
+  author    = {K{\'{e}}gl, Bal{\'{a}}zs and Bertin-Mahieux, Thierry and Eck, Douglas},
+  title     = {Metropolis-Hastings Sampling in a FilterBoost Music Classifier},
+  booktitle = {Music and machine learning workshop (ICML08)},
+  year      = {2008}
+}
+
+@INPROCEEDINGS{kegl2005b,
+    author = {K{\'{e}}gl, Bal{\'{a}}zs},
+     title = {Generalization Error and Algorithmic Convergence of Median Boosting},
+      year = {2005},
+  crossref = {NIPS17-shorter},
+  abstract = {We have recently proposed an extension of ADABOOST to regression that uses the median of the base regressors as the final regressor. In this paper we extend theoretical results obtained for ADABOOST to median boosting and to its localized variant. First, we extend recent results on efficient margin maximizing to show that the algorithm can converge to the maximum achievable margin within a preset precision in a finite number of steps. Then we provide confidence-interval-type bounds on the generalization error.}
+}
+
+@article{lacoste+eck:eurasip,
+  author     = {Lacoste, Alexandre and Eck, Douglas},
+  title      = {A Supervised Classification Algorithm For Note Onset Detection},
+  journal    = {EURASIP Journal on Applied Signal Processing},
+  volume     = {2007},
+  number     = {ID 43745},
+  year       = {2007},
+  pages      = {1--13},
+  source     = {OwnPublication},
+  sourcetype = {Journal},
+}
+
+@MASTERSTHESIS{Lajoie2009,
+    author = {Lajoie, Isabelle},
+  keywords = {apprentissage non-supervis{\'{e}}, architecture profonde, auto-encodeur d{\'{e}}bruiteur, machine de {Boltzmann} restreinte, r{\'{e}}seau de neurones artificiel},
+     title = {Apprentissage de repr{\'{e}}sentations sur-compl{\`{e}}tes par entra{\^{\i}}nement d'auto-encodeurs},
+      year = {2009},
+    school = {Universit{\'{e}} de Montr{\'{e}}al},
+  abstract = {Les avanc{\'{e}}s dans le domaine de l'intelligence artificielle, permettent {\`{a}} des syst{\`{e}}mes
+informatiques de r{\'{e}}soudre des t{\^{a}}ches de plus en plus complexes li{\'{e}}es par exemple {\`{a}}
+la vision, {\`{a}} la compr{\'{e}}hension de signaux sonores ou au traitement de la langue. Parmi
+les mod{\`{e}}les existants, on retrouve les R{\'{e}}seaux de Neurones Artificiels (RNA), dont la
+popularit{\'{e}} a fait un grand bond en avant avec la d{\'{e}}couverte de Hinton et al. [22], soit
+l'utilisation de Machines de {Boltzmann} Restreintes (RBM) pour un pr{\'{e}}-entra{\^{\i}}nement
+non-supervis{\'{e}} couche apr{\`{e}}s couche, facilitant grandement l'entra{\^{\i}}nement supervis{\'{e}} du
+r{\'{e}}seau {\`{a}} plusieurs couches cach{\'{e}}es (DBN), entra{\^{\i}}nement qui s'av{\'{e}}rait jusqu'alors tr{\`{e}}s
+difficile {\`{a}} r{\'{e}}ussir. Depuis cette d{\'{e}}couverte, des chercheurs ont {\'{e}}tudi{\'{e}} l'efficacit{\'{e}} de nouvelles strat{\'{e}}gies de pr{\'{e}}-entra{\^{\i}}nement, telles que l'empilement d'auto-encodeurs traditionnels (SAE) [5, 38], et l'empilement d'auto-encodeur d{\'{e}}bruiteur (SDAE) [44].
+    C'est dans ce contexte qu'a d{\'{e}}but{\'{e}} la pr{\'{e}}sente {\'{e}}tude. Apr{\`{e}}s un bref passage en revue des notions de base du domaine de l'apprentissage machine et des m{\'{e}}thodes de
+pr{\'{e}}-entra{\^{\i}}nement employ{\'{e}}es jusqu'{\`{a}} pr{\'{e}}sent avec les modules RBM, AE et DAE, nous
+avons approfondi notre compr{\'{e}}hension du pr{\'{e}}-entra{\^{\i}}nement de type SDAE, explor{\'{e}} ses
+diff{\'{e}}rentes propri{\'{e}}t{\'{e}}s et {\'{e}}tudi{\'{e}} des variantes de SDAE comme strat{\'{e}}gie d'initialisation
+d'architecture profonde. Nous avons ainsi pu, entre autres choses, mettre en lumi{\`{e}}re
+l'influence du niveau de bruit, du nombre de couches et du nombre d'unit{\'{e}}s cach{\'{e}}es
+sur l'erreur de g{\'{e}}n{\'{e}}ralisation du SDAE. Nous avons constat{\'{e}} une am{\'{e}}lioration de la
+performance sur la t{\^{a}}che supervis{\'{e}}e avec l'utilisation des bruits poivre et sel (PS) et
+gaussien (GS), bruits s'av{\'{e}}rant mieux justifi{\'{e}}s que celui utilis{\'{e}} jusqu'{\`{a}} pr{\'{e}}sent, soit le
+masque {\`{a}} z{\'{e}}ro (MN). De plus, nous avons d{\'{e}}montr{\'{e}} que la performance profitait d'une
+emphase impos{\'{e}}e sur la reconstruction des donn{\'{e}}es corrompues durant l'entra{\^{\i}}nement
+des diff{\'{e}}rents DAE. Nos travaux ont aussi permis de r{\'{e}}v{\'{e}}ler que le DAE {\'{e}}tait en mesure d'apprendre, sur des images naturelles, des filtres semblables {\`{a}} ceux retrouv{\'{e}}s dans
+les cellules V1 du cortex visuel, soit des filtres d{\'{e}}tecteurs de bordures. Nous aurons par
+ailleurs pu montrer que les repr{\'{e}}sentations apprises du SDAE, compos{\'{e}}es des caract{\'{e}}ristiques ainsi extraites, s'av{\'{e}}raient fort utiles {\`{a}} l'apprentissage d'une machine {\`{a}} vecteurs de
+support ({SVM}) lin{\'{e}}aire ou {\`{a}} noyau gaussien, am{\'{e}}liorant grandement sa performance de
+g{\'{e}}n{\'{e}}ralisation. Aussi, nous aurons observ{\'{e}} que similairement au DBN, et contrairement
+au SAE, le SDAE poss{\'{e}}dait une bonne capacit{\'{e}} en tant que mod{\`{e}}le g{\'{e}}n{\'{e}}rateur. Nous
+avons {\'{e}}galement ouvert la porte {\`{a}} de nouvelles strat{\'{e}}gies de pr{\'{e}}-entra{\^{\i}}nement et d{\'{e}}couvert le potentiel de l'une d'entre elles, soit l'empilement d'auto-encodeurs rebruiteurs
+(SRAE).}
+}
+
+@inproceedings{lamere+eck:ismir2007,
+  author     = {Lamere, Paul and Eck, Douglas},
+  editor     = {},
+  title      = {Using 3D Visualizations to Explore and Discover Music},
+  booktitle  = {{Proceedings of the 8th International Conference on Music Information Retrieval ({ISMIR} 2007)}},
+  year       = {2007},
+  publisher  = {},
+  source     = {OwnPublication},
+  sourcetype = {Conference},
+}
+
+@article{Larochelle+al-2010,
+  author  = {Larochelle, Hugo and Bengio, Yoshua and Turian, Joseph},
+  title   = {Tractable Multivariate Binary Density Estimation and the Restricted {Boltzmann} Forest},
+  journal = {Neural Computation},
+  year    = {2010},
+  note    = {To appear}
+}
+
+@INPROCEEDINGS{Larochelle+Bengio-2008,
+    author = {Larochelle, Hugo and Bengio, Yoshua},
+     title = {Classification using Discriminative Restricted {Boltzmann} Machines},
+      year = {2008},
+     pages = {536--543},
+  crossref = {ICML08-shorter},
+  abstract = {Recently, many applications for Restricted {Boltzmann} Machines (RBMs) have been developed for a large variety of learning problems. However, RBMs are usually used as feature extractors for another learning algorithm or to provide a good initialization
+for deep feed-forward neural network classifiers, and are not considered as a standalone solution to classification problems. In
+this paper, we argue that RBMs provide a self-contained framework for deriving competitive non-linear classifiers. We present an evaluation of different learning algorithms for
+RBMs which aim at introducing a discriminative component to RBM training and improve their performance as classifiers. This
+approach is simple in that RBMs are used directly to build a classifier, rather than as a stepping stone. Finally, we demonstrate how discriminative RBMs can also be successfully employed in a semi-supervised setting.}
+}
+
+@inproceedings{Larochelle-2009,
+  author    = {Larochelle, Hugo and Erhan, Dumitru and Vincent, Pascal},
+  title     = {Deep Learning using Robust Interdependent Codes},
+  booktitle = {Proceedings of the  Twelfth International Conference on Artificial Intelligence and Statistics (AISTATS 2009)},
+  year      = {2009},
+  pages     = {312--319},
+  date      = "April 16-18, 2009",
+}
+
+@article{Larochelle-jmlr-2009,
+  author   = {Larochelle, Hugo and Bengio, Yoshua and Louradour, Jerome and Lamblin, Pascal},
+  title    = {Exploring Strategies for Training Deep Neural Networks},
+  volume   = {10},
+  year     = {2009},
+  pages    = {1--40},
+  crossref = {JMLR-shorter},
+  abstract = {Deep multi-layer neural networks have many levels of non-linearities allowing them to compactly represent highly non-linear and highly-varying functions. However, until recently it was not clear how to train such deep networks, since gradient-based optimization starting from random initialization often appears to get stuck in poor solutions. Hinton et al. recently proposed a greedy layer-wise unsupervised learning procedure relying on the training algorithm of restricted {Boltzmann} machines (RBM) to initialize the parameters of a deep belief network (DBN), a generative model with many layers of hidden causal variables. This was followed by the proposal of another greedy layer-wise procedure, relying on the usage of autoassociator networks. In the context of the above optimization problem, we study these algorithms empirically to better understand their success. Our experiments confirm the hypothesis that the greedy layer-wise unsupervised training strategy helps the optimization by initializing weights in a region near a good local minimum, but also implicitly acts as a sort of regularization that brings better generalization and encourages internal distributed representations that are high-level abstractions of the input. We also present a series of experiments aimed at evaluating the link between the performance of deep neural networks and practical aspects of their topology, for example, demonstrating cases where the addition of more depth helps. Finally, we empirically explore simple variants of these training algorithms, such as the use of different RBM input unit distributions, a simple way of combining gradient estimators to improve performance, as well as on-line versions of those algorithms.}
+}
+
+@PHDTHESIS{Larochelle-PhD-2009,
+    author = {Larochelle, Hugo},
+  keywords = {apprentissage non-supervis{\'{e}}, architecture profonde, autoassociateur, autoencodeur, machine de {Boltzmann} restreinte, r{\'{e}}seau de neurones artificiel},
+     title = {{\'{E}}tude de techniques d'apprentissage non-supervis{\'{e}} pour l'am{\'{e}}lioration de l'entra{\^{\i}}nement supervis{\'{e}} de mod{\`{e}}les connexionnistes},
+      year = {2009},
+    school = {University of Montr{\'{e}}al},
+  abstract = {Le domaine de l'intelligence artificielle a pour objectif le d{\'{e}}veloppement de syst{\`{e}}mes informatiques capables de simuler des comportements normalement associ{\'{e}}s {\`{a}} l'intelligence humaine. On aimerait entre autres pouvoir construire une machine qui puisse
+r{\'{e}}soudre des t{\^{a}}ches li{\'{e}}es {\`{a}} la vision (e.g., la reconnaissance d'objet), au traitement de la langue (e.g., l'identification du sujet d'un texte) ou au traitement de signaux sonores (e.g., la reconnaissance de la parole).
+     Une approche d{\'{e}}velopp{\'{e}}e afin de r{\'{e}}soudre ce genre de t{\^{a}}ches est bas{\'{e}}e sur l'apprentissage automatique de mod{\`{e}}les {\`{a}} partir de donn{\'{e}}es {\'{e}}tiquet{\'{e}}es refl{\'{e}}tant le comportement intelligent {\`{a}} {\'{e}}muler. Entre autre, il a {\'{e}}t{\'{e}} propos{\'{e}} de mod{\'{e}}liser le calcul n{\'{e}}cessaire {\`{a}} la
+r{\'{e}}solution d'une t{\^{a}}che {\`{a}} l'aide d'un r{\'{e}}seau de neurones artificiel, dont il est possible d'adapter le comportement {\`{a}} l'aide de la r{\'{e}}tropropagation [99, 131] d'un gradient informatif sur les erreurs commises par le r{\'{e}}seau. Populaire durant les ann{\'{e}}es 80, cette
+approche sp{\'{e}}cifique a depuis perdu partiellement de son attrait, suite au d{\'{e}}veloppement des m{\'{e}}thodes {\`{a}} noyau. Celles-ci sont souvent plus stables, plus faciles {\`{a}} utiliser et leur performance est souvent au moins aussi {\'{e}}lev{\'{e}}e pour une vaste gamme de probl{\`{e}}mes.
+     Les m{\'{e}}thodes d'apprentissage automatique ont donc progress{\'{e}} dans leur fonctionnement, mais aussi dans la complexit{\'{e}} des probl{\`{e}}mes auxquels elles se sont attaqu{\'{e}}. Ainsi, plus r{\'{e}}cemment, des travaux [12, 15] ont commenc{\'{e}} {\`{a}} {\'{e}}mettre des doutes sur la capacit{\'{e}} des machines {\`{a}} noyau {\`{a}} pouvoir efficacement r{\'{e}}soudre des probl{\`{e}}mes de la complexit{\'{e}} requise par l'intelligence artificielle. Parall{\`{e}}lement, Hinton et al. [81] faisaient une perc{\'{e}}e dans l'apprentissage automatique de r{\'{e}}seaux de neurones, en proposant une proc{\'{e}}dure permettant l'entra{\^{\i}}nement de r{\'{e}}seaux de neurones d'une plus grande complexit{\'{e}} (i.e., avec plus de couches de neurones cach{\'{e}}es) qu'il n'{\'{e}}tait possible auparavant.
+     C'est dans ce contexte qu'ont {\'{e}}t{\'{e}} conduits les travaux de cette th{\`{e}}se. Cette th{\`{e}}se d{\'{e}}bute par une exposition des principes de base de l'apprentissage automatique (chapitre 1) et une discussion des obstacles {\`{a}} l'obtention d'un mod{\`{e}}le ayant une bonne performance
+de g{\'{e}}n{\'{e}}ralisation (chapitre 2). Puis, sont pr{\'{e}}sent{\'{e}}es les contributions apport{\'{e}}es dans le cadre de cinq articles, contributions qui sont toutes bas{\'{e}}es sur l'utilisation d'une certaine
+forme d'apprentissage non-supervis{\'{e}}.
+    Le premier article (chapitre 4) propose une m{\'{e}}thode d'entra{\^{\i}}nement pour un type sp{\'{e}}cifique de r{\'{e}}seau {\`{a}} une seule couche cach{\'{e}}e (la machine de {Boltzmann} restreinte) bas{\'{e}}e sur une combinaison des apprentissages supervis{\'{e}} et non-supervis{\'{e}}. Cette m{\'{e}}thode permet d'obtenir une meilleure performance de g{\'{e}}n{\'{e}}ralisation qu'un r{\'{e}}seau de neurones standard ou qu'une machine {\`{a}} vecteurs de support {\`{a}} noyau, et met en {\'{e}}vidence de fa{\c c}on
+explicite les b{\'{e}}n{\'{e}}fices qu'apporte l'apprentissage non-supervis{\'{e}} {\`{a}} l'entra{\^{\i}}nement d'un r{\'{e}}seau de neurones.
+    Ensuite, dans le second article (chapitre 6), on {\'{e}}tudie et {\'{e}}tend la proc{\'{e}}dure d'entra{\^{\i}}nement propos{\'{e}}e par Hinton et al. [81]. Plus sp{\'{e}}cifiquement, on y propose une approche diff{\'{e}}rente mais plus flexible pour initialiser un r{\'{e}}seau {\`{a}} plusieurs couches cach{\'{e}}es, bas{\'{e}}e sur un r{\'{e}}seau autoassociateur. On y explore aussi l'impact du nombre de couches et de neurones par couche sur la performance d'un r{\'{e}}seau et on y d{\'{e}}crit diff{\'{e}}rentes variantes mieux adapt{\'{e}}es {\`{a}} l'apprentissage en ligne ou pour donn{\'{e}}es {\`{a}} valeurs continues.
+    Dans le troisi{\`{e}}me article (chapitre 8), on explore plut{\^{o}}t la performance de r{\'{e}}seaux profonds sur plusieurs probl{\`{e}}mes de classification diff{\'{e}}rents. Les probl{\`{e}}mes choisis ont la propri{\'{e}}t{\'{e}} d'avoir {\'{e}}t{\'{e}} g{\'{e}}n{\'{e}}r{\'{e}}s {\`{a}} partir de plusieurs facteurs de variation. Cette propri{\'{e}}t{\'{e}}, qui caract{\'{e}}rise les probl{\`{e}}mes li{\'{e}}s {\`{a}} l'intelligence artificielle, pose difficult{\'{e}} aux machines {\`{a}} noyau, tel que confirm{\'{e}} par les exp{\'{e}}riences de cet article.
+    Le quatri{\`{e}}me article (chapitre 10) pr{\'{e}}sente une am{\'{e}}lioration de l'approche bas{\'{e}}e sur les r{\'{e}}seaux autoassociateurs. Cette am{\'{e}}lioration applique une modification simple {\`{a}} la proc{\'{e}}dure d'entra{\^{\i}}nement d'un r{\'{e}}seau autoassociateur, en {\guillemotleft}~bruitant~{\guillemotright} les entr{\'{e}}es du r{\'{e}}seau afin que celui-ci soit forc{\'{e}} {\`{a}} les d{\'{e}}bruiter.
+    Le cinqui{\`{e}}me et dernier article (chapitre 12) apporte une autre am{\'{e}}lioration aux r{\'{e}}seaux autoassociateurs, en permettant des interactions d'inhibition ou d'excitation entre les neurones cach{\'{e}}s de ces r{\'{e}}seaux. On y d{\'{e}}montre que de telles interactions peuvent
+{\^{e}}tre apprises et sont b{\'{e}}n{\'{e}}fiques {\`{a}} la performance d'un r{\'{e}}seau profond.}
+}
+
+@INPROCEEDINGS{Larochelle2008,
+     author = {Larochelle, Hugo and Erhan, Dumitru and Bengio, Yoshua},
+      title = {Zero-data Learning of New Tasks},
+  booktitle = {AAAI Conference on Artificial Intelligence},
+       year = {2008},
+        url = {http://www-etud.iro.umontreal.ca/~larocheh/publications/aaai2008_zero-data.pdf},
+internal-note = {NOTE(review): the abstract below is about discriminative RBM classifiers and never mentions zero-data learning or new tasks; it appears to have been attached to the wrong entry -- verify against the PDF at the url and replace},
+   abstract = {Recently, many applications for Restricted {Boltzmann} Machines (RBMs) have been developed for a large variety of learning problems. However, RBMs are usually used as feature extractors for another learning algorithm or to provide a good initialization for deep feed-forward neural network classifiers, and are not considered as a standalone solution to classification problems. In
+this paper, we argue that RBMs provide a self-contained framework for deriving competitive non-linear classifiers. We present an evaluation of different learning algorithms for
+RBMs which aim at introducing a discriminative component to RBM training and improve their performance as classifiers. This
+approach is simple in that RBMs are used directly to build a classifier, rather than as a stepping stone. Finally, we demonstrate how discriminative RBMs can also be successfully employed in a semi-supervised setting.}
+}
+
+@INPROCEEDINGS{LarochelleH2007,
+    author = {Larochelle, Hugo and Erhan, Dumitru and Courville, Aaron and Bergstra, James and Bengio, Yoshua},
+     title = {An Empirical Evaluation of Deep Architectures on Problems with Many Factors of Variation},
+      year = {2007},
+     pages = {473--480},
+  crossref = {ICML07-shorter},
+  abstract = {Recently, several learning algorithms relying on models with deep architectures have been proposed. Though they have demonstrated impressive performance, to date, they have only been evaluated on relatively simple problems such as digit  recognition in a controlled environment, for which many machine learning algorithms already report reasonable results. Here, we present a series of experiments which indicate that these models show promise in solving harder learning problems that exhibit many factors of variation. These models are compared with well-established algorithms such as Support Vector Machines and single hidden-layer feed-forward neural networks.}
+}
+
+@MASTERSTHESIS{Latendresse-MSc,
+    author = {Latendresse, Simon},
+     title = {L'utilisation d'hyper-param{\`{e}}tres pour la s{\'{e}}lection de variables},
+      year = {1999},
+    school = {Universit{\'{e}} de Montr{\'{e}}al, Dept. IRO},
+      note = {(in French)}
+}
+
+@MASTERSTHESIS{Lauzon99,
+    author = {Lauzon, Vincent-Philippe},
+     title = {Mod{\`{e}}les statistiques comme algorithmes d'apprentissage et {MMCC}s; pr{\'{e}}diction de s{\'{e}}ries financi{\`{e}}res},
+      year = {1999},
+    school = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+  crossref = {DIRO}
+}
+
+@INPROCEEDINGS{lecun-93,
+     author = {{LeCun}, Yann and Bengio, Yoshua and Henderson, Donnie and Weisbuch, A. and Weissman, H. and Jackel, L.},
+      title = {On-line handwriting recognition with neural networks: spatial representation versus temporal representation},
+  booktitle = {Proc. International Conference on handwriting and drawing},
+       year = {1993},
+  publisher = {Ecole Nationale Superieure des Telecommunications},
+        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/lecun-93.ps.gz},
+topics={PriorKnowledge,Speech},cat={C},
+}
+
+@INPROCEEDINGS{lecun-99,
+     author = {{LeCun}, Yann and Haffner, Patrick and Bottou, {L{\'{e}}on} and Bengio, Yoshua},
+     editor = {Forsyth, D.},
+      title = {Object Recognition with Gradient-Based Learning},
+  booktitle = {Shape, Contour and Grouping in Computer Vision},
+       year = {1999},
+      pages = {319--345},
+  publisher = {Springer},
+        url = {orig/lecun-99.ps.gz},
+topics={PriorKnowledge,Speech},cat={B},
+}
+
+@TECHREPORT{lecun-99b,
+       author = {{LeCun}, Yann and Haffner, Patrick and Bottou, {L{\'{e}}on} and Bengio, Yoshua},
+        title = {Gradient-Based Learning for Object Detection, Segmentation and Recognition},
+         year = {1999},
+  institution = {AT\&T Labs},
+          url = {orig/lecun-99b.ps.gz},
+topics={Speech},cat={T},
+}
+
+@INPROCEEDINGS{lecun-bengio-94,
+     author = {{LeCun}, Yann and Bengio, Yoshua},
+      title = {Word-level training of a handwritten word recognizer based on convolutional neural networks},
+  booktitle = {Proc. of the International Conference on Pattern Recognition},
+     volume = {II},
+       year = {1994},
+      pages = {88--92},
+  publisher = {IEEE},
+        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/icpr-word.ps},
+   abstract = {We introduce a new approach for on-line recognition of handwritten words written in unconstrained mixed style. Words are represented by low resolution ``annotated images'' where each pixel contains information about trajectory direction and curvature. The recognizer is a convolution network which can be spatially replicated. From the network output, a hidden {Markov} model produces word scores. The entire system is globally trained to minimize word-level errors.},
+topics={Speech},cat={C},
+}
+
+@INCOLLECTION{lecun-bengio-95a,
+     author = {{LeCun}, Yann and Bengio, Yoshua},
+     editor = {Arbib, M. A.},
+      title = {Convolutional Networks for Images, Speech, and Time-Series},
+  booktitle = {The Handbook of Brain Theory and Neural Networks},
+       year = {1995},
+      pages = {255--257},
+  publisher = {MIT Press},
+        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/handbook-convo.pdf},
+topics={PriorKnowledge,Speech},cat={C},
+}
+
+@INCOLLECTION{lecun-bengio-95b,
+     author = {{LeCun}, Yann and Bengio, Yoshua},
+     editor = {Arbib, M. A.},
+      title = {Pattern Recognition and Neural Networks},
+  booktitle = {The Handbook of Brain Theory and Neural Networks},
+       year = {1995},
+      pages = {711--714},
+  publisher = {MIT Press},
+        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/handbook-patrec.pdf},
+topics={PriorKnowledge,Speech},cat={B},
+}
+
+@ARTICLE{LeCun98,
+    author = {{LeCun}, Yann and Bottou, {L{\'{e}}on} and Bengio, Yoshua and Haffner, Patrick},
+     title = {Gradient-Based Learning Applied to Document Recognition},
+   journal = {Proceedings of the IEEE},
+    volume = {86},
+    number = {11},
+      year = {1998},
+     pages = {2278--2324},
+  abstract = {Multilayer Neural Networks trained with the backpropagation algorithm constitute the best example of a successful Gradient-Based Learning technique. Given an appropriate network architecture, Gradient-Based Learning algorithms can be used to synthesize a complex decision surface that can classify high-dimensional patterns such as handwritten characters, with minimal preprocessing. This paper reviews various methods applied to handwritten character recognition and compares them on a standard handwritten digit recognition task. Convolutional Neural Networks, that are specifically designed to deal with the variability of 2D shapes, are shown to outperform all other techniques. 
+Real-life document recognition systems are composed of multiple modules including field extraction, segmentation, recognition, and language modeling. A new learning paradigm, called Graph Transformer Networks (GTN), allows such multi-module systems to be trained globally using Gradient-Based methods so as to minimize an overall performance measure.
+Two systems for on-line handwriting recognition are described. Experiments demonstrate the advantage of global training, and the flexibility of Graph Transformer Networks. 
+A Graph Transformer Network for reading a bank check is also described. It uses Convolutional Neural Network character recognizers combined with global training techniques to provide record accuracy on business and personal checks. It is deployed commercially and reads several million checks per day.},
+topics={PriorKnowledge,Speech},cat={C},
+}
+
+@INPROCEEDINGS{Lecun_icassp97,
+     author = {{LeCun}, Yann and Bottou, {L{\'{e}}on} and Bengio, Yoshua},
+      title = {Reading Checks with graph transformer networks},
+  booktitle = {International Conference on Acoustics, Speech and Signal Processing},
+     volume = {1},
+       year = {1997},
+      pages = {151--154},
+        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/lecun-bottou-bengio-97.ps.gz},
+topics={Speech},cat={C},
+}
+
+@ARTICLE{LeRoux+Bengio-2010,
+    author = {Le Roux, Nicolas and Bengio, Yoshua},
+     title = {Deep Belief Networks are Compact Universal Approximators},
+   journal = {Neural Computation},
+      year = {2010},
+      note = {To appear}
+}
+
+@TECHREPORT{LeRoux-Bengio-2007-TR,
+       author = {Le Roux, Nicolas and Bengio, Yoshua},
+        title = {Representational Power of Restricted {B}oltzmann Machines and Deep Belief Networks},
+       number = {1294},
+         year = {2007},
+  institution = {D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+     abstract = {Deep Belief Networks (DBN) are generative neural network models with
+many layers of hidden explanatory factors, recently introduced by Hinton et al.,
+along with a greedy layer-wise unsupervised learning algorithm. The building
+block of a DBN is a probabilistic model called a Restricted {Boltzmann} Machine
+(RBM), used to represent one layer of the model. Restricted {Boltzmann} Machines
+are interesting because inference is easy in them, and because they have been
+successfully used as building blocks for training deeper models.  
+We first prove that adding hidden units yields strictly improved modeling
+power, while a second theorem shows that RBMs are universal approximators of
+discrete distributions.  We then study the question of whether DBNs with more
+layers are strictly more powerful in terms of representational power. This
+suggests a new and less greedy criterion for training RBMs within DBNs.}
+}
+
+@ARTICLE{LeRoux-Bengio-2008,
+    author = {Le Roux, Nicolas and Bengio, Yoshua},
+     title = {Representational Power of Restricted {B}oltzmann Machines and Deep Belief Networks},
+   journal = {Neural Computation},
+    volume = {20},
+    number = {6},
+      year = {2008},
+     pages = {1631--1649},
+  abstract = {Deep Belief Networks (DBN) are generative neural network models with many layers of hidden explanatory factors, recently introduced by Hinton et al., along with a greedy layer-wise unsupervised learning algorithm. The building block of a DBN is a probabilistic model called a Restricted {Boltzmann} Machine (RBM), used to represent one layer of the model. Restricted {Boltzmann} Machines are interesting because inference is easy in them, and because they have been successfully used as building blocks for training deeper models.  We first prove that adding hidden units yields strictly improved modelling power, while a second theorem shows that RBMs are universal approximators of discrete distributions.  We then study the question of whether DBNs with more layers are strictly more powerful in terms of representational power. This suggests a new and less greedy criterion for training RBMs within DBNs.}
+}
+
+@INPROCEEDINGS{LeRoux-continuous,
+     author = {Le Roux, Nicolas and Bengio, Yoshua},
+      title = {Continuous Neural Networks},
+  booktitle = {Proceedings of the Eleventh International Conference on Artificial Intelligence and Statistics (AISTATS'07)},
+       year = {2007},
+  publisher = {Omnipress},
+   abstract = {This article extends neural networks to the case of an uncountable number of hidden units, in several ways. In the first approach proposed, a finite parametrization is possible, allowing gradient-based learning. While having the same number of parameters as an ordinary neural network, its internal structure suggests that it can represent some smooth functions much more compactly. Under mild assumptions, we also find better error bounds than with ordinary neural networks. Furthermore, this parametrization may help reducing the problem of saturation of the neurons. In a second approach, the input-to-hidden weights are fully non-parametric, yielding a kernel machine for which we demonstrate a simple kernel formula. Interestingly, the resulting kernel machine can be made hyperparameter-free and still generalizes in spite of an absence of explicit regularization.}
+}
+
+@PHDTHESIS{LeRoux-PhD-2008,
+    author = {Le Roux, Nicolas},
+     title = {Avanc{\'{e}}es th{\'{e}}oriques sur la repr{\'{e}}sentation et l'optimisation des r{\'{e}}seaux de neurones},
+      year = {2008},
+    school = {Universit{\'{e}} de Montr{\'{e}}al},
+  abstract = {Les r{\'{e}}seaux de neurones artificiels ont {\'{e}}t{\'{e}} abondamment utilis{\'{e}}s dans la communaut{\'{e}} de l'apprentissage machine depuis les ann{\'{e}}es 80. Bien qu'ils aient {\'{e}}t{\'{e}} {\'{e}}tudi{\'{e}}s pour la premi{\`{e}}re fois il y a cinquante ans par Rosenblatt [68], ils ne furent r{\'{e}}ellement populaires qu'apr{\`{e}}s l'apparition de la r{\'{e}}tropropagation du gradient, en 1986 [71].
+En 1989, il a {\'{e}}t{\'{e}} prouv{\'{e}} [44] qu'une classe sp{\'{e}}cifique de r{\'{e}}seaux de neurones (les r{\'{e}}seaux de neurones {\`{a}} une couche cach{\'{e}}e) {\'{e}}tait suffisamment puissante pour pouvoir approximer presque n'importe quelle fonction avec une pr{\'{e}}cision arbitraire : le th{\'{e}}or{\`{e}}me d'approximation universelle. Toutefois, bien que ce th{\'{e}}or{\`{e}}me e{\^{u}}t pour cons{\'{e}}quence un int{\'{e}}r{\^{e}}t accru pour les r{\'{e}}seaux de neurones, il semblerait qu'aucun effort n'ait {\'{e}}t{\'{e}} fait pour profiter de cette propri{\'{e}}t{\'{e}}.
+En outre, l'optimisation des r{\'{e}}seaux de neurones {\`{a}} une couche cach{\'{e}}e n'est pas convexe. Cela a d{\'{e}}tourn{\'{e}} une grande partie de la communaut{\'{e}} vers d'autres algorithmes, comme par exemple les machines {\`{a}} noyau (machines {\`{a}} vecteurs de support et r{\'{e}}gression
+{\`{a}} noyau, entre autres).
+La premi{\`{e}}re partie de cette th{\`{e}}se pr{\'{e}}sentera les concepts d'apprentissage machine g{\'{e}}n{\'{e}}raux n{\'{e}}cessaires {\`{a}} la compr{\'{e}}hension des algorithmes utilis{\'{e}}s. La deuxi{\`{e}}me partie se focalisera plus sp{\'{e}}cifiquement sur les m{\'{e}}thodes {\`{a}} noyau et les r{\'{e}}seaux de neurones. La troisi{\`{e}}me partie de ce travail visera ensuite {\`{a}} {\'{e}}tudier les limitations des machines {\`{a}} noyaux et {\`{a}} comprendre les raisons pour lesquelles elles sont inadapt{\'{e}}es {\`{a}} certains probl{\`{e}}mes que nous avons {\`{a}} traiter.
+La quatri{\`{e}}me partie pr{\'{e}}sente une technique permettant d'optimiser les r{\'{e}}seaux de neurones {\`{a}} une couche cach{\'{e}}e de mani{\`{e}}re convexe. Bien que cette technique s'av{\`{e}}re difficilement exploitable pour des probl{\`{e}}mes de grande taille, une version approch{\'{e}}e permet d'obtenir une bonne solution dans un temps raisonnable.
+La cinqui{\`{e}}me partie se concentre sur les r{\'{e}}seaux de neurones {\`{a}} une couche cach{\'{e}}e infinie. Cela leur permet th{\'{e}}oriquement d'exploiter la propri{\'{e}}t{\'{e}} d'approximation universelle et ainsi d'approcher facilement une plus grande classe de fonctions.
+Toutefois, si ces deux variations sur les r{\'{e}}seaux de neurones {\`{a}} une couche cach{\'{e}}e leur conf{\`{e}}rent des propri{\'{e}}t{\'{e}}s int{\'{e}}ressantes, ces derniers ne peuvent extraire plus que des concepts de bas niveau. Les m{\'{e}}thodes {\`{a}} noyau souffrant des m{\^{e}}mes limites, aucun de
+ces deux types d'algorithmes ne peut appr{\'{e}}hender des probl{\`{e}}mes faisant appel {\`{a}} l'apprentissage de concepts de haut niveau.
+R{\'{e}}cemment sont apparus les Deep Belief Networks [39] qui sont des r{\'{e}}seaux de neurones {\`{a}} plusieurs couches cach{\'{e}}es entra{\^{\i}}n{\'{e}}s de mani{\`{e}}re efficace. Cette profondeur leur permet d'extraire des concepts de haut niveau et donc de r{\'{e}}aliser des t{\^{a}}ches hors
+de port{\'{e}}e des algorithmes conventionnels. La sixi{\`{e}}me partie {\'{e}}tudie des propri{\'{e}}t{\'{e}}s de ces r{\'{e}}seaux profonds.
+Les probl{\`{e}}mes que l'on rencontre actuellement n{\'{e}}cessitent non seulement des algorithmes capables d'extraire des concepts de haut niveau, mais {\'{e}}galement des m{\'{e}}thodes d'optimisation capables de traiter l'immense quantit{\'{e}} de donn{\'{e}}es parfois disponibles, si possible en temps r{\'{e}}el. La septi{\`{e}}me partie est donc la pr{\'{e}}sentation d'une nouvelle technique permettant une optimisation plus rapide.}
+}
+
+@ARTICLE{lheureux-04,
+    author = {{L'Heureux}, Pierre-Jean and Carreau, Julie and Bengio, Yoshua and Delalleau, Olivier and Yue, Shi Yi},
+     title = {Locally Linear Embedding for dimensionality reduction in {QSAR}},
+   journal = {Journal of Computer-Aided Molecular Design},
+    volume = {18},
+      year = {2004},
+     pages = {475--482},
+  abstract = {Current practice in Quantitative Structure Activity Relationship (QSAR) methods usually involves generating a great number of chemical descriptors and then cutting them back with variable selection techniques. Variable selection is an effective method to reduce the dimensionality but may discard some valuable information. This paper introduces Locally Linear Embedding ({LLE}), a local non-linear dimensionality reduction technique, that can statistically discover a low-dimensional representation of the chemical data. {LLE} is shown to create more stable representations than other non-linear dimensionality
+reduction algorithms, and to be capable of capturing non-linearity in chemical data.},
+topics={Bioinformatic},cat={J},
+}
+
+@TECHREPORT{lm-TR00,
+       author = {Bengio, Yoshua and Ducharme, R{\'{e}}jean and Vincent, Pascal},
+        title = {A Neural Probabilistic Language Model},
+       number = {1178},
+         year = {2000},
+  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TR1178.pdf},
+     abstract = {A goal of statistical language modeling is to learn the joint probability function of sequences of words in a language. This is intrinsically difficult because of the curse of dimensionality: a word sequence on which the model will be tested is likely to be different from all the word sequences seen during training. Traditional but very successful approaches based on n-grams obtain generalization by concatenating very short overlapping sequences seen in the training set. We propose to fight the curse of dimensionality by learning a distributed representation for words which allows each training sentence to inform the model about an exponential number of semantically neighboring sentences. The model learns simultaneously (1) a distributed representation for each word along with (2) the probability function for word sequences, expressed in terms of these representations. Generalization is obtained because a sequence of words that has never been seen before gets high probability if it is made of words that are similar (in the sense of having a nearby representation) to words forming an already seen sentence. Training such large models (with millions of parameters) within a reasonable time is itself a significant challenge. We report on experiments using neural networks for the probability function, showing on two text corpora that the proposed approach very significantly improves on a state-of-the-art trigram model, and that the proposed approach allows to take advantage of much longer contexts.},
+topics={Markov,Unsupervised,Language},cat={T},
+}
+
+@INPROCEEDINGS{Maillet+al-2009,
+     author = {Maillet, Fran{\c c}ois and Eck, Douglas and Desjardins, Guillaume and Lamere, Paul},
+      title = {Steerable Playlist Generation by Learning Song Similarity from Radio Station Playlists},
+  booktitle = {Proceedings of the 10th International Conference on Music Information Retrieval},
+       year = {2009},
+        url = {http://www-etud.iro.umontreal.ca/~mailletf/papers/ismir09-playlist.pdf},
+   abstract = {This paper presents an approach to generating steerable playlists. We first demonstrate a method for learning song transition probabilities from audio features extracted from songs played in professional radio station playlists. We then show that by using this learnt similarity function as a prior, we are able to generate steerable playlists by choosing the next song to play not simply based on that prior, but on a tag cloud that the user is able to manipulate to express the high-level characteristics of the music he wishes to listen to.}
+}
+
+@INPROCEEDINGS{manzagol+bertinmahieux+eck:ismir2008,
+     author = {Manzagol, Pierre-Antoine and Bertin-Mahieux, Thierry and Eck, Douglas},
+      title = {On the Use of Sparse Time-Relative Auditory Codes for Music},
+  booktitle = {{Proceedings of the 9th International Conference on Music Information Retrieval ({ISMIR} 2008)}},
+       year = {2008},
+   abstract = {Many if not most audio features used in MIR research are inspired by work done in speech recognition and are variations on the spectrogram. Recently, much attention has been given to new representations of audio that are sparse and time-relative. These representations are efficient and able to avoid the time-frequency trade-off of a spectrogram. Yet little work with music streams has been conducted and these features remain mostly unused in the MIR community. In this paper we further explore the use of these features for musical signals. In particular, we investigate their use on realistic music examples (i.e. released commercial music) and their use as input features for supervised learning. Furthermore, we identify three specific issues related to these features which will need to be further addressed in order to obtain the full benefit for MIR applications.},
+source={OwnPublication},
+sourcetype={Conference},
+}
+
+@MASTERSTHESIS{Manzagol-Msc-2007,
+    author = {Manzagol, Pierre-Antoine},
+  keywords = {Algorithme d'apprentissage, m{\'{e}}thode de second ordre, gradient naturel, approximation stochastique},
+     title = {TONGA - Un algorithme de gradient naturel pour les probl{\`{e}}mes de grande taille},
+      year = {2007},
+    school = {Universit{\'{e}} de Montr{\'{e}}al},
+  abstract = {Les syst{\`{e}}mes adaptatifs sont confront{\'{e}}s {\`{a}} des donn{\'{e}}es qui {\'{e}}voluent rapidement en quantit{\'{e}} et en complexit{\'{e}}. Les avanc{\'{e}}es mat{\'{e}}rielles de l'informatique ne suffisent pas {\`{a}} compenser cet essor. Une mise {\`{a}} l'{\'{e}}chelle des techniques d'apprentissage est n{\'{e}}cessaire. D'une part, les mod{\`{e}}les doivent gagner en capacit{\'{e}} de repr{\'{e}}sentation. De l'autre, les algorithmes d'apprentissage doivent devenir plus efficaces.
+    Nos travaux se situent dans ce contexte des probl{\`{e}}mes de grande taille et portent sur l'am{\'{e}}lioration des algorithmes d'apprentissage. Deux {\'{e}}l{\'{e}}ments de r{\'{e}}ponse sont d{\'{e}}j{\`{a}} connus. Il s'agit des m{\'{e}}thodes de second ordre et de l'approximation stochastique. Or, les m{\'{e}}thodes de second ordre poss{\`{e}}dent des complexit{\'{e}}s en calculs et en m{\'{e}}moire qui sont prohibitives dans le cadre des probl{\`{e}}mes de grande taille. {\'{E}}galement, il est notoirement difficile de concilier ces m{\'{e}}thodes avec l'approximation stochastique. TONGA est un algorithme d'apprentissage con{\c c}u pour faire face {\`{a}} ces difficult{\'{e}}s. Il s'agit d'une implantation stochastique et adapt{\'{e}}e aux probl{\`{e}}mes de grande taille d'une m{\'{e}}thode de second ordre, le gradient naturel. Dans ce m{\'{e}}moire, nous examinons de pr{\`{e}}s ce nouvel algorithme d'apprentissage en le comparant sur plusieurs probl{\`{e}}mes au gradient stochastique, la technique d'optimisation commun{\'{e}}ment utilis{\'{e}}e dans le cadre des probl{\`{e}}mes de grande taille. Nos exp{\'{e}}riences montrent que TONGA est au moins tout aussi efficace que le gradient stochastique, ce qui est un accomplissement en soi. Dans certains cas, TONGA offre une convergence nettement sup{\'{e}}rieure {\`{a}} celle du gradient stochastique.}
+}
+
+@INPROCEEDINGS{matic-94,
+     author = {Matic, N. and Henderson, Donnie and {LeCun}, Yann and Bengio, Yoshua},
+      title = {Pen-based visitor registration system (PENGUIN)},
+  booktitle = {Conference Record of the Twenty-Eighth Asilomar Conference on Signals, Systems and Computers},
+       year = {1994},
+  publisher = {IEEE},
+        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/matic-94.tiff},
+   abstract = {We describe a new electronic pen-based visitors registration system (PENGUIN) whose goal is to expand and modernize the visitor sign-in procedure at Bell Laboratories. The system uses a pen-interface (i.e. tablet-display) in what is essentially a form filling application. Our pen-interface is coupled with a powerful and accurate on-line handwriting recognition module. A database of AT\&T employees (the visitors' hosts) and country names is used to check the recognition module outputs, in order to find the best match. The system provides assistance to the guard at one of the guard stations in routing visitors to their hosts. All the entered data are stored electronically. Initial testing shows that PENGUIN system performs reliably and with high accuracy. It retrieves the correct host name with 97\% accuracy and the correct visitors citizenship with 99\% accuracy. The system is robust and easy to use for both visitors and guards.},
+topics={Speech},cat={C},
+}
+
+@UNPUBLISHED{mirex2005artist,
+    author = {Bergstra, James and Casagrande, Norman and Eck, Douglas},
+     title = {Artist Recognition: A Timbre- and Rhythm-Based Multiresolution Approach},
+      year = {2005},
+      note = {{MIREX} artist recognition contest},
+source={OwnPublication},
+sourcetype={Other},
+}
+
+@UNPUBLISHED{mirex2005genre,
+    author = {Bergstra, James and Casagrande, Norman and Eck, Douglas},
+     title = {Genre Classification: Timbre- and Rhythm-Based Multiresolution Audio Classification},
+      year = {2005},
+      note = {{MIREX} genre classification contest},
+source={OwnPublication},
+sourcetype={Other},
+}
+
+@UNPUBLISHED{mirex2005note,
+    author = {Lacoste, Alexandre and Eck, Douglas},
+     title = {Onset Detection with Artificial Neural Networks},
+      year = {2005},
+      note = {{MIREX} note onset detection contest},
+source={OwnPublication},
+sourcetype={Other},
+}
+
+@UNPUBLISHED{mirex2005tempo,
+    author = {Eck, Douglas and Casagrande, Norman},
+     title = {A Tempo-Extraction Algorithm Using an Autocorrelation Phase Matrix and Shannon Entropy},
+      year = {2005},
+      note = {{MIREX} tempo extraction contest (www.music-ir.org/\-evaluation/\-mirex-results)},
+source={OwnPublication},
+sourcetype={Other},
+}
+
+@INPROCEEDINGS{mitacs-insurance01,
+     author = {Bengio, Yoshua and Chapados, Nicolas and Dugas, Charles and Ghosn, Joumana and Takeuchi, Ichiro and Vincent, Pascal},
+      title = {High-Dimensional Data Inference for Automobile Insurance Premia Estimation},
+  booktitle = {Presented at the 2001 MITACS Annual Meeting},
+       year = {2001},
+        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/mitacs_insurance.ps},
+topics={HighDimensional,Mining},cat={C},
+}
+
+@INPROCEEDINGS{Morin+al-2005,
+     author = {Morin, Frederic and Bengio, Yoshua},
+     editor = {Cowell, Robert G. and Ghahramani, Zoubin},
+      title = {Hierarchical Probabilistic Neural Network Language Model},
+  booktitle = {Proceedings of the Tenth International Workshop on Artificial Intelligence and Statistics},
+       year = {2005},
+      pages = {246--252},
+  publisher = {Society for Artificial Intelligence and Statistics},
+        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/hierarchical-nnlm-aistats05.pdf},
+   abstract = {In recent years, variants of a neural network architecture for statistical language modeling have been proposed and successfully applied, e.g. in the language modeling component of speech recognizers. The main advantage of these architectures is that they learn an embedding for words (or other symbols) in a continuous space that helps to smooth the language model and provide good generalization even when the number of training examples is insufficient. However, these models are extremely slow in comparison to the more commonly used n-gram models, both for training and recognition. As an alternative to an importance sampling method proposed to speed-up training, we introduce a hierarchical decomposition of the conditional probabilities that yields a speed-up of about 200 both during training and recognition. The hierarchical decomposition is a binary hierarchical clustering constrained by the prior knowledge extracted from the WordNet semantic hierarchy.},
+topics={Language},cat={C},
+}
+
+@TECHREPORT{Nadeau-inference-TR99,
+       author = {Nadeau, Claude and Bengio, Yoshua},
+        title = {Inference and the Generalization Error},
+       number = {99s-45},
+         year = {1999},
+  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/techrep.pdf},
+     abstract = {We perform a theoretical investigation of the variance of the cross-validation estimate of the generalization error that takes into account the variability due to the choice of training sets and test examples. This allows us to propose two new estimators of this variance. We show, via simulations, that these new statistics perform well relative to the statistics considered in (Dietterich, 1998). In particular, tests of hypothesis based on these don't tend to be too liberal like other tests currently available, and have good power.},
+topics={Comparative},cat={T},
+}
+
+@INPROCEEDINGS{nadeau:2000:nips,
+    author = {Nadeau, Claude and Bengio, Yoshua},
+     title = {Inference for the Generalization Error},
+      year = {2000},
+     pages = {307--313},
+  crossref = {NIPS12-shorter},
+  abstract = {In order to compare learning algorithms, experimental results reported in the machine learning literature often use statistical tests of significance. Unfortunately, most of these tests do not take into account the variability due to the choice of training set. We perform a theoretical investigation of the variance of the cross-validation estimate of the generalization error that takes into account the variability due to the choice of training sets. This allows us to propose two new ways to estimate this variance. We show, via simulations, that these new statistics perform well relative to the statistics considered by Dietterich (Dietterich, 1998).},
+topics={Comparative},cat={C},
+}
+
+@ARTICLE{nadeau:2001,
+    author = {Nadeau, Claude and Bengio, Yoshua},
+     title = {Inference for the Generalization Error},
+   journal = {Machine Learning},
+      year = {2001},
+  abstract = {In order to compare learning algorithms, experimental results reported in the machine learning literature often use statistical tests of significance to support the claim that a new learning algorithm generalizes better. Such tests should take into account the variability due to the choice of training set and not only that due to the test examples, as is often the case. This could lead to gross underestimation of the variance of the cross-validation estimator, and to the wrong conclusion that the new algorithm is significantly better when it is not. We perform a theoretical investigation of the variance of a cross-validation estimator of the generalization error that takes into account the variability due to the randomness of the training set as well as test examples. Our analysis shows that all the variance estimators that are based only on the results of the cross-validation experiment must be biased. This analysis allows us to propose new estimators of this variance. We show, via simulations, that tests of hypothesis about the generalization error using those new variance estimators have better properties than tests involving variance estimators currently in use and listed in (Dietterich, 1998). In particular, the new tests have correct size and good power. That is, the new tests do not reject the null hypothesis too often when the hypothesis is true, but they tend to frequently reject the null hypothesis when  the latter is false.},
+topics={Comparative},cat={J},
+}
+
+@ARTICLE{NC06,
+    author = {Bengio, Yoshua and Monperrus, Martin and Larochelle, Hugo},
+     title = {Nonlocal Estimation of Manifold Structure},
+   journal = {Neural Computation},
+    volume = {18},
+      year = {2006},
+     pages = {2509--2528},
+  abstract = {We claim and present arguments to the effect that a large class of manifold
+learning algorithms that are essentially local and can be framed as
+kernel learning algorithms will suffer from the curse of dimensionality, at
+the dimension of the true underlying manifold. This observation suggests
+to explore non-local manifold learning algorithms which attempt to discover
+shared structure in the tangent planes at different positions. A criterion for
+such an algorithm is proposed and experiments estimating a tangent plane
+prediction function are presented, showing its advantages with respect to
+local manifold learning algorithms: it is able to generalize very far from
+training data (on learning handwritten character image rotations), where a
+local non-parametric method fails.},
+topics={HighDimensional,Kernel,Unsupervised},cat={J},
+}
+
+@INPROCEEDINGS{NIPS1-short,
+     editor = {Touretzky, D. S.},
+      title = {Advances in Neural Information Processing Systems 1 (NIPS'88)},
+  booktitle = {NIPS 1},
+       year = {-1},
+  publisher = {Morgan Kaufmann}
+}
+
+
+@INPROCEEDINGS{NIPS10-short,
+     editor = {Jordan, M.I. and Kearns, M.J. and Solla, S.A.},
+      title = {Advances in Neural Information Processing Systems 10 (NIPS'97)},
+  booktitle = {NIPS 10},
+       year = {-1},
+  publisher = {MIT Press}
+}
+
+
+@INPROCEEDINGS{NIPS11,
+     editor = {Kearns, M.J. and Solla, S.A.},
+      title = {Advances in Neural Information Processing Systems 11 (NIPS'98)},
+  booktitle = {Advances in Neural Information Processing Systems 11 (NIPS'98)},
+       year = {-1},
+  publisher = {MIT Press}
+}
+
+@INPROCEEDINGS{NIPS11-short,
+     editor = {Kearns, M.J. and Solla, S.A.},
+      title = {Advances in Neural Information Processing Systems 11 (NIPS'98)},
+  booktitle = {NIPS 11},
+       year = {-1},
+  publisher = {MIT Press}
+}
+
+
+@INPROCEEDINGS{NIPS12-short,
+     editor = {Solla, S.A. and Leen, T. K.},
+      title = {Advances in Neural Information Processing Systems 12 (NIPS'99)},
+  booktitle = {NIPS 12},
+       year = {-1},
+  publisher = {MIT Press}
+}
+
+
+@INPROCEEDINGS{NIPS13-short,
+     editor = {Leen, T. K. and Dietterich, T.G.},
+      title = {Advances in Neural Information Processing Systems 13 (NIPS'00)},
+  booktitle = {NIPS 13},
+       year = {-1},
+  publisher = {MIT Press}
+}
+
+
+@INPROCEEDINGS{NIPS14,
+     editor = {Dietterich, T.G. and Becker, S. and Ghahramani, Zoubin},
+      title = {Advances in Neural Information Processing Systems 14 (NIPS'01)},
+  booktitle = {Advances in Neural Information Processing Systems 14 (NIPS'01)},
+       year = {-1},
+  publisher = {MIT Press}
+}
+
+@INPROCEEDINGS{NIPS14-short,
+     editor = {Dietterich, T.G. and Becker, S. and Ghahramani, Zoubin},
+      title = {Advances in Neural Information Processing Systems 14 (NIPS'01)},
+  booktitle = {NIPS 14},
+       year = {-1},
+  publisher = {MIT Press}
+}
+
+
+@INPROCEEDINGS{NIPS15-short,
+     editor = {Becker, S. and Thrun, Sebastian},
+      title = {Advances in Neural Information Processing Systems 15 (NIPS'02)},
+  booktitle = {NIPS 15},
+       year = {-1},
+  publisher = {MIT Press}
+}
+
+
+@INPROCEEDINGS{NIPS16-short,
+     editor = {Becker, S. and Saul, L. and {Sch{\"{o}}lkopf}, Bernhard},
+      title = {Advances in Neural Information Processing Systems 16 (NIPS'03)},
+  booktitle = {NIPS 16},
+       year = {-1}
+}
+
+
+@INPROCEEDINGS{NIPS17-short,
+     editor = {Saul, Lawrence K. and Weiss, Yair and Bottou, {L{\'{e}}on}},
+      title = {Advances in Neural Information Processing Systems 17 (NIPS'04)},
+  booktitle = {NIPS 17},
+       year = {-1}
+}
+
+
+@INPROCEEDINGS{NIPS18-short,
+     editor = {Weiss, Yair and {Sch{\"{o}}lkopf}, Bernhard and Platt, John},
+      title = {Advances in Neural Information Processing Systems 18 (NIPS'05)},
+  booktitle = {NIPS 18},
+       year = {-1},
+  publisher = {MIT Press}
+}
+
+
+@INPROCEEDINGS{NIPS19-short,
+     editor = {{Sch{\"{o}}lkopf}, Bernhard and Platt, John and Hoffman, Thomas},
+      title = {Advances in Neural Information Processing Systems 19 (NIPS'06)},
+  booktitle = {NIPS 19},
+       year = {-1},
+  publisher = {MIT Press}
+}
+
+
+@INPROCEEDINGS{NIPS2-short,
+     editor = {Touretzky, D. S.},
+      title = {Advances in Neural Information Processing Systems 2 (NIPS'89)},
+  booktitle = {NIPS 2},
+       year = {-1},
+  publisher = {Morgan Kaufmann}
+}
+
+
+@INPROCEEDINGS{NIPS20-short,
+     editor = {Platt, John and Koller, D. and Singer, Yoram and Roweis, S.},
+      title = {Advances in Neural Information Processing Systems 20 (NIPS'07)},
+  booktitle = {NIPS 20},
+       year = {-1},
+  publisher = {MIT Press}
+}
+
+
+@INPROCEEDINGS{NIPS2003_AA65,
+     author = {Bengio, Yoshua and Grandvalet, Yves},
+   keywords = {cross validation, error bars, generalization error inference, k-fold cross-validation, model selection, statistical comparison of algorithms, variance estimate},
+      title = {No Unbiased Estimator of the Variance of K-Fold Cross-Validation},
+       year = {2004},
+  publisher = {MIT Press},
+        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/var-kfold-part1-nips.pdf},
+   crossref = {NIPS16},
+   abstract = {Most machine learning researchers perform quantitative experiments to estimate generalization error and compare algorithm performances. In order to draw statistically convincing conclusions, it is important to estimate the uncertainty of such estimates. This paper studies the estimation of uncertainty around the K-fold cross-validation estimator. The main theorem shows that there exists no universal unbiased estimator of the variance of K-fold cross-validation. An analysis based on the eigendecomposition of the covariance matrix of errors helps to better understand the nature of the problem and shows that naive estimators may grossly underestimate variance, as confirmed by numerical experiments.},
+topics={Comparative},cat={C},
+}
+
+@INCOLLECTION{NIPS2005_424,
+    author = {Bengio, Yoshua and Delalleau, Olivier and Le Roux, Nicolas},
+     title = {The Curse of Highly Variable Functions for Local Kernel Machines},
+      year = {2006},
+     pages = {107--114},
+  crossref = {NIPS18-shorter},
+  abstract = {We present a series of theoretical arguments supporting the claim that a
+large class of modern learning algorithms that rely solely on the smoothness
+prior -- with similarity between examples expressed with a local
+kernel -- are sensitive to the curse of dimensionality, or more precisely
+to the variability of the target. Our discussion covers supervised, semisupervised
+and unsupervised learning algorithms. These algorithms are
+found to be local in the sense that crucial properties of the learned function
+at x depend mostly on the neighbors of x in the training set. This
+makes them sensitive to the curse of dimensionality, well studied for
+classical non-parametric statistical learning. We show in the case of the
+Gaussian kernel that when the function to be learned has many variations,
+these algorithms require a number of training examples proportional to
+the number of variations, which could be large even though there may exist
+short descriptions of the target function, i.e. their Kolmogorov complexity
+may be low. This suggests that there exist non-local learning
+algorithms that at least have the potential to learn about such structured
+but apparently complex functions (because locally they have many variations),
+while not using very specific prior domain knowledge.},
+topics={HighDimensional,Kernel,Unsupervised},cat={C},
+}
+
+@INPROCEEDINGS{NIPS2005_456,
+    author = {K{\'{e}}gl, Bal{\'{a}}zs and Wang, Ligen},
+     title = {Boosting on Manifolds: Adaptive Regularization of Base Classifiers},
+      year = {2005},
+     pages = {665--672},
+  crossref = {NIPS17-shorter},
+  abstract = {In this paper we propose to combine two powerful ideas, boosting and manifold learning. On the one hand, we improve ADABOOST by incorporating knowledge on the structure of the data into base classifier design and selection. On the other hand, we use ADABOOST's efficient learning mechanism to significantly improve supervised and semi-supervised algorithms proposed in the context of manifold learning. Beside the specific manifold-based penalization, the resulting algorithm also accommodates the boosting of a large family of regularized learning algorithms.},
+topics={Boosting},cat={C},
+}
+
+@INCOLLECTION{NIPS2005_519,
+    author = {Grandvalet, Yves and Bengio, Yoshua},
+     title = {Semi-supervised Learning by Entropy Minimization},
+      year = {2005},
+     pages = {529--536},
+  crossref = {NIPS17-shorter},
+  abstract = {We consider the semi-supervised learning problem, where a decision rule is to be learned from labeled and unlabeled data. In this framework, we motivate minimum entropy regularization, which enables to incorporate unlabeled data in the standard supervised learning. Our approach includes other approaches to the semi-supervised problem as particular or limiting cases. A series of experiments illustrates that the proposed solution benefits from unlabeled data. The method challenges mixture models when the data are sampled from the distribution class spanned by the generative model. The performances are definitely in favor of minimum entropy regularization when generative models are misspecified, and the weighting of unlabeled data provides robustness to the violation of the ``cluster assumption''. Finally, we also illustrate that the method can also be far superior to manifold learning in high dimension spaces.},
+topics={Unsupervised},cat={C},
+}
+
+@INPROCEEDINGS{NIPS2005_539,
+    author = {Bengio, Yoshua and Larochelle, Hugo and Vincent, Pascal},
+     title = {Non-Local Manifold Parzen Windows},
+      year = {2006},
+  crossref = {NIPS18-shorter},
+  abstract = {To escape from the curse of dimensionality, we claim that one can learn
+non-local functions, in the sense that the value and shape of the learned
+function at x must be inferred using examples that may be far from x.
+With this objective, we present a non-local non-parametric density estimator.
+It builds upon previously proposed Gaussian mixture models with
+regularized covariance matrices to take into account the local shape of
+the manifold. It also builds upon recent work on non-local estimators of
+the tangent plane of a manifold, which are able to generalize in places
+with little training data, unlike traditional, local, non-parametric models.},
+topics={HighDimensional,Kernel,Unsupervised},cat={C},
+}
+
+@INPROCEEDINGS{NIPS2005_583,
+    author = {Bengio, Yoshua and Le Roux, Nicolas and Vincent, Pascal and Delalleau, Olivier and Marcotte, Patrice},
+     title = {Convex Neural Networks},
+      year = {2006},
+     pages = {123--130},
+  crossref = {NIPS18-shorter},
+  abstract = {Convexity has recently received a lot of attention in the machine learning
+community, and the lack of convexity has been seen as a major disadvantage
+of many learning algorithms, such as multi-layer artificial neural
+networks. We show that training multi-layer neural networks in which the
+number of hidden units is learned can be viewed as a convex optimization
+problem. This problem involves an infinite number of variables, but can be
+solved by incrementally inserting a hidden unit at a time, each time finding
+a linear classifier that minimizes a weighted sum of errors.},
+topics={Boosting},cat={C},
+}
+
+@INPROCEEDINGS{NIPS2005_663,
+    author = {Rivest, Fran{\c c}ois and Bengio, Yoshua and Kalaska, John},
+     title = {Brain Inspired Reinforcement Learning},
+      year = {2005},
+     pages = {1129--1136},
+  crossref = {NIPS17-shorter},
+  abstract = {Successful application of reinforcement learning algorithms often involves considerable hand-crafting of the necessary non-linear features to reduce the complexity of the value functions and hence to promote convergence of the algorithm. In contrast, the human brain readily and autonomously finds the complex features when provided with sufficient training. Recent work in machine learning and neurophysiology has demonstrated the role of the basal ganglia and the frontal cortex in mammalian reinforcement learning. This paper develops and explores new reinforcement learning algorithms inspired by neurological evidence that provides potential new approaches to the feature construction problem. The algorithms are compared and evaluated on the Acrobot task.},
+topics={BioRules},cat={C},
+}
+
+@INCOLLECTION{NIPS2005_691,
+    author = {Bengio, Yoshua and Monperrus, Martin},
+     title = {Non-Local Manifold Tangent Learning},
+      year = {2005},
+     pages = {129--136},
+  crossref = {NIPS17-shorter},
+  abstract = {We claim and present arguments to the effect that a large class of manifold learning algorithms that are essentially local and can be framed as kernel learning algorithms will suffer from the curse of dimensionality, at the dimension of the true underlying manifold. This observation suggests to explore non-local manifold learning algorithms which attempt to discover shared structure in the tangent planes at different positions. A criterion for such an algorithm is proposed and experiments estimating a tangent plane prediction function are presented, showing its advantages with respect to local manifold learning algorithms: it is able to generalize very far from training data (on learning handwritten character image rotations), where a local non-parametric method fails.},
+topics={HighDimensional,Unsupervised},cat={C},
+}
+
+@INPROCEEDINGS{NIPS2005_874,
+    author = {K{\'{e}}gl, Bal{\'{a}}zs},
+     title = {Generalization Error and Algorithmic Convergence of Median Boosting},
+      year = {2005},
+     pages = {657--664},
+  crossref = {NIPS17-shorter},
+  abstract = {We have recently proposed an extension of ADABOOST to regression that uses the median of the base regressors as the final regressor. In this paper we extend theoretical results obtained for ADABOOST to median boosting and to its localized variant. First, we extend recent results on efficient margin maximizing to show that the algorithm can converge to the maximum achievable margin within a preset precision in a finite number of steps. Then we provide confidence-interval-type bounds on the generalization error.},
+topics={Boosting},cat={C},
+}
+
+@INPROCEEDINGS{NIPS2007-56,
+    author = {Le Roux, Nicolas and Manzagol, Pierre-Antoine and Bengio, Yoshua},
+     title = {Topmoumoute online natural gradient algorithm},
+      year = {2008},
+  crossref = {NIPS20-shorter},
+  abstract = {Guided by the goal of obtaining an optimization algorithm that is both fast and yielding good generalization, we study the descent direction maximizing the decrease in generalization error or the probability of not increasing generalization error. The surprising result is that from both the Bayesian and frequentist perspectives this can yield the natural gradient direction. Although that direction can be very expensive to compute we develop an efficient, general, online approximation to the natural gradient descent which is suited to large scale problems. We report experimental results showing much faster convergence in computation time and in number of iterations with TONGA (Topmoumoute Online natural Gradient Algorithm) than with stochastic gradient descent, even on very large datasets.}
+}
+
+@INPROCEEDINGS{NIPS2007-812,
+    author = {Chapados, Nicolas and Bengio, Yoshua},
+     title = {Augmented Functional Time Series Representation and Forecasting with Gaussian Processes},
+      year = {2008},
+     pages = {265--272},
+  crossref = {NIPS20-shorter},
+  abstract = {We introduce a functional representation of time series which allows forecasts to be performed over an unspecified horizon with progressively-revealed information sets. By virtue of using Gaussian processes, a complete covariance matrix between forecasts at several time-steps is available. This information is put to use in an application to actively trade price spreads between commodity futures contracts. The approach delivers impressive out-of-sample risk-adjusted returns after transaction costs on a portfolio of 30 spreads.}
+}
+
+@INPROCEEDINGS{NIPS2007-925,
+    author = {Le Roux, Nicolas and Bengio, Yoshua and Lamblin, Pascal and Joliveau, Marc and K{\'{e}}gl, Bal{\'{a}}zs},
+     title = {Learning the 2-D Topology of Images},
+      year = {2008},
+     pages = {841--848},
+  crossref = {NIPS20-shorter},
+  abstract = {We study the following question: is the two-dimensional structure of images a very strong prior or is it something that can be learned with a few examples of natural images? If someone gave us a learning task involving images for which the two-dimensional topology of pixels was not known, could we discover it automatically and exploit it? For example suppose that the pixels had been permuted in a fixed but unknown way, could we recover the relative two-dimensional location of pixels on images? The surprising result presented here is that not only the answer is yes but that about as few as a thousand images are enough to approximately recover the relative locations of about a thousand pixels. This is achieved using a manifold learning algorithm applied to pixels associated with a measure of distributional similarity between pixel intensities. We compare different topology-extraction approaches and show how having the two-dimensional topology can be exploited.}
+}
+
+@INPROCEEDINGS{NIPS21,
+     editor = {Koller, D. and Schuurmans, Dale and Bengio, Yoshua and Bottou, {L{\'{e}}on}},
+      title = {Advances in Neural Information Processing Systems 21 (NIPS'08)},
+  booktitle = {Advances in Neural Information Processing Systems 21 (NIPS'08)},
+       year = {-1},
+  publisher = {NIPS Foundation (http://books.nips.cc)}
+}
+
+@INPROCEEDINGS{NIPS21-short,
+     editor = {Koller, D. and Schuurmans, Dale and Bengio, Yoshua and Bottou, {L{\'{e}}on}},
+      title = {Advances in Neural Information Processing Systems 21 (NIPS'08)},
+  booktitle = {NIPS 21},
+       year = {-1},
+  publisher = {NIPS Foundation (http://books.nips.cc)}
+}
+
+
+@INPROCEEDINGS{NIPS22-short,
+     editor = {Bengio, Yoshua and Schuurmans, Dale and Williams, Christopher and Lafferty, John and Culotta, Aron},
+      title = {Advances in Neural Information Processing Systems 22 (NIPS'09)},
+  booktitle = {NIPS 22},
+       year = {-1}
+}
+
+
+@INPROCEEDINGS{NIPS3,
+     editor = {Lippmann, R. P. and Moody, J. E. and Touretzky, D. S.},
+      title = {Advances in Neural Information Processing Systems 3 (NIPS'90)},
+  booktitle = {Advances in Neural Information Processing Systems 3 (NIPS'90)},
+       year = {-1},
+  publisher = {Morgan Kaufmann}
+}
+
+@INPROCEEDINGS{NIPS3-short,
+     editor = {Lippmann, R. P. and Moody, J. E. and Touretzky, D. S.},
+      title = {Advances in Neural Information Processing Systems 3 (NIPS'90)},
+  booktitle = {NIPS 3},
+       year = {-1},
+  publisher = {Morgan Kaufmann}
+}
+
+
+@INPROCEEDINGS{NIPS4-short,
+     editor = {Moody, J. E. and Hanson, S. J. and Lippmann, R. P.},
+      title = {Advances in Neural Information Processing Systems 4 (NIPS'91)},
+  booktitle = {NIPS 4},
+       year = {-1},
+  publisher = {Morgan Kaufmann}
+}
+
+
+@INPROCEEDINGS{NIPS5,
+     editor = {Giles, C.L. and Hanson, S. J. and Cowan, J. D.},
+      title = {Advances in Neural Information Processing Systems 5 (NIPS'92)},
+  booktitle = {Advances in Neural Information Processing Systems 5 (NIPS'92)},
+       year = {-1},
+  publisher = {Morgan Kaufmann}
+}
+
+@INPROCEEDINGS{NIPS5-short,
+     editor = {Giles, C.L. and Hanson, S. J. and Cowan, J. D.},
+      title = {Advances in Neural Information Processing Systems 5 (NIPS'92)},
+  booktitle = {NIPS 5},
+       year = {-1},
+  publisher = {Morgan Kaufmann}
+}
+
+
+@INPROCEEDINGS{NIPS6-short,
+     editor = {Cowan, J. D. and Tesauro, G. and Alspector, J.},
+      title = {Advances in Neural Information Processing Systems 6 (NIPS'93)},
+  booktitle = {NIPS 6},
+       year = {-1},
+  publisher = {MIT Press}
+}
+
+
+@INPROCEEDINGS{NIPS7-short,
+     editor = {Tesauro, G. and Touretzky, D. S. and Leen, T. K.},
+      title = {Advances in Neural Information Processing Systems 7 (NIPS'94)},
+  booktitle = {NIPS 7},
+       year = {-1},
+  publisher = {MIT Press}
+}
+
+
+@INPROCEEDINGS{NIPS8-short,
+     editor = {Touretzky, D. S. and Mozer, M. and Hasselmo, M.E.},
+      title = {Advances in Neural Information Processing Systems 8 (NIPS'95)},
+  booktitle = {NIPS 8},
+       year = {-1},
+  publisher = {MIT Press}
+}
+
+
+@INPROCEEDINGS{NIPS9-short,
+     editor = {Mozer, M. and Jordan, M.I. and Petsche, T.},
+      title = {Advances in Neural Information Processing Systems 9 (NIPS'96)},
+  booktitle = {NIPS 9},
+       year = {-1},
+  publisher = {MIT Press}
+}
+
+
+@INPROCEEDINGS{nnlm:2001:nips,
+    author = {Bengio, Yoshua and Ducharme, R{\'{e}}jean and Vincent, Pascal},
+     title = {A Neural Probabilistic Language Model},
+      year = {2001},
+  crossref = {NIPS13-shorter},
+  abstract = {A goal of statistical language modeling is to learn the joint probability function of sequences of words. This is intrinsically difficult because of the curse of dimensionality: we propose to fight it with its own weapons. In the proposed approach one learns simultaneously (1) a distributed representation for each word (i.e. a similarity between words) along with (2) the probability function for word sequences, expressed with these representations. Generalization is obtained because a sequence of words that
+has never been seen before gets high probability if it is made of words that are similar to words forming an already seen sentence. We report on experiments using neural networks for the probability function, showing on two text corpora that the proposed approach very significantly improves on a state-of-the-art trigram model.},
+topics={Markov,Unsupervised,Language},cat={C},
+}
+
+@INPROCEEDINGS{nsvn:2000:ijcnn,
+     author = {Vincent, Pascal and Bengio, Yoshua},
+      title = {A Neural Support Vector Network Architecture with Adaptive Kernels},
+  booktitle = {International Joint Conference on Neural Networks 2000},
+     volume = {V},
+       year = {2000},
+      pages = {187--192},
+        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/nsvn.pdf},
+   abstract = {In the Support Vector Machines ({SVM}) framework, the positive-definite kernel can be seen as representing a fixed similarity measure between two patterns, and a discriminant function is obtained by taking a linear combination of the kernels computed at training examples called support vectors. Here we investigate learning architectures in which the kernel functions can be replaced by more general similarity measures that can have arbitrary internal parameters. The training criterion used in {SVM}s is not appropriate for this purpose so we adopt the simple criterion that is generally used when training neural networks for classification tasks. Several experiments are performed which show that such Neural Support Vector Networks perform similarly to {SVM}s while requiring significantly fewer support vectors, even when the similarity measure has no internal parameters.},
+topics={Kernel},cat={C},
+}
+
+@INPROCEEDINGS{Ouimet+al-2005,
+     author = {Ouimet, Marie and Bengio, Yoshua},
+     editor = {Cowell, Robert G. and Ghahramani, Zoubin},
+      title = {Greedy Spectral Embedding},
+  booktitle = {Proceedings of the Tenth International Workshop on Artificial Intelligence and Statistics},
+       year = {2005},
+      pages = {253--260},
+  publisher = {Society for Artificial Intelligence and Statistics},
+        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/greedy-kernel-aistats05.pdf},
+   abstract = {Spectral dimensionality reduction methods and spectral clustering methods require computation of the principal eigenvectors of an n X n matrix where n is the number of examples. Following up on previously proposed techniques to speed-up kernel methods by focusing on a subset of m examples, we study a greedy selection procedure for this subset, based on the feature space distance between a candidate example and the span of the previously chosen ones. In the case of kernel {PCA} or spectral clustering this reduces computation to $O(m^2 n)$. For the same computational complexity, we can also compute the feature space projection of the non-selected examples on the subspace spanned by the selected examples, to estimate the embedding function based on all the data, which yields considerably better estimation of the embedding function. This algorithm can be formulated in an online setting and we can bound the error on the approximation of the Gram matrix.},
+topics={HighDimensional,Kernel},cat={C},
+}
+
+@MASTERSTHESIS{Ouimet-Msc-2004,
+    author = {Ouimet, Marie},
+  keywords = {algorithmes voraces, apprentissage non-supervis{\'{e}}, m{\'{e}}thodes spectrales, noyaux, r{\'{e}}duction de dimensionnalit{\'{e}}},
+     title = {R{\'{e}}duction de dimensionnalit{\'{e}} non lin{\'{e}}aire et vorace},
+      year = {2004},
+    school = {Universit{\'{e}} de Montr{\'{e}}al},
+  abstract = {Les m{\'{e}}thodes spectrales de r{\'{e}}duction de dimensionnalit{\'{e}} et les m{\'{e}}thodes de segmentation spectrale exigent le calcul des vecteurs propres principaux d'une matrice de taille n x n o{\`{u}} n est le nombre d'exemples. Des techniques ont {\'{e}}t{\'{e}} propos{\'{e}}es dans la litt{\'{e}}rature pour acc{\'{e}}l{\'{e}}rer les m{\'{e}}thodes {\`{a}} noyau en se concentrant sur un sous-ensemble de m exemples. Nous proposons une proc{\'{e}}dure vorace pour la s{\'{e}}lection de ce sous-ensemble, qui est bas{\'{e}}e sur la distance dans l'espace des caract{\'{e}}ristiques entre un exemple candidat et le sous-espace g{\'{e}}n{\'{e}}r{\'{e}} par les exemples pr{\'{e}}c{\'{e}}demment choisis. Dans le cas de l'ACP {\`{a}} noyau ou de la segmentation spectrale, nous obtenons un algorithme en O(m*m*n), o{\`{u}} m << n, qui, contrairement aux techniques pr{\'{e}}c{\'{e}}demment propos{\'{e}}es, peut se formuler de fa{\c c}on en-ligne. Pour la m{\^{e}}me complexit{\'{e}} en temps, nous pouvons {\'{e}}galement calculer la projection des exemples non choisis sur le sous-espace engendr{\'{e}} par les exemples choisis dans l'espace des caract{\'{e}}ristiques. En repr{\'{e}}sentant ainsi les exemples par leur projection nous obtenons une approximation de plus faible rang de la matrice de Gram sur toutes les donn{\'{e}}es. Nous pouvons {\'{e}}galement borner l'erreur correspondant {\`{a}} cette approximation de la matrice de Gram.}
+}
+
+@ARTICLE{paiement+bengio+eck:aij,
+    author = {Paiement, Jean-Fran{\c c}ois and Bengio, Samy and Eck, Douglas},
+     title = {Probabilistic Models for Melodic Prediction},
+   journal = {Artificial Intelligence Journal},
+    volume = {173},
+      year = {2009},
+     pages = {1266--1274},
+source={OwnPublication},
+sourcetype={Journal},
+}
+
+@INPROCEEDINGS{paiement+eck+bengio+barber:icml2005,
+  author     = {Paiement, Jean-Fran{\c c}ois and Eck, Douglas and Bengio, Samy and Barber, D.},
+  title      = {A graphical model for chord progressions embedded in a psychoacoustic space},
+  year       = {2005},
+  pages      = {641--648},
+  publisher  = {ACM Press},
+  crossref   = {ICML05},
+  source     = {OwnPublication},
+  sourcetype = {Conference},
+}
+
+@INPROCEEDINGS{paiement+eck+bengio:ccai2006,
+     author = {Paiement, Jean-Fran{\c c}ois and Eck, Douglas and Bengio, Samy},
+     editor = {Lamontagne, Luc and Marchand, Mario},
+      title = {Probabilistic Melodic Harmonization},
+  booktitle = {Canadian Conference on AI},
+     series = {Lecture Notes in Computer Science},
+     volume = {4013},
+       year = {2006},
+      pages = {218--229},
+  publisher = {Springer},
+source={OwnPublication},
+sourcetype={Conference},
+}
+
+@INPROCEEDINGS{paiement+eck+bengio:ismir2005,
+     author = {Paiement, Jean-Fran{\c c}ois and Eck, Douglas and Bengio, Samy},
+      title = {A Probabilistic Model for Chord Progressions},
+  booktitle = {Proceedings of the 6th International Conference on Music Information Retrieval ({ISMIR} 2005)},
+       year = {2005},
+      pages = {312--319},
+source={OwnPublication},
+sourcetype={Conference},
+}
+
+% NOTE(review): this entry gives year 2008 but its crossref key names ICML06,
+% its pages field is empty, and its title is identical to the NIPS 2007
+% workshop entry that follows. Presumably it should inherit from the ICML 2008
+% proceedings -- confirm the intended title, pages and crossref key.
+@INPROCEEDINGS{paiement+grandvalet+bengio+eck:icml2008,
+    author = {Paiement, Jean-Fran{\c c}ois and Grandvalet, Yves and Bengio, Samy and Eck, Douglas},
+     title = {A generative model for rhythms},
+      year = {2008},
+     pages = {},
+  crossref = {ICML06-shorter},
+source={OwnPublication},
+sourcetype={Conference},
+}
+
+@UNPUBLISHED{paiement+grandvalet+bengio+eck:nipsworkshop2007,
+  author     = {Paiement, Jean-Fran{\c c}ois and Grandvalet, Yves and Bengio, Samy and Eck, Douglas},
+  title      = {A generative model for rhythms},
+  year       = {2007},
+  note       = {NIPS 2007 Workshop on Music, Brain and Cognition},
+  source     = {OwnPublication},
+  sourcetype = {Workshop},
+  optkey     = {""},
+  optmonth   = {""},
+  optannote  = {""},
+}
+
+@MASTERSTHESIS{Paiement-Msc-2003,
+    author = {Paiement, Jean-Fran{\c c}ois},
+  keywords = {algorithmes, apprentissage, apprentissage non supervis{\'{e}}, forage de donn{\'{e}}es, noyaux, r{\'{e}}duction de dimensions, statistique, Statistiques},
+     title = {G{\'{e}}n{\'{e}}ralisation d'algorithmes de r{\'{e}}duction de dimension},
+      year = {2003},
+    school = {Universit{\'{e}} de Montr{\'{e}}al},
+  abstract = {On pr{\'{e}}sente tout d'abord la notion de vari{\'{e}}t{\'{e}} comme r{\'{e}}gion de faible dimension contenant des observations situ{\'{e}}es dans un espace de haute dimension. Cette d{\'{e}}finition justifie l'{\'{e}}laboration d'algorithmes permettant d'exprimer les donn{\'{e}}es dans un syst{\`{e}}me de coordonn{\'{e}}es de dimension {\'{e}}gale {\`{a}} celle de la vari{\'{e}}t{\'{e}} sur laquelle les donn{\'{e}}es sont approximativement situ{\'{e}}es.
+La notion de noyau comme mesure de similarit{\'{e}} est par la suite formalis{\'{e}}e. On constate que l'application d'un noyau {\`{a}} deux observations correspond {\`{a}} l'{\'{e}}valuation d'un produit scalaire dans un espace de Hilbert appel{\'{e}} espace de caract{\'{e}}ristiques.
+Une m{\'{e}}thode de r{\'{e}}duction de dimension lin{\'{e}}aire est expos{\'{e}}e ainsi que ses limites. Des algorithmes non lin{\'{e}}aires de r{\'{e}}duction de dimension et de segmentation permettent de s'affranchir de ces limites. Ces derniers ne fournissent cependant pas d'extension directe {\`{a}} des points hors {\'{e}}chantillon.
+L'{\'{e}}tape fondamentale au sein des algorithmes pr{\'{e}}sent{\'{e}}s est la solution d'un syst{\`{e}}me de vecteurs propres d'une matrice sym{\'{e}}trique cr{\'{e}}{\'{e}}e {\`{a}} partir d'un noyau d{\'{e}}pendant des donn{\'{e}}es. On con{\c c}oit ce probl{\`{e}}me comme le fait de trouver les fonctions propres d'un op{\'{e}}rateur lin{\'{e}}aire d{\'{e}}fini {\`{a}} partir du m{\^{e}}me noyau. On utilise alors la formulation de Nystr{\"{o}}m, pr{\'{e}}sente dans l'espace en composantes principales {\`{a}} noyaux, afin de r{\'{e}}duire la dimension des points hors {\'{e}}chantillon sur la base des plongements obtenus {\`{a}} l'aide des algorithmes d{\'{e}}j{\`{a}} mentionn{\'{e}}s.
+La qualit{\'{e}} de la projection g{\'{e}}n{\'{e}}r{\'{e}}e est compar{\'{e}}e {\`{a}} la perturbation intrins{\`{e}}que des algorithmes si on substitue certaines observations par d'autres tir{\'{e}}es de la m{\^{e}}me distribution.}
+}
+
+% Same article as the perez+gers+schmidhuber+eck:2003 entry (identical title,
+% journal, volume, number and year); the key is kept so existing citations
+% still resolve, and the page range is aligned with that entry.
+@ARTICLE{perez+gers+schmidhuber+eck:2002,
+    author = {Perez-Ortiz, J. A. and Gers, F. A. and Eck, Douglas and Schmidhuber, Juergen},
+     title = {{K}alman filters improve {LSTM} network performance in problems unsolvable by traditional recurrent nets},
+   journal = {Neural Networks},
+    volume = {16},
+    number = {2},
+      year = {2003},
+     pages = {241--250},
+  abstract = {The Long Short-Term Memory ({LSTM}) network trained by gradient descent solves difficult problems which traditional recurrent neural networks in general cannot. We have recently observed that the decoupled extended Kalman filter training algorithm allows for even better performance, reducing significantly the number of training steps when compared to the original gradient descent training algorithm. In this paper we present a set of experiments which are unsolvable by classical recurrent networks but which are solved elegantly and robustly and quickly by {LSTM} combined with Kalman filters.},
+source={OwnPublication},
+sourcetype={Journal},
+}
+
+@ARTICLE{perez+gers+schmidhuber+eck:2003,
+  author     = {Perez-Ortiz, J. A. and Gers, F. A. and Eck, Douglas and Schmidhuber, Juergen},
+  title      = {{K}alman filters improve {LSTM} network performance in problems unsolvable by traditional recurrent nets},
+  journal    = {Neural Networks},
+  volume     = {16},
+  number     = {2},
+  year       = {2003},
+  pages      = {241--250},
+  abstract   = {The Long Short-Term Memory ({LSTM}) network trained by gradient descent solves difficult problems which traditional recurrent neural networks in general cannot. We have recently observed that the decoupled extended Kalman filter training algorithm allows for even better performance, reducing significantly the number of training steps when compared to the original gradient descent training algorithm. In this paper we present a set of experiments which are unsolvable by classical recurrent networks but which are solved elegantly and robustly and quickly by {LSTM} combined with Kalman filters.},
+  source     = {OwnPublication},
+  sourcetype = {Journal},
+}
+
+@INPROCEEDINGS{perez+schmidhuber+gers+eck:icannB2002,
+     author = {Perez-Ortiz, J. A. and Schmidhuber, Juergen and Gers, F. A. and Eck, Douglas},
+     editor = {Dorronsoro, J.},
+      title = {Improving Long-Term Online Prediction with {Decoupled Extended Kalman Filters}},
+  booktitle = {Artificial Neural Networks -- {ICANN} 2002 (Proceedings)},
+       year = {2002},
+      pages = {1055--1060},
+  publisher = {Springer},
+   abstract = {Long Short-Term Memory ({LSTM}) recurrent neural networks ({RNN}s) outperform traditional {RNN}s when dealing with sequences involving not only short-term but also long-term dependencies. The decoupled extended Kalman filter learning algorithm ({DEKF}) works well in online environments and reduces significantly the number of training steps when compared to the standard gradient-descent algorithms. Previous work on {LSTM}, however, has always used a form of gradient descent and has not focused on true online situations. Here we combine {LSTM} with {DEKF} and show that this new hybrid improves upon the original learning algorithm when applied to online processing.},
+source={OwnPublication},
+sourcetype={Conference},
+}
+
+@TECHREPORT{Pigeon-Bengio-96-aH-TR,
+       author = {Pigeon, Steven and Bengio, Yoshua},
+        title = {A Memory-Efficient Huffman Adaptive Coding Algorithm for Very Large Sets of Symbols},
+       number = {1081},
+         year = {1997},
+  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/HuffAdapt.pdf},
+     abstract = {The problem of computing the minimum redundancy codes as we observe symbols one by one has received a lot of attention. However, existing algorithms implicitly assume that either we have a small alphabet --- quite typically 256 symbols --- or that we have an arbitrary amount of memory at our disposal for the creation of the tree. In real life applications one may need to encode symbols coming from a much larger alphabet, for e.g. coding integers. We now have to deal not with hundreds of symbols but possibly with millions of symbols. While other algorithms use a space proportional to the number of observed symbols, we here propose one that uses space proportional to the number of frequency classes, which is, quite interestingly, always smaller or equal to the number of observed symbols.},
+topics={Compression},cat={T},
+}
+
+@INPROCEEDINGS{Pigeon-dcc98,
+     author = {Pigeon, Steven and Bengio, Yoshua},
+     editor = {{IEEE Computer Society}},
+      title = {A Memory-Efficient Adaptive Huffman Coding Algorithm for Very Large Sets of Symbols},
+  booktitle = {Data Compression Conference},
+       year = {1998},
+        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/dcc98.pdf},
+   abstract = {The problem of computing the minimum redundancy codes as we observe symbols one by one has received a lot of attention. However, existing algorithms implicitly assume that either we have a small alphabet --- quite typically 256 symbols --- or that we have an arbitrary amount of memory at our disposal for the creation of the tree. In real life applications one may need to
+encode symbols coming from a much larger alphabet, for e.g. coding integers. We now have to deal not with hundreds of symbols but possibly with millions of symbols. While other algorithms use a space proportional to the number of observed symbols, we here propose one that uses space proportional to the number of frequency classes, which is, quite interestingly, always smaller or equal to the size of the alphabet.},
+topics={Compression},cat={C},
+}
+
+@INPROCEEDINGS{Pigeon-dcc99,
+     author = {Pigeon, Steven and Bengio, Yoshua},
+     editor = {{IEEE Computer Society}},
+      title = {Binary Pseudowavelets and Applications to Bilevel Image Processing},
+  booktitle = {Data Compression Conference},
+       year = {1999},
+        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/dcc99.pdf},
+   abstract = {This paper shows the existence of binary pseudowavelets, bases on the binary domain that exhibit some of the properties of wavelets, such as multiresolution reconstruction and compact support. The binary pseudowavelets are defined on _n (binary vectors of length n) and are operated upon with the binary operators logical and and exclusive or. The forward transform, or analysis, is the decomposition of a binary vector into its constituent binary pseudowavelets. Binary pseudowavelets allow multiresolution, progressive reconstruction of binary vectors by using progressively more coefficients in the inverse transform. Binary pseudowavelets bases, being sparse matrices, also provide for fast transforms; moreover pseudowavelets rely on hardware-friendly operations for efficient software and hardware implementation.},
+topics={Compression},cat={C},
+}
+
+@TECHREPORT{Pigeon-Huffman-TR98,
+       author = {Pigeon, Steven and Bengio, Yoshua},
+        title = {A Memory-Efficient Adaptive Huffman Coding for Very Large Sets of Symbols revisited},
+       number = {1095},
+         year = {1998},
+  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TechRep_AdaptativeHuffman2.pdf},
+     abstract = {While algorithm M (presented in A Memory-Efficient Huffman Adaptive Coding Algorithm for Very Large Sets of Symbols, by Steven Pigeon \& Yoshua Bengio, Universit{\'{e}} de Montr{\'{e}}al technical report \#1081 [1]) converges to the entropy of the signal, it also assumes that the characteristics of the signal are stationary, that is, that they do not change over time and that successive adjustments, ever decreasing in their magnitude, will lead to a reasonable approximation of the entropy. While this is true for some data, it is clearly not true for some other. We present here a modification of the M algorithm that allows negative updates. Negative updates are used to maintain a window over the source. Symbols enter the window at its right and will leave it at its left, after w steps (the window width). The algorithm presented here allows us to update correctly the weights of the symbols in the symbol tree. Here, we will also have negative migration or demotion, while we only had positive migration or promotion in M. This algorithm will be called M+.},
+topics={Compression},cat={T},
+}
+
+@PHDTHESIS{Pigeon-Phd-2001,
+    author = {Pigeon, Steven},
+  keywords = {algorithmes, codes adaptatifs, codes de Golomb, codes universels, Compression de donn{\'{e}}es, compression LZ78, LZW, ondelettes, pseudo-ondelettes},
+     title = {Contributions {\`{a}} la compression de donn{\'{e}}es},
+      year = {2001},
+    school = {Universit{\'{e}} de Montr{\'{e}}al},
+  abstract = {L'objectif de cette th{\`{e}}se est de pr{\'{e}}senter nos contributions {\`{a}} la compression de donn{\'{e}}es. Le texte entier n'est pas consacr{\'{e}} {\`{a}} nos seules contributions. Une large part est consacr{\'{e}}e au mat{\'{e}}riel introductif et {\`{a}} la recension de litt{\'{e}}rature sur les sujets qui sont pertinents {\`{a}} nos contributions. Le premier chapitre de contribution, le chapitre "Contribution au codage des entiers" se concentre sur le probl{\`{e}}me de la g{\'{e}}n{\'{e}}ration de codes efficaces pour les entiers. Le chapitre "Codage Huffman Adaptatif" pr{\'{e}}sente deux nouveaux algorithmes pour la g{\'{e}}n{\'{e}}ration dynamique de codes structur{\'{e}}s en arbre, c'est-{\`{a}}-dire des codes de type Huffman. Le chapitre "LZW avec une perte" explore le probl{\`{e}}me de la compression d'images comportant un petit nombre de couleurs distinctes et propose une extension avec perte d'un algorithme originalement sans perte, LZW. Enfin, le dernier chapitre de contribution, le chapitre "Les pseudo-ondelettes binaires" pr{\'{e}}sente une solution originale au probl{\`{e}}me de l'analyse multir{\'{e}}solution des images monochromes, c'est-{\`{a}}-dire des images n'ayant que deux couleurs, conventionnellement noir et blanc. Ce type d'image correspond par exemple aux images textuelles telles que produites par un processus de transmission de type facsimil{\'{e}}.}
+}
+
+@ARTICLE{Pigeon98,
+  author  = {Pigeon, Steven and Bengio, Yoshua},
+  title   = {Memory-Efficient Adaptive Huffman Coding},
+  journal = {Dr. Dobb's Journal},
+  volume  = {290},
+  year    = {1998},
+  pages   = {131--135},
+  topics  = {Compression},
+  cat     = {J},
+}
+
+@INPROCEEDINGS{probnn:2000:ijcnn,
+  author    = {Bengio, Yoshua},
+  title     = {Probabilistic Neural Network Models for Sequential Data},
+  booktitle = {International Joint Conference on Neural Networks 2000},
+  volume    = {V},
+  year      = {2000},
+  pages     = {79--84},
+  url       = {http://www.iro.umontreal.ca/~lisa/pointeurs/81_01.PDF},
+  abstract  = {It has already been shown how Artificial Neural Networks ({ANN}s) can be incorporated into probabilistic models.
+In this paper we review some of the approaches which have been proposed to incorporate them into probabilistic
+models of sequential data, such as Hidden {Markov} Models ({HMM}s). We also discuss new developments and new
+ideas in this area, in particular how {ANN}s can be used to model high-dimensional discrete and continuous data to
+deal with the curse of dimensionality, and how the ideas proposed in these models could be applied to statistical
+language modeling to represent longer-term context than allowed by trigram models, while keeping word-order
+information.},
+  topics    = {Markov},
+  cat       = {C},
+}
+
+@UNPUBLISHED{pugin+burgoyne+eck+fujinaga:nipsworkshop2007,
+    author = {Pugin, L. and Burgoyne, J. A. and Eck, Douglas and Fujinaga, I.},
+     title = {Book-adaptive and book-dependent models to accelerate digitalization of early music},
+      year = {2007},
+      note = {NIPS 2007 Workshop on Music, Brain and Cognition},
+source={OwnPublication},
+sourcetype={Workshop},
+optkey={""},
+optmonth={""},
+optannote={""},
+}
+
+@INPROCEEDINGS{Rahim-97,
+  author    = {Rahim, Mazin and Bengio, Yoshua and {LeCun}, Yann},
+  title     = {Discriminative feature and model design for automatic speech recognition},
+  booktitle = {Proceedings of Eurospeech 1997},
+  year      = {1997},
+  pages     = {75--78},
+  url       = {http://www.iro.umontreal.ca/~lisa/pointeurs/rahim-bengio-lecun-97.ps.gz},
+  abstract  = {A system for discriminative feature and model design is presented for automatic speech recognition. Training based on minimum classification error with a single objective function is applied for designing a set of parallel networks performing feature transformation and a set of hidden {Markov} models performing speech recognition. This paper compares the use of linear and non-linear functional transformations when applied to conventional recognition features, such as spectrum or cepstrum. It also provides a framework for integrated feature and model training when using class-specific transformations. Experimental results on telephone-based connected digit recognition are presented.},
+  topics    = {Speech},
+  cat       = {C},
+}
+
+@ARTICLE{Rivest-2009,
+  author   = {Rivest, Fran{\c c}ois and Kalaska, John and Bengio, Yoshua},
+  title    = {Alternative Time Representations in Dopamine Models},
+  journal  = {Journal of Computational Neuroscience},
+  volume   = {28},
+  number   = {1},
+  year     = {2009},
+  pages    = {107--130},
+  abstract = {Dopaminergic neuron activity has been modeled during learning and appetitive behavior, most commonly using the temporal-difference (TD) algorithm. However, a proper representation of elapsed time and of the exact task is usually required for the model to work. Most models use timing elements such as delay-line representations of time that are not biologically realistic for intervals in the range of seconds. The interval-timing literature provides several alternatives. One of them is that timing could emerge from general network dynamics, instead of coming from a dedicated circuit. Here, we present a general rate-based learning model based on long short-term memory ({LSTM}) networks that learns a time representation when needed. Using a na{\"{\i}}ve network learning its environment in conjunction with TD, we reproduce dopamine activity in appetitive trace conditioning with a constant CS-US interval, including probe trials with unexpected delays. The proposed model learns a representation of the environment dynamics in an adaptive biologically plausible framework, without recourse to delay lines or other special-purpose circuits. Instead, the model predicts that the task-dependent representation of time is learned by experience, is encoded in ramp-like changes in single-neuron activity distributed across small neural networks, and reflects a temporal integration mechanism resulting from the inherent dynamics of recurrent loops within the network. The model also reproduces the known finding that trace conditioning is more difficult than delay conditioning and that the learned representation of the task can be highly dependent on the types of trials experienced during training. Finally, it suggests that the phasic dopaminergic signal could facilitate learning in the cortex.}
+}
+
+@ARTICLE{schmidhuber+gers+eck:2002,
+  author     = {Schmidhuber, Juergen and Gers, F. A. and Eck, Douglas},
+  title      = {Learning Nonregular Languages: A Comparison of Simple Recurrent Networks and {LSTM}},
+  journal    = {Neural Computation},
+  volume     = {14},
+  number     = {9},
+  year       = {2002},
+  pages      = {2039--2041},
+  abstract   = {In response to Rodriguez' recent article (Rodriguez 2001) we compare the performance of simple recurrent nets and {\em ``Long Short-Term Memory''} ({LSTM}) recurrent nets on context-free and context-sensitive languages.},
+  source     = {OwnPublication},
+  sourcetype = {Journal},
+}
+
+@TECHREPORT{Schwenk-Bengio-97-TR,
+       author = {Schwenk, Holger and Bengio, Yoshua},
+        title = {Adaptive Boosting of Neural Networks for Character Recognition},
+       number = {1072},
+         year = {1997},
+  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/AdaBoostTR.pdf},
+     abstract = {``Boosting'' is a general method for improving the performance of any learning algorithm that consistently generates classifiers which need to perform only slightly better than random guessing. A recently proposed and very promising boosting algorithm is AdaBoost [5]. It has been applied with great success to several benchmark machine learning problems using rather simple learning algorithms [4], in particular decision trees [1, 2, 6]. In this paper we use AdaBoost to improve the performances of neural networks applied to character recognition tasks. We compare training methods based on sampling the training set and weighting the cost function. Our system achieves about 1.4\% error on a data base of online handwritten digits from more than 200 writers. Adaptive boosting of a multi-layer network achieved 2\% error on the UCI Letters offline characters data set.},
+topics={Boosting,Speech},cat={T},
+}
+
+@INPROCEEDINGS{Schwenk-nips10,
+    author = {Schwenk, Holger and Bengio, Yoshua},
+     title = {Training Methods for Adaptive Boosting of Neural Networks for Character Recognition},
+      year = {1998},
+  crossref = {NIPS10-shorter},
+  abstract = {``Boosting'' is a general method for improving the performance of any learning algorithm that consistently generates classifiers which need to perform only slightly better than random guessing. A recently proposed and very promising boosting algorithm is AdaBoost [5]. It has been applied with great success to several benchmark machine learning problems using rather simple learning algorithms [4], in particular decision trees [1, 2, 6]. In this paper we use AdaBoost to improve the performances of neural networks applied to character recognition tasks. We compare training methods based on sampling the training set and weighting the cost function. Our system achieves about 1.4\% error on a data base of online handwritten digits from more than 200 writers. Adaptive boosting of a multi-layer network achieved 2\% error on the UCI Letters offline characters data set.},
+topics={Boosting,Speech},cat={C},
+}
+
+@ARTICLE{Schwenk2000,
+    author = {Schwenk, Holger and Bengio, Yoshua},
+     title = {Boosting Neural Networks},
+   journal = {Neural Computation},
+    volume = {12},
+    number = {8},
+      year = {2000},
+     pages = {1869--1887},
+  abstract = {``Boosting'' is a general method for improving the performance of learning algorithms. A recently proposed boosting algorithm is AdaBoost. It has been applied with great success to several benchmark machine learning problems using mainly decision trees as base classifiers. In this paper we investigate whether AdaBoost also works as well with neural networks, and we discuss the advantages and drawbacks of different versions of the AdaBoost algorithm. In particular, we compare training methods based on sampling the training set and weighting the cost function. The results suggest that random resampling of the training data is not the main explanation of the success of the improvements brought by AdaBoost. This is in contrast to Bagging which directly aims at reducing variance and for which random resampling is essential to obtain the reduction in generalization error. Our system achieves about 1.4\% error on a data set of online handwritten digits from more than 200 writers. A boosted multi-layer network achieved 1.5\% error on the UCI Letters and 8.1\% error on the UCI satellite data set, which is significantly better than boosted decision trees.},
+topics={Boosting},cat={J},
+}
+
+@INPROCEEDINGS{secondorder:2001:nips,
+  author   = {Dugas, Charles and Bengio, Yoshua and Belisle, Francois and Nadeau, Claude and Garcia, Rene},
+  title    = {Incorporating Second-Order Functional Knowledge for Better Option Pricing},
+  year     = {2001},
+  crossref = {NIPS13-shorter},
+  abstract = {Incorporating prior knowledge of a particular task into the architecture of a learning algorithm can greatly improve generalization performance. We study here a case where we know that the function to be learned is non-decreasing in two of its arguments and convex in one of them. For this purpose we propose a class of functions similar to multi-layer neural networks but (1) that has those properties, (2) is a universal approximator of continuous functions with these and other properties. We apply this new class of functions to the task of modeling the price of call options. Experiments show improvements on regressing the price of call options using the new types of function classes that incorporate the a priori constraints.},
+  topics   = {Finance},
+  cat      = {C},
+}
+
+@ARTICLE{Sonnenburg+al-2007,
+    author = {Sonnenburg, Soeren and et al. and Vincent, Pascal},
+     title = {The Need for Open Source Software in Machine Learning.},
+      year = {2007},
+      note = {institution: Fraunhofer Publica [http://publica.fraunhofer.de/oai.har] (Germany)},
+  crossref = {JMLR-shorter},
+  abstract = {all authors: Sonnenburg, S. and Braun, M.L. and Ong, C.S. and Bengio, S. and Bottou, L. and Holmes, G. and {LeCun}, Y. and M{\"{u}}ller, K.-R. and Pereira, F. and Rasmussen, C.E. and R{\"{a}}tsch, G. and Sch{\"{o}}lkopf, B. and Smola, A. and Vincent, P. and Weston, J. and Williamson, R.C.
+
+Open source tools have recently reached a level of maturity which makes them suitable for building large-scale real-world systems. At the same time, the field of machine learning has developed a large body of powerful learning algorithms for diverse applications. However, the true potential of these methods is not used, since existing implementations are not openly shared, resulting in software with low usability, and weak interoperability. We argue that this situation can be significantly improved by increasing incentives for researchers to publish their software under an open source model. Additionally, we outline the problems authors are faced with when trying to publish algorithmic implementations of machine learning methods. We believe that a resource of peer reviewed software accompanied by short articles would be highly valuable to both the machine learning and the general scientific community.}
+}
+
+@ARTICLE{Takeuchi-Bengio-Kanamori-2002,
+    author = {Takeuchi, Ichiro and Bengio, Yoshua and Kanamori, Takafumi},
+     title = {Robust Regression with Asymmetric Heavy-Tail Noise Distributions},
+   journal = {Neural Computation},
+    volume = {14},
+    number = {10},
+      year = {2002},
+     pages = {2469--2496},
+  abstract = {In the presence of a heavy-tail noise distribution, regression becomes much more difficult. Traditional robust regression methods assume that the noise distribution is symmetric and they down-weight the influence of so-called outliers. When the noise distribution is asymmetric these methods yield biased regression estimators. Motivated by data-mining problems for the insurance industry, we propose in this paper a new approach to robust regression that is tailored to deal with the case where the noise distribution is asymmetric. The main idea is to learn most of the parameters of the model using conditional quantile estimators (which are biased but robust estimators of the regression), and to learn a few remaining parameters to combine and correct these estimators, to unbiasedly minimize the average squared error. Theoretical analysis and experiments show the clear advantages of the approach. Results are on artificial data as well as real insurance data, using both linear and neural-network predictors.},
+topics={Mining},cat={J},
+}
+
+@ARTICLE{Thierry+al-2008,
+    author = {Bertin-Mahieux, Thierry and Eck, Douglas and Maillet, Fran{\c c}ois and Lamere, Paul},
+     title = {Autotagger: A Model For Predicting Social Tags from Acoustic Features on Large Music Databases},
+   journal = {Journal of New Music Research},
+      year = {2008},
+  abstract = {Social tags are user-generated keywords associated with some resource on the Web. In the case of music, social tags have become an important component of ``Web 2.0'' recommender systems, allowing users to generate playlists based on use-dependent terms such as chill or jogging that have been applied to particular songs. In this paper, we propose a method for predicting these social tags directly from MP3 files. Using a set of 360 classifiers trained using the online ensemble learning algorithm FilterBoost, we map audio features onto social tags collected from the Web. The resulting automatic tags (or autotags) furnish information about music that is otherwise untagged or poorly tagged, allowing for insertion of previously unheard music into a social recommender. This avoids the ``cold-start problem'' common in such systems. Autotags can also be used to smooth the tag space from which similarities and
+recommendations are made by providing a set of comparable baseline tags for all tracks in a recommender system. Because the words we learn are the same as those used by people who label their music collections, it is easy to integrate our predictions into existing similarity and prediction methods based on web data.}
+}
+
+@ARTICLE{Thivierge+al-2007,
+    author = {Thivierge, J.-P. and Rivest, Fran{\c c}ois and Monchi, O.},
+     title = {Spiking Neurons, Dopamine, and Plasticity: Timing Is Everything, But Concentration Also Matters},
+   journal = {Synapse},
+    volume = {61},
+      year = {2007},
+     pages = {375--390},
+  abstract = {While both dopamine (DA) fluctuations and spike-timing-dependent plasticity (STDP) are known to influence long-term corticostriatal plasticity, little attention has been devoted to the interaction between these two fundamental mechanisms. Here, a theoretical framework is proposed to account for experimental results specifying the role of presynaptic activation, postsynaptic activation, and concentrations of extracellular DA in synaptic plasticity. Our starting point was an explicitly-implemented multiplicative rule linking STDP to Michaelis-Menten equations that models the dynamics of extracellular DA fluctuations. This rule captures a wide range of results on conditions leading to long-term potentiation and depression in simulations that manipulate the frequency of induced corticostriatal stimulation and DA release. A well-documented biphasic function relating DA concentrations to synaptic plasticity emerges naturally from simulations involving a multiplicative rule linking DA and neural activity. This biphasic function is found consistently across different neural coding schemes employed (voltage-based vs. spike-based models). By comparison, an additive rule fails to capture these results. The proposed framework is the first to generate testable predictions on the dual influence of DA concentrations and STDP on long-term plasticity, suggesting a way in which the biphasic influence of DA concentrations can modulate the direction and magnitude of change induced by STDP, and raising the possibility that DA concentrations may inverse the LTP/LTD components of the STDP rule.}
+}
+
+@TECHREPORT{tonga-tr,
+       author = {Le Roux, Nicolas and Manzagol, Pierre-Antoine and Bengio, Yoshua},
+        title = {Topmoumoute online natural gradient algorithm},
+       number = {1299},
+         year = {2007},
+  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+     abstract = {Guided by the goal of obtaining an optimization algorithm that is
+both fast and yielding good generalization, we study the descent direction maximizing the decrease in generalization error or the probability of not increasing generalization error. The surprising result is that from both the Bayesian and frequentist perspectives this can yield the natural gradient direction. Although that direction can be very expensive to compute we develop an efficient, general, online approximation to the natural gradient descent which is suited to large scale problems. We report experimental results showing much faster convergence in computation time and in number of iterations with TONGA (Topmoumoute Online Natural Gradient Algorithm) than with stochastic gradient descent, even on very large datasets.}
+}
+
+@TECHREPORT{TR1197,
+       author = {Vincent, Pascal and Bengio, Yoshua},
+        title = {K-Local Hyperplane and Convex Distance Nearest Neighbor Algorithms},
+       number = {1197},
+         year = {2001},
+  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TR1197.pdf},
+     abstract = {Guided by an initial idea of building a complex (non linear) decision surface with maximal local margin in input space, we give a possible geometrical intuition as to why K-Nearest Neighbor ({KNN}) algorithms often perform more poorly than {SVM}s on classification tasks. We then propose modified K-Nearest Neighbor algorithms to overcome the perceived problem. The approach is similar in spirit to Tangent Distance, but with invariances inferred from the local neighborhood rather than prior knowledge. Experimental results on real world classification tasks suggest that the modified {KNN} algorithms often give a dramatic improvement over standard {KNN} and perform as well or better than {SVM}s.},
+       topics = {Kernel}, cat = {T},
+}
+
+@TECHREPORT{TR1198,
+       author = {Takeuchi, Ichiro and Bengio, Yoshua and Kanamori, Takafumi},
+        title = {Robust Regression with Asymmetric Heavy-Tail Noise},
+       number = {1198},
+         year = {2001},
+  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TR1198.pdf},
+     abstract = {In the presence of a heavy-tail noise distribution, regression becomes much more difficult. Traditional robust regression methods assume that the noise distribution is symmetric and they downweight the influence of so-called outliers. When the noise distribution is asymmetric these methods yield strongly biased regression estimators. Motivated by data-mining problems for the insurance industry, we propose in this paper a new approach to robust regression that is tailored to deal with the case where the noise distribution is asymmetric. The main idea is to learn most of the parameters of the model using conditional quantile estimators (which are biased but robust estimators of the regression), and to learn a few remaining parameters to combine and correct these estimators, to minimize the average squared error. Theoretical analysis and experiments show the clear advantages of the approach. Results are on artificial data as well as real insurance data, using both linear and neural-network predictors.},
+       topics = {Mining}, cat = {T},
+}
+
+@TECHREPORT{TR1199,
+       author = {Chapados, Nicolas and Bengio, Yoshua and Vincent, Pascal and Ghosn, Joumana and Dugas, Charles and Takeuchi, Ichiro and Meng, Linyan},
+        title = {Estimating Car Insurance Premia: a Case Study in High-Dimensional Data Inference},
+       number = {1199},
+         year = {2001},
+  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TR1199.pdf},
+     abstract = {Estimating insurance premia from data is a difficult regression problem for several reasons: the large number of variables, many of which are discrete, and the very peculiar shape of the noise distribution, asymmetric with fat tails, with a large majority zeros and a few unreliable and very large values. We introduce a methodology for estimating insurance premia that has been applied in the car insurance industry. It is based on mixtures of specialized neural networks, in order to reduce the effect of outliers on the estimation. Statistical comparisons with several different alternatives, including decision trees and generalized linear models show that the proposed method is significantly more precise, allowing to identify the least and most risky contracts, and reducing the median premium by charging more to the most risky customers.},
+       topics = {HighDimensional,Mining}, cat = {T},
+}
+
+@TECHREPORT{TR1200,
+       author = {Bengio, Yoshua and Chapados, Nicolas},
+        title = {Extending Metric-Based Model Selection and Regularization in the Absence of Unlabeled Data},
+       number = {1200},
+         year = {2001},
+  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TR1200.ps},
+     abstract = {Metric-based methods have recently been introduced for model selection and regularization, often yielding very significant improvements over all the alternatives tried (including cross-validation). However, these methods require a large set of unlabeled data, which is not always available in many applications. In this paper we extend these methods (TRI, ADJ and ADA) to the case where no unlabeled data is available. The extended methods (xTRI, xADJ, xADA) use a model of the input density directly estimated from the training set. The intuition is that the main reason why the above methods work well is that they make sure that the learned function behaves similarly on the training points and on ``neighboring'' points. The experiments are based on estimating a simple non-parametric density model. They show that the extended methods perform comparably to the originals even though no unlabeled data is used.},
+       topics = {ModelSelection,Finance}, cat = {T},
+}
+
+@TECHREPORT{TR1215,
+       author = {Bengio, Yoshua},
+        title = {New Distributed Probabilistic Language Models},
+       number = {1215},
+         year = {2002},
+  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TR1215.ps},
+     abstract = {Our previous work on statistical modeling introduced the use of probabilistic feedforward neural networks with shared parameters in order to help dealing with the curse of dimensionality. This work started with the motivation to speed up the above model and to take advantage of prior knowledge e.g., in WordNet or in syntactically labeled data sets, and to better deal with polysemy. With the objective of reaching these goals, we present here a series of new statistical language models, most of which are yet untested.},
+       topics = {Markov,Language,Unsupervised}, cat = {T},
+}
+
+@TECHREPORT{TR1216,
+       author = {Bengio, Yoshua and S{\'{e}}n{\'{e}}cal, Jean-S{\'{e}}bastien},
+        title = {Quick Training of Probabilistic Neural Nets by Importance Sampling},
+       number = {1216},
+         year = {2002},
+  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/tr1216.ps},
+     abstract = {Our previous work on statistical language modeling introduced the use of probabilistic feedforward neural networks to help dealing with the curse of dimensionality. Training this model by maximum likelihood however requires for each example to perform as many network passes as there are words in the vocabulary. Inspired by the contrastive divergence model, we proposed and evaluate sampling-based methods which require network passes only for the observed ``positive example'' and a few sampled negative example words. A very significant speed-up is obtained with an adaptive importance sampling.},
+       topics = {Markov,Language,Unsupervised}, cat = {T},
+}
+
+@TECHREPORT{TR1231,
+       author = {Bengio, Yoshua and Kermorvant, Christopher},
+        title = {Extracting Hidden Sense Probabilities from Bitexts},
+       number = {1231},
+         year = {2003},
+  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TR1231.pdf},
+     abstract = {We propose a probabilistic model that is inspired by Diab \& Resnik's algorithm to extract disambiguation information from aligned bilingual texts. Like Diab \& Resnik's, the proposed model uses WordNet and the fact that word ambiguities are not always the same in the two languages. The generative model introduces a dependency between two translated words through a common ancestor in WordNet's ontology. Unlike Diab \& Resnik's algorithm it does not suppose that the translation in the source language has a single meaning.},
+       topics = {Language}, cat = {T},
+}
+
+@TECHREPORT{TR1232,
+       author = {Bengio, Yoshua and Vincent, Pascal and Paiement, Jean-Fran{\c c}ois},
+        title = {Learning Eigenfunctions of Similarity: Linking Spectral Clustering and Kernel {PCA}},
+       number = {1232},
+         year = {2003},
+  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TR1232.pdf},
+     abstract = {In this paper, we show a direct equivalence between spectral clustering and kernel {PCA}, and how both are special cases of a more general learning problem, that of learning the principal eigenfunctions of a kernel, when the functions are from a Hilbert space whose inner product is defined with respect to a density model. This suggests a new approach to unsupervised learning in which abstractions (such as manifolds and clusters) that represent the main features of the data density are extracted. Abstractions discovered at one level can be used to build higher-level abstractions. This paper also discusses how these abstractions can be used to recover a quantitative model of the input density, which is at least useful for evaluative and comparative purposes.},
+       topics = {HighDimensional,Kernel,Unsupervised}, cat = {T},
+}
+
+@TECHREPORT{TR1234,
+       author = {Bengio, Yoshua and Grandvalet, Yves},
+        title = {No Unbiased Estimator of the Variance of K-Fold Cross-Validation},
+       number = {1234},
+         year = {2003},
+  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TR1234.pdf},
+     abstract = {Most machine learning researchers perform quantitative experiments to estimate generalization error and compare the performance of different algorithms (in particular, their proposed algorithm). In order to be able to draw statistically convincing conclusions, it is important for them to also estimate the uncertainty around the error (or error difference) estimate. This paper studies the very commonly used K-fold cross-validation estimator of generalization performance. The main theorem shows that there exists no universal (valid under all distributions) unbiased estimator of the variance of K-fold cross-validation. The analysis that accompanies this result is based on the eigen-decomposition of the covariance matrix of errors, which has only three different eigenvalues corresponding to three degrees of freedom of the matrix and three components of the total variance. This analysis helps to better understand the nature of the problem and how it can make na{\"{\i}}ve estimators (that don't take into account the error correlations due to the overlap between training and test sets) grossly underestimate variance. This is confirmed by numerical experiments in which the three components of the variance are compared when the difficulty of the learning problem and the number of folds are varied.},
+       topics = {Comparative}, cat = {T},
+}
+
+@TECHREPORT{tr1238,
+       author = {Bengio, Yoshua and Paiement, Jean-Fran{\c c}ois and Vincent, Pascal},
+        title = {Out-of-Sample Extensions for {LLE}, {Isomap}, {MDS}, {Eigenmaps}, and Spectral Clustering},
+       number = {1238},
+         year = {2003},
+  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/tr1238.pdf},
+     abstract = {Several unsupervised learning algorithms based on an eigendecomposition provide either an embedding or a clustering only for given training points, with no straightforward extension for out-of-sample examples short of recomputing eigenvectors. This paper provides algorithms for such an extension for Local Linear Embedding ({LLE}), Isomap, Laplacian Eigenmaps, Multi-Dimensional Scaling (all algorithms which provide lower-dimensional embedding for dimensionality reduction) as well as for Spectral Clustering (which performs non-Gaussian clustering). These extensions stem from a unified framework in which these algorithms are seen as learning eigenfunctions of a kernel. {LLE} and Isomap pose special challenges as the kernel is training-data dependent. Numerical experiments on real data show that the generalizations performed have a level of error comparable to the variability of the embedding algorithms to the choice of training data.},
+       topics = {HighDimensional,Kernel,Unsupervised}, cat = {T},
+}
+
+@TECHREPORT{tr1239,
+       author = {Bengio, Yoshua and Vincent, Pascal and Paiement, Jean-Fran{\c c}ois and Delalleau, Olivier and Ouimet, Marie and Le Roux, Nicolas},
+        title = {Spectral Clustering and Kernel {PCA} are Learning Eigenfunctions},
+       number = {1239},
+         year = {2003},
+  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/tr1239.pdf},
+     abstract = {In this paper, we show a direct equivalence between spectral clustering and kernel {PCA}, and how both are special cases of a more general learning problem, that of learning the principal eigenfunctions of a kernel, when the functions are from a function space whose scalar product is defined with respect to a density model. This defines a natural mapping for new data points, for methods that only provided an embedding, such as spectral clustering and Laplacian eigenmaps. The analysis hinges on a notion of generalization for embedding algorithms based on the estimation of underlying eigenfunctions, and suggests ways to improve this generalization by smoothing the data empirical distribution.},
+       topics = {HighDimensional,Kernel,Unsupervised}, cat = {T},
+}
+
+@TECHREPORT{tr1240,
+       author = {Vincent, Pascal and Bengio, Yoshua},
+        title = {Locally Weighted Full Covariance {Gaussian} Density Estimation},
+       number = {1240},
+         year = {2003},
+  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/tr1240.pdf},
+     abstract = {We describe an interesting application of the principle of local learning to density estimation. Locally weighted fitting of a Gaussian with a regularized full covariance matrix yields a density estimator which displays improved behavior in the case where much of the probability mass is concentrated along a low dimensional manifold. While the proposed estimator is not guaranteed to integrate to 1 with a finite sample size, we prove asymptotic convergence to the true density. Experimental results illustrating the advantages of this estimator over classic non-parametric estimators are presented.},
+       topics = {HighDimensional,Kernel,Unsupervised}, cat = {T},
+}
+
+@TECHREPORT{tr1247,
+       author = {Bengio, Yoshua and Delalleau, Olivier and Le Roux, Nicolas},
+        title = {Efficient Non-Parametric Function Induction in Semi-Supervised Learning},
+       number = {1247},
+         year = {2004},
+  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/tr1247.pdf},
+     abstract = {There has been an increase of interest for semi-supervised learning recently, because of the many datasets with large amounts of unlabeled examples and only a few labeled ones. This paper follows up on proposed non-parametric algorithms which provide an estimated continuous label for the given unlabeled examples. It extends them to function induction algorithms that correspond to the minimization of a regularization criterion applied to an out-of-sample example, and happens to have the form of a Parzen windows regressor. The advantage of the extension is that it allows predicting the label for a new example without having to solve again a linear system of dimension n (the number of unlabeled and labeled training examples), which can cost $O(n^3)$. Experiments show that the extension works well, in the sense of predicting a label close to the one that would have been obtained if the test example had been included in the unlabeled set. This relatively efficient function induction procedure can also be used when n is large to approximate the solution by writing it only in terms of a kernel expansion with $m \ll n$ terms, and reducing the linear system to m equations in m unknowns.},
+       topics = {Kernel,Unsupervised}, cat = {T},
+}
+
+@TECHREPORT{tr1250,
+       author = {Bengio, Yoshua and Monperrus, Martin},
+        title = {Discovering shared structure in manifold learning},
+       number = {1250},
+         year = {2004},
+  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/tr-tangent.pdf},
+     abstract = {We claim and present arguments to the effect that a large class of manifold learning algorithms that are essentially local will suffer from at least four generic problems associated with (1) noise in the data, (2) curvature of the manifold, (3) dimensionality of the manifold, and (4) the presence of many manifolds with little data per manifold. This analysis suggests non-local manifold learning algorithms which attempt to discover shared structure in the tangent planes at different positions. A criterion for such an algorithm is proposed and experiments estimating a tangent plane prediction function are presented. The function has parameters that are shared across space rather than estimated based on the local neighborhood, as in current non-parametric manifold learning algorithms. The results show clearly the advantages of this approach with respect to local manifold learning algorithms.},
+       topics = {HighDimensional,Kernel,Unsupervised}, cat = {T},
+}
+
+@TECHREPORT{tr1252,
+       author = {Bengio, Yoshua and Larochelle, Hugo},
+        title = {Implantation et analyse d'un mod{\`{e}}le graphique {\`{a}} entra{\^{\i}}nement supervis{\'{e}}, semi-supervis{\'{e}} et non-supervis{\'{e}} pour la d{\'{e}}sambigu{\"{\i}}sation s{\'{e}}mantique},
+       number = {1252},
+         year = {2004},
+  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/tr1252.pdf},
+     abstract = {La d{\'{e}}sambigu{\"{\i}}sation s{\'{e}}mantique est un sujet qui suscite beaucoup d'int{\'{e}}r{\^{e}}t dans la communaut{\'{e}} scientifique en apprentissage automatique. Quoique cette t{\^{a}}che ait {\'{e}}t{\'{e}} abord{\'{e}}e depuis les d{\'{e}}buts du traitement automatique de la langue, peu de progr{\`{e}}s ont {\'{e}}t{\'{e}} accomplis jusqu'{\`{a}} maintenant. Nous pr{\'{e}}sentons ici une application de d{\'{e}}sambigu{\"{\i}}sation bas{\'{e}}e sur un mod{\`{e}}le graphique probabiliste. Ce mod{\`{e}}le a {\'{e}}t{\'{e}} appris sur des donn{\'{e}}es {\'{e}}tiquet{\'{e}}es, non-{\'{e}}tiquet{\'{e}}es, et sur la hi{\'{e}}rarchie WordNet. Avec peu d'exemples d'apprentissage, ses performances sont comparables {\`{a}} celles de l'algorithme de Bayes na{\"{\i}}f. Il pourrait {\'{e}}ventuellement {\^{e}}tre adapt{\'{e}} {\`{a}} des corpus bi-textes.},
+       topics = {Unsupervised,Language}, cat = {T},
+}
+
+@TECHREPORT{tr1281,
+       author = {Le Roux, Nicolas and Bengio, Yoshua},
+        title = {Continuous Neural Networks},
+       number = {1281},
+         year = {2006},
+  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/continuous_nnet_tr1281.pdf},
+     abstract = {This article extends neural networks to the case of an uncountable number of hidden units, in several
+ways. In the first approach proposed, a finite parametrization is possible, allowing gradient-based
+learning. While having the same number of parameters as an ordinary neural network, its internal
+structure suggests that it can represent some smooth functions much more compactly. Under mild
+assumptions, we also find better error bounds than with ordinary neural networks. Furthermore, this
+parametrization may help reducing the problem of saturation of the neurons. In a second approach, the
+input-to-hidden weights are fully non-parametric, yielding a kernel machine for which we demonstrate
+a simple kernel formula. Interestingly, the resulting kernel machine can be made hyperparameter-free
+and still generalizes in spite of an absence of explicit regularization.},
+          cat = {T}, topics = {Kernel,HighDimensional},
+}
+
+@TECHREPORT{tr1282,
+       author = {Bengio, Yoshua and Lamblin, Pascal and Popovici, Dan and Larochelle, Hugo},
+        title = {Greedy Layer-Wise Training of Deep Networks},
+       number = {1282},
+         year = {2006},
+  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/dbn_supervised_tr1282.pdf},
+     abstract = {Deep multi-layer neural networks have many levels of non-linearities, which allows them to potentially
+represent very compactly highly non-linear and highly-varying functions. However, until recently it
+was not clear how to train such deep networks, since gradient-based optimization starting from random
+initialization appears to often get stuck in poor solutions. Hinton et al. recently introduced a greedy
+layer-wise unsupervised learning algorithm for Deep Belief Networks (DBN), a generative model with
+many layers of hidden causal variables. In the context of the above optimization problem, we study
+this algorithm empirically and explore variants to better understand its success and extend it to cases
+where the inputs are continuous or where the structure of the input distribution is not revealing enough
+about the variable to be predicted in a supervised task.},
+          cat = {T}, topics = {HighDimensional,Unsupervised},
+}
+
+@TECHREPORT{tr1283,
+       author = {Carreau, Julie and Bengio, Yoshua},
+        title = {A Hybrid {Pareto} Model for Asymmetric Fat-Tail Data},
+       number = {1283},
+         year = {2006},
+  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/fat_tails_tr1283.pdf},
+     abstract = {We propose an estimator for the conditional density $p(Y \mid X)$ that can adapt for asymmetric heavy tails
+which might depend on X. Such estimators have important applications in finance and insurance. We
+draw from Extreme Value Theory the tools to build a hybrid unimodal density having a parameter
+controlling the heaviness of the upper tail. This hybrid is a Gaussian whose upper tail has been
+replaced by a generalized {Pareto} tail. We use this hybrid in a multi-modal mixture in order to obtain
+a nonparametric density estimator that can easily adapt for heavy tailed data. To obtain a conditional
+density estimator, the parameters of the mixture estimator can be seen as functions of X and these
+functions learned. We show experimentally that this approach better models the conditional density in
+terms of likelihood than compared competing algorithms: conditional mixture models with other types
+of components and multivariate nonparametric models.},
+          cat = {T}, topics = {Unsupervised,Mining},
+}
+
+@TECHREPORT{tr1284,
+       author = {Larochelle, Hugo and Bengio, Yoshua},
+        title = {Distributed Representation Prediction for Generalization to New Words},
+       number = {1284},
+         year = {2006},
+  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/dist_rep_pred_tr1284.pdf},
+     abstract = {Learning distributed representations of symbols (e.g. words) has been used in several Natural Language Processing
+systems. Such representations can capture semantic or syntactic similarities between words, which permit to fight
+the curse of dimensionality when considering sequences of such words. Unfortunately, because these representations
+are learned only for a previously determined vocabulary of words, it is not clear how to obtain representations
+for new words. We present here an approach which gets around this problem by considering the distributed representations
+as predictions from low-level or domain-knowledge features of words. We report experiments on a Part
+Of Speech tagging task, which demonstrates the success of this approach in learning meaningful representations and
+in providing improved accuracy, especially for new words.},
+          cat = {T}, topics = {HighDimensional,Language},
+}
+
+@TECHREPORT{tr1285,
+       author = {Grandvalet, Yves and Bengio, Yoshua},
+        title = {Hypothesis Testing for Cross-Validation},
+       number = {1285},
+         year = {2006},
+  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/xv_rho_stat_tr1285.pdf},
+     abstract = {K-fold cross-validation produces variable estimates, whose variance cannot be estimated unbiasedly. However, in practice, one would like to
+provide a figure related to the variability of this estimate. The first part
+of this paper lists a series of restrictive assumptions (on the distribution of
+cross-validation residuals) that allow to derive unbiased estimates. We exhibit three such estimates, corresponding to differing assumptions. Their
+similar expressions however entail almost identical empirical behaviors.
+Then, we look for a conservative test for detecting significant differences
+in performances between two algorithms. Our proposal is based on the
+derivation of the form of a t-statistic parametrized by the correlation of
+residuals between each validation set. Its calibration is compared to the
+usual t-test. While the latter is overconfident in concluding that differences are indeed significant, our test is bound to be more skeptical, with
+smaller type-I error.},
+          cat = {T}, topics = {ModelSelection,Comparative},
+}
+
+@TECHREPORT{tr1286,
+       author = {Erhan, Dumitru and Bengio, Yoshua and {L'Heureux}, Pierre-Jean and Yue, Shi Yi},
+        title = {Generalizing to a Zero-Data Task: a Computational Chemistry Case Study},
+       number = {1286},
+         year = {2006},
+  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+          url = {http://www.iro.umontreal.ca/~lisa/pointeurs/mt_qsar_tr1286.pdf},
+     abstract = {We investigate the problem of learning several tasks simultaneously in order to transfer the acquired
+knowledge to a completely new task for which no training data are available. Assuming that the tasks
+share some representation that we can discover efficiently, such a scenario should lead to a better model of
+the new task, as compared to the model that is learned by only using the knowledge of the new task. We
+have evaluated several supervised learning algorithms in order to discover shared representations among
+the tasks defined in a computational chemistry/drug discovery problem. We have cast the problem from
+a statistical learning point of view and set up the general hypotheses that have to be tested in order
+to validate the multi-task learning approach. We have then evaluated the performance of the learning
+algorithms and showed that it is indeed possible to learn a shared representation of the tasks that allows
+to generalize to a new task for which no training data are available. From a theoretical point of view,
+our contribution also comprises a modification to the Support Vector Machine algorithm, which can
+produce state-of-the-art results using multi-task learning concepts at its core. From a practical point
+of view, our contribution is that this algorithm can be readily used by pharmaceutical companies for
+virtual screening campaigns.},
+          cat = {T}, topics = {MultiTask,Kernel,Bioinformatic},
+}
+
+@INPROCEEDINGS{Turian+al-2009,
+     author = {Turian, Joseph and Bergstra, James and Bengio, Yoshua},
+      title = {Quadratic Features and Deep Architectures for Chunking},
+  booktitle = {North American Chapter of the Association for Computational Linguistics -- Human Language Technologies (NAACL HLT)},
+       year = {2009},
+   abstract = {We experiment with several chunking models. Deeper architectures achieve better generalization. Quadratic filters, a simplification of theoretical model of V1 complex cells, reliably increase accuracy. In fact, logistic regression with quadratic filters outperforms a standard single hidden layer neural network. Adding quadratic filters to logistic regression is almost as effective as feature engineering. Despite predicting each output label independently, our model is competitive with ones that use previous decisions.}
+}
+
+@INPROCEEDINGS{Turian+al-2010,
+     author = {Turian, Joseph and Ratinov, Lev and Bengio, Yoshua and Roth, Dan},
+      title = {A preliminary evaluation of word representations for named-entity recognition},
+  booktitle = {NIPS Workshop on Grammar Induction, Representation of Language and Language Learning},
+       year = {2009},
+        url = {http://www.iro.umontreal.ca/~lisa/pointeurs/wordrepresentations-ner.pdf},
+   abstract = {We use different word representations as word features for a named-entity recognition (NER) system with a linear model. This work is part of a larger empirical survey, evaluating different word representations on different NLP tasks. We evaluate Brown clusters, Collobert and Weston (2008) embeddings, and HLBL (Mnih \& Hinton, 2009) embeddings of words. All three representations improve accuracy on NER, with the Brown clusters providing a larger improvement than the two embeddings, and the HLBL embeddings more than the Collobert and Weston (2008) embeddings. We also discuss some of the practical issues in using embeddings as features. Brown clusters are simpler than embeddings because they require less hyperparameter tuning.}
+}
+
+@INPROCEEDINGS{Turian+Ratinov+Bengio-2010,
+     author = {Turian, Joseph and Ratinov, Lev and Bengio, Yoshua},
+      title = {Word representations: A simple and general method for semi-supervised learning},
+  booktitle = {Association for Computational Linguistics (ACL 2010)},
+       year = {2010}
+}
+
+@inproceedings{Vincent-Bengio-2003,
+  crossref = {NIPS15-shorter},
+  title    = {Manifold Parzen Windows},
+  author   = {Vincent, Pascal and Bengio, Yoshua},
+  pages    = {825--832},
+  year     = {2003},
+  topics   = {HighDimensional,Kernel,Unsupervised}, cat = {C},
+  abstract = {The similarity between objects is a fundamental element of many learning algorithms. Most non-parametric methods take this similarity to be fixed, but much recent work has shown the advantages of learning it, in particular to exploit the local invariances in the data or to capture the possibly non-linear manifold on which most of the data lies. We propose a new non-parametric kernel density estimation method which captures the local structure of an underlying manifold through the leading eigenvectors of regularized local covariance matrices. Experiments in density estimation show significant improvements with respect to Parzen density estimators. The density estimators can also be used within Bayes classifiers, yielding classification rates similar to {SVM}s and much superior to the Parzen classifier.}
+}
+
+@TECHREPORT{Vincent-TR1316,
+       author = {Vincent, Pascal and Larochelle, Hugo and Bengio, Yoshua and Manzagol, Pierre-Antoine},
+        title = {Extracting and Composing Robust Features with Denoising Autoencoders},
+       number = {1316},
+         year = {2008},
+  institution = {D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+          url = {http://www.iro.umontreal.ca/~vincentp/Publications/denoising_autoencoders_tr1316.pdf},
+     abstract = {Previous work has shown that the difficulties in learning deep generative or discriminative models can be overcome by an initial unsupervised learning step that maps inputs to useful intermediate representations. We introduce and motivate a new training principle for unsupervised learning of a representation based on the idea of making the learned representations robust to partial corruption of the input pattern. This approach can be used to train autoencoders, and these denoising autoencoders can be stacked to initialize deep architectures. The algorithm can be motivated from a manifold learning and information theoretic perspective or from a generative model perspective. Comparative experiments clearly show the surprising advantage of corrupting the input of autoencoders on a pattern classification benchmark suite.}
+}
+
+@PHDTHESIS{Vincent2003,
+    author = {Vincent, Pascal},
+     title = {Mod{\`{e}}les {\`{a}} Noyaux {\`{a}} Structure Locale},
+      year = {2003},
+    school = {D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+}
+
+@ARTICLE{vincent:2001,
+    author = {Vincent, Pascal and Bengio, Yoshua},
+     title = {Kernel Matching Pursuit},
+   journal = {Machine Learning},
+      year = {2001},
+  abstract = {We show how Matching Pursuit can be used to build kernel-based solutions to machine-learning problems while keeping control of the sparsity of the solution, and how it can be extended to use non-squared error loss functions. We also derive MDL motivated generalization bounds for this type of algorithm. Finally, links to boosting algorithms and {RBF} training procedures, as well as extensive experimental comparison with {SVM}s are given, showing comparable results with typically sparser models.},
+topics={HighDimensional,Kernel},cat={J},
+}
+
+@INPROCEEDINGS{VincentPLarochelleH2008,
+    author = {Vincent, Pascal and Larochelle, Hugo and Bengio, Yoshua and Manzagol, Pierre-Antoine},
+     title = {Extracting and Composing Robust Features with Denoising Autoencoders},
+      year = {2008},
+     pages = {1096--1103},
+  crossref = {ICML08-shorter},
+  abstract = {Previous work has shown that the difficulties in learning deep generative or discriminative models can be overcome by an initial unsupervised learning step that maps inputs to useful intermediate representations.
+We introduce and motivate a new training principle for unsupervised learning of a representation based on the idea of making the learned representations robust to partial corruption of the input pattern.
+This approach can be used to train autoencoders, and these denoising autoencoders can be stacked to initialize deep architectures.
+The algorithm can be motivated from a manifold learning and information theoretic perspective or from a generative model perspective.
+Comparative experiments clearly show the surprising advantage of corrupting the input of autoencoders on a pattern classification benchmark suite.}
+}
+
+@techreport{visualization_techreport,
+  title       = {Visualizing Higher-Layer Features of a Deep Network},
+  author      = {Erhan, Dumitru and Bengio, Yoshua and Courville, Aaron and Vincent, Pascal},
+  institution = {University of Montreal},
+  number      = {1341},
+  year        = {2009},
+  abstract    = {Deep architectures have demonstrated state-of-the-art results in a variety of
+settings, especially with vision datasets. Beyond the model definitions and the quantitative analyses, there is a need for qualitative comparisons of the solutions learned by various deep architectures. The goal of this paper is to find good qualitative interpretations of high level features represented by such models. To this end, we contrast and compare several techniques applied on Stacked Denoising Autoencoders and Deep Belief Networks, trained on several vision datasets. We show that, perhaps counter-intuitively, such interpretation is possible at the unit level, that it is simple to accomplish and that the results are consistent across various techniques. We hope that such techniques will allow researchers in deep architectures to understand more of how and why deep architectures work}
+}
+
+@inproceedings{xAISTATS2009-short,
+  booktitle = {Proc. AISTATS'2009},
+  title     = {Proc. AISTATS'2009},
+  year      = {2009}
+}
+
+
+@misc{Yoshua+al-snowbird-2008,
+  title        = {Deep Woods},
+  author       = {Bengio, Yoshua and Larochelle, Hugo and Turian, Joseph},
+  howpublished = {Poster presented at the Learning@Snowbird Workshop, Snowbird, USA, 2008},
+  year         = {2008}
+}
+
+@ARTICLE{Zaccaro-et-al-2005,
+    author = {Zaccaro, Maria Clara and Lee, Hong Boon and Pattarawarapan, Mookda and Xia, Zebin and Caron, Antoine and {L'Heureux}, Pierre-Jean and Bengio, Yoshua and Burgess, Kevin and Saragovi, H. Uri},
+     title = {Selective Small Molecule Peptidomimetic Ligands of {TrkC} and {TrkA} Receptors Afford Discrete or Complete Neurotrophic Activities},
+   journal = {Chemistry \& Biology},
+    volume = {12},
+    number = {9},
+      year = {2005},
+     pages = {1015--1028}
+}
+
+
+
+crossreferenced publications: 
+@inproceedings{ICML09,
+  booktitle = {Proceedings of the Twenty-sixth International Conference on Machine Learning (ICML'09)},
+  title     = {Proceedings of the Twenty-sixth International Conference on Machine Learning (ICML'09)},
+  editor    = {Bottou, {L{\'{e}}on} and Littman, Michael},
+  publisher = {ACM},
+  year      = {-1}
+}
+
+@inproceedings{NIPS7,
+  booktitle = {Advances in Neural Information Processing Systems 7 (NIPS'94)},
+  title     = {Advances in Neural Information Processing Systems 7 (NIPS'94)},
+  editor    = {Tesauro, G. and Touretzky, D. S. and Leen, T. K.},
+  publisher = {MIT Press},
+  year      = {-1}
+}
+
+@inproceedings{NIPS6,
+  booktitle = {Advances in Neural Information Processing Systems 6 (NIPS'93)},
+  title     = {Advances in Neural Information Processing Systems 6 (NIPS'93)},
+  editor    = {Cowan, J. D. and Tesauro, G. and Alspector, J.},
+  publisher = {MIT Press},
+  year      = {-1}
+}
+
+@inproceedings{NIPS8,
+  booktitle = {Advances in Neural Information Processing Systems 8 (NIPS'95)},
+  title     = {Advances in Neural Information Processing Systems 8 (NIPS'95)},
+  editor    = {Touretzky, D. S. and Mozer, M. and Hasselmo, M.E.},
+  publisher = {MIT Press},
+  year      = {-1}
+}
+
+@article{JMLR,
+  year    = {-1},
+  journal = {Journal of Machine Learning Research}
+}
+
+@inproceedings{NIPS19,
+  booktitle = {Advances in Neural Information Processing Systems 19 (NIPS'06)},
+  title     = {Advances in Neural Information Processing Systems 19 (NIPS'06)},
+  editor    = {{Sch{\"{o}}lkopf}, Bernhard and Platt, John and Hoffman, Thomas},
+  publisher = {MIT Press},
+  year      = {-1}
+}
+
+@inproceedings{NIPS10,
+  booktitle = {Advances in Neural Information Processing Systems 10 (NIPS'97)},
+  title     = {Advances in Neural Information Processing Systems 10 (NIPS'97)},
+  editor    = {Jordan, M.I. and Kearns, M.J. and Solla, S.A.},
+  publisher = {MIT Press},
+  year      = {-1}
+}
+
+@inproceedings{NIPS1,
+  booktitle = {Advances in Neural Information Processing Systems 1 (NIPS'88)},
+  title     = {Advances in Neural Information Processing Systems 1 (NIPS'88)},
+  editor    = {Touretzky, D. S.},
+  publisher = {Morgan Kaufmann},
+  year      = {-1}
+}
+
+@inproceedings{NIPS2,
+  booktitle = {Advances in Neural Information Processing Systems 2 (NIPS'89)},
+  title     = {Advances in Neural Information Processing Systems 2 (NIPS'89)},
+  editor    = {Touretzky, D. S.},
+  publisher = {Morgan Kaufmann},
+  year      = {-1}
+}
+
+@INPROCEEDINGS{NIPS4,
+     editor = {Moody, J. E. and Hanson, S. J. and Lippmann, R. P.},
+      title = {Advances in Neural Information Processing Systems 4 (NIPS'91)},
+  booktitle = {Advances in Neural Information Processing Systems 4 (NIPS'91)},
+       year = {-1},
+  publisher = {Morgan Kaufmann}
+}
+
+@INPROCEEDINGS{NIPS12,
+     editor = {Solla, S.A. and Leen, T. K. and M{\"{u}}ller, K.-R.},
+      title = {Advances in Neural Information Processing Systems 12 (NIPS'99)},
+  booktitle = {Advances in Neural Information Processing Systems 12 (NIPS'99)},
+       year = {-1},
+  publisher = {MIT Press}
+}
+
+@INPROCEEDINGS{NIPS16,
+     editor = {Thrun, Sebastian and Saul, L. and {Sch{\"{o}}lkopf}, Bernhard},
+      title = {Advances in Neural Information Processing Systems 16 (NIPS'03)},
+  booktitle = {Advances in Neural Information Processing Systems 16 (NIPS'03)},
+       year = {-1}
+}
+
+@inproceedings{NIPS22,
+  booktitle = {Advances in Neural Information Processing Systems 22 (NIPS'09)},
+  title     = {Advances in Neural Information Processing Systems 22 (NIPS'09)},
+  editor    = {Bengio, Yoshua and Schuurmans, Dale and Williams, Christopher and Lafferty, John and Culotta, Aron},
+  year      = {-1}
+}
+
+@inproceedings{NIPS20,
+  booktitle = {Advances in Neural Information Processing Systems 20 (NIPS'07)},
+  title     = {Advances in Neural Information Processing Systems 20 (NIPS'07)},
+  editor    = {Platt, John and Koller, D. and Singer, Yoram and Roweis, S.},
+  publisher = {MIT Press},
+  year      = {-1}
+}
+
+@inproceedings{xAISTATS2009,
+  booktitle = {Proceedings of the Twelfth International Conference on Artificial Intelligence and Statistics (AISTATS 2009)},
+  title     = {Proceedings of the Twelfth International Conference on Artificial Intelligence and Statistics (AISTATS 2009)},
+  year      = {2009}
+}
+
+@inproceedings{NIPS9,
+  booktitle = {Advances in Neural Information Processing Systems 9 (NIPS'96)},
+  title     = {Advances in Neural Information Processing Systems 9 (NIPS'96)},
+  editor    = {Mozer, M. and Jordan, M.I. and Petsche, T.},
+  publisher = {MIT Press},
+  year      = {-1}
+}
+
+@inproceedings{NIPS17,
+  booktitle = {Advances in Neural Information Processing Systems 17 (NIPS'04)},
+  title     = {Advances in Neural Information Processing Systems 17 (NIPS'04)},
+  editor    = {Saul, Lawrence K. and Weiss, Yair and Bottou, {L{\'{e}}on}},
+  year      = {-1}
+}
+
+@inproceedings{ICML08,
+  booktitle = {Proceedings of the Twenty-fifth International Conference on Machine Learning (ICML'08)},
+  title     = {Proceedings of the Twenty-fifth International Conference on Machine Learning (ICML'08)},
+  editor    = {Cohen, William W. and McCallum, Andrew and Roweis, Sam T.},
+  publisher = {ACM},
+  year      = {-1}
+}
+
+@inproceedings{ICML07,
+  booktitle = {Proceedings of the 24th International Conference on Machine Learning (ICML'07)},
+  title     = {Proceedings of the 24th International Conference on Machine Learning (ICML'07)},
+  editor    = {Ghahramani, Zoubin},
+  publisher = {ACM},
+  year      = {-1}
+}
+
+@techreport{DIRO,
+  institution = {D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
+  title       = {DIRO},
+  year        = {-1}
+}
+
+@inproceedings{NIPS18,
+  booktitle = {Advances in Neural Information Processing Systems 18 (NIPS'05)},
+  title     = {Advances in Neural Information Processing Systems 18 (NIPS'05)},
+  editor    = {Weiss, Yair and {Sch{\"{o}}lkopf}, Bernhard and Platt, John},
+  publisher = {MIT Press},
+  year      = {-1}
+}
+
+@INPROCEEDINGS{NIPS13,
+     editor = {Leen, T. K. and Dietterich, T.G. and Tresp, V.},
+      title = {Advances in Neural Information Processing Systems 13 (NIPS'00)},
+  booktitle = {Advances in Neural Information Processing Systems 13 (NIPS'00)},
+       year = {-1},
+  publisher = {MIT Press}
+}
+
+@INPROCEEDINGS{ICML05,
+     editor = {De Raedt, Luc and Wrobel, Stefan},
+      title = {Proceedings of the Twenty-second International Conference on Machine Learning (ICML'05)},
+  booktitle = {Proceedings of the Twenty-second International Conference on Machine Learning (ICML'05)},
+       year = {-1},
+  publisher = {ACM}
+}
+
+@INPROCEEDINGS{ICML06,
+     editor = {Cohen, William W. and Moore, Andrew},
+      title = {Proceedings of the Twenty-third International Conference on Machine Learning (ICML'06)},
+  booktitle = {Proceedings of the Twenty-third International Conference on Machine Learning (ICML'06)},
+       year = {-1},
+  publisher = {ACM}
+}
+
+@INPROCEEDINGS{NIPS15,
+     editor = {Becker, S. and Thrun, Sebastian and Obermayer, K.},
+      title = {Advances in Neural Information Processing Systems 15 (NIPS'02)},
+  booktitle = {Advances in Neural Information Processing Systems 15 (NIPS'02)},
+       year = {-1},
+  publisher = {MIT Press}
+}
+
+@inproceedings{ICML01-shorter,
+  booktitle = {ICML'01},
+  title     = {ICML'01},
+  publisher = {Morgan Kaufmann},
+  year      = {-1}
+}
+@inproceedings{ICML02-shorter,
+  booktitle = {ICML'02},
+  title     = {ICML'02},
+  publisher = {Morgan Kaufmann},
+  year      = {-1}
+}
+@inproceedings{ICML03-shorter,
+  booktitle = {ICML'03},
+  title     = {ICML'03},
+  publisher = {AAAI Press},
+  year      = {-1}
+}
+@inproceedings{ICML04-shorter,
+  booktitle = {ICML'04},
+  title     = {ICML'04},
+  publisher = {ACM},
+  year      = {-1}
+}
+@inproceedings{ICML05-shorter,
+  booktitle = {ICML'05},
+  title     = {ICML'05},
+  publisher = {ACM},
+  year      = {-1}
+}
+@inproceedings{ICML06-shorter,
+  booktitle = {ICML'06},
+  title     = {ICML'06},
+  publisher = {ACM},
+  year      = {-1}
+}
+@inproceedings{ICML07-shorter,
+  booktitle = {ICML'07},
+  title     = {ICML'07},
+  publisher = {ACM},
+  year      = {-1}
+}
+@inproceedings{ICML08-shorter,
+  booktitle = {ICML'08},
+  title     = {ICML'08},
+  publisher = {ACM},
+  year      = {-1}
+}
+@inproceedings{ICML09-shorter,
+  booktitle = {ICML'09},
+  title     = {ICML'09},
+  publisher = {ACM},
+  year      = {-1}
+}
+@inproceedings{ICML96-shorter,
+  booktitle = {ICML'96},
+  title     = {ICML'96},
+  publisher = {Morgan Kaufmann},
+  year      = {-1}
+}
+@inproceedings{ICML97-shorter,
+  booktitle = {ICML'97},
+  title     = {ICML'97},
+  publisher = {Morgan Kaufmann},
+  year      = {-1}
+}
+@inproceedings{ICML98-shorter,
+  booktitle = {ICML'98},
+  title     = {ICML'98},
+  publisher = {Morgan Kaufmann},
+  year      = {-1}
+}
+@inproceedings{ICML99-shorter,
+  booktitle = {ICML'99},
+  title     = {ICML'99},
+  publisher = {Morgan Kaufmann},
+  year      = {-1}
+}
+@article{JMLR-shorter,
+  year    = {-1},
+  journal = {JMLR}
+}
+@inproceedings{NIPS1-shorter,
+  booktitle = {NIPS 1},
+  title     = {NIPS'88},
+  publisher = {Morgan Kaufmann},
+  year      = {-1}
+}
+@inproceedings{NIPS10-shorter,
+  booktitle = {NIPS 10},
+  title     = {NIPS'97},
+  publisher = {MIT Press},
+  year      = {-1}
+}
+@inproceedings{NIPS11-shorter,
+  booktitle = {NIPS 11},
+  title     = {NIPS'98},
+  publisher = {MIT Press},
+  year      = {-1}
+}
+@inproceedings{NIPS12-shorter,
+  booktitle = {NIPS 12},
+  title     = {NIPS'99},
+  publisher = {MIT Press},
+  year      = {-1}
+}
+@inproceedings{NIPS13-shorter,
+  booktitle = {NIPS 13},
+  title     = {NIPS'00},
+  publisher = {MIT Press},
+  year      = {-1}
+}
+@inproceedings{NIPS14-shorter,
+  booktitle = {NIPS 14},
+  title     = {NIPS'01},
+  publisher = {MIT Press},
+  year      = {-1}
+}
+@inproceedings{NIPS15-shorter,
+  booktitle = {NIPS 15},
+  title     = {NIPS'02},
+  publisher = {MIT Press},
+  year      = {-1}
+}
+@inproceedings{NIPS16-shorter,
+  booktitle = {NIPS 16},
+  title     = {NIPS'03},
+  year      = {-1}
+}
+@inproceedings{NIPS17-shorter,
+  booktitle = {NIPS 17},
+  title     = {NIPS'04},
+  year      = {-1}
+}
+@inproceedings{NIPS18-shorter,
+  booktitle = {NIPS 18},
+  title     = {NIPS'05},
+  publisher = {MIT Press},
+  year      = {-1}
+}
+@inproceedings{NIPS19-shorter,
+  booktitle = {NIPS 19},
+  title     = {NIPS'06},
+  publisher = {MIT Press},
+  year      = {-1}
+}
+@inproceedings{NIPS2-shorter,
+  booktitle = {NIPS 2},
+  title     = {NIPS'89},
+  publisher = {Morgan Kaufmann},
+  year      = {-1}
+}
+@inproceedings{NIPS20-shorter,
+  booktitle = {NIPS 20},
+  title     = {NIPS'07},
+  publisher = {MIT Press},
+  year      = {-1}
+}
+@INPROCEEDINGS{NIPS21-shorter,
+      title = {NIPS'08},
+  booktitle = {NIPS 21},
+       year = {-1},
+  publisher = {NIPS Foundation (http://books.nips.cc)}
+}
+@inproceedings{NIPS22-shorter,
+  booktitle = {NIPS 22},
+  title     = {NIPS'09},
+  year      = {-1}
+}
+@inproceedings{NIPS3-shorter,
+  booktitle = {NIPS 3},
+  title     = {NIPS'90},
+  publisher = {Morgan Kaufmann},
+  year      = {-1}
+}
+@inproceedings{NIPS4-shorter,
+  booktitle = {NIPS 4},
+  title     = {NIPS'91},
+  publisher = {Morgan Kaufmann},
+  year      = {-1}
+}
+@inproceedings{NIPS5-shorter,
+  booktitle = {NIPS 5},
+  title     = {NIPS'92},
+  publisher = {Morgan Kaufmann},
+  year      = {-1}
+}
+@inproceedings{NIPS6-shorter,
+  booktitle = {NIPS 6},
+  title     = {NIPS'93},
+  publisher = {MIT Press},
+  year      = {-1}
+}
+@inproceedings{NIPS7-shorter,
+  booktitle = {NIPS 7},
+  title     = {NIPS'94},
+  publisher = {MIT Press},
+  year      = {-1}
+}
+@inproceedings{NIPS8-shorter,
+  booktitle = {NIPS 8},
+  title     = {NIPS'95},
+  publisher = {MIT Press},
+  year      = {-1}
+}
+@inproceedings{NIPS9-shorter,
+  booktitle = {NIPS 9},
+  title     = {NIPS'96},
+  publisher = {MIT Press},
+  year      = {-1}
+}
+@inproceedings{xAISTATS2009-shorter,
+  booktitle = {AISTATS'2009},
+  title     = {AISTATS'2009},
+  year      = {-1}
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/writeup/ift6266_ml.bib	Tue Jun 01 12:13:10 2010 -0400
@@ -0,0 +1,25821 @@
+%%WARNING: READ THE README FILE BEFORE ANY MODIFICATION!!!
+
+
+%%submitted papers
+%%%
+
+@article{Bergstra+Bengio+Louradoj-2008sub,
+  title   = {Suitability of Complex Cell Models for Object Categorization},
+  author  = {J. Bergstra and Y. Bengio and J. Louradour},
+  journal = {Computational Neuroscience},
+  note    = {Rejected.},
+  year    = {2008}
+}
+@article{Bergstra+Bengio+Louradoj-2009sub,
+  title   = {Suitability of Complex Cell Models for Object Categorization},
+  author  = {J. Bergstra and Y. Bengio and J. Louradour},
+  journal = {Neural Computation},
+  note    = {Submitted.},
+  year    = {2009}
+}
+@article{Chapados+Bengio-2008sub,
+  title   = {Forecasting and Trading Commodity Contract Spreads with {G}aussian Processes},
+  author  = {N. Chapados and Y. Bengio},
+  journal = {International Journal of Forecasting},
+  note    = {Submitted.},
+  year    = {2008}
+}
+@article{Chapados+Bengio-2008sub2,
+  title   = {Training Graphs of Learning Modules for Sequential Data},
+  author  = {N. Chapados and Y. Bengio},
+  journal = {ACM Transactions on Knowledge Discovery from Data},
+  note    = {Submitted.},
+  year    = {2008}
+}
+
+%%%
+%%accepted or published papers
+%%%
+
+@TechReport{Grother,
+  author = "Grother, Patrick J.",
+  title = "{NIST} special database. Handprinted forms and characters database",
+  institution = "National Institute of Standards and Technology",
+  year = "1995"
+}
+
+@InCollection{Trentin+al-2002,
+  author =       "E. Trentin and F. Brugnara and Y. Bengio and C. Furlanello and De Mori, R.",
+  editor =       "R. Daniloff",
+  booktitle =    "Connectionist Approaches to Clinical Problems in Speech
+and Language",
+  title =        "Statistical and Neural Network Models for Speech Recognition",
+  publisher =    "Lawrence Erlbaum",
+  pages =        "213--264",
+  year =         "2002",
+}
+
+@incollection{Bengio+grandvalet-2004,
+  title     = {Bias in Estimating the Variance of K-Fold Cross-Validation},
+  author    = {Y. Bengio and Y. Grandvalet},
+  booktitle = {Statistical Modeling and Analysis for Complex Data Problem},
+  editor    = {P. Duchesne and B. Remillard},
+  publisher = {Lawrence Erlbaum},
+  address   = {Kluwer},
+  pages     = {75--95},
+  year      = {2004}
+}
+
+@incollection{Dugas+al-2004,
+  title     = {Statistical Learning Algorithms Applied to Automobile Insurance Ratemaking},
+  author    = {C. Dugas and Y. Bengio and N. Chapados and P. Vincent and G. Denoncourt and C. Fournier},
+  booktitle = {Intelligent and Other Computational Techniques in Insurance: Theory and
+Applications},
+  editor    = {L. Jain and A.F. Shapiro},
+  publisher = {World Scientific Publishing Company},
+  year      = {2004}
+}
+
+@incollection{Dugas+al-2004-short,
+  title     = {Statistical Learning Algorithms Applied to Automobile Insurance Ratemaking},
+  author    = {C. Dugas and Y. Bengio and N. Chapados and P. Vincent and G. Denoncourt and C. Fournier},
+  booktitle = {Intelligent and Other Computational Techniques in Insurance: Theory and
+Applications},
+  publisher = {World Scientific Publishing Company},
+  year      = {2004}
+}
+
+@InProceedings{Collobert+Bengio+Bengio-2002b,
+  title     = {Scaling Large Learning Problems with Hard Parallel Mixtures},
+  author    = {R. Collobert and Y. Bengio and S. Bengio},
+  booktitle = SVM02,
+  editor    = {S.W. Lee and A. Verri},
+  volume    = {2388 of Lecture Notes in Computer Science},
+  publisher = {Springer-Verlag},
+  pages     = {8--23},
+  year      = {2002}
+}
+
+@Article{Collobert+Bengio+Bengio-2003,
+  author =       "R. Collobert and Y. Bengio and S. Bengio",
+  title =        "Scaling Large Learning Problems with Hard Parallel Mixtures",
+  journal =      ijprai,
+  volume =       "17",
+  number =       "3",
+  pages =        "349--365",
+  year =         "2003",
+}
+
+@Article{Collobert+Bengio+Bengio-2003-small,
+  author =       "R. Collobert and Y. Bengio and S. Bengio",
+  title =        "Scaling Large Learning Problems with Hard Parallel Mixtures",
+  journal =      "Int. J. Pattern Recognition and Artificial Intelligence",
+  volume =       "17(3)",
+  pages =        "349--365",
+  year =         "2003",
+}
+
+@inproceedings{Bengio+Chapados-2002,
+  title     = {Metric-based Model Selection for Time-Series Forecasting},
+  author    = {Y. Bengio and N. Chapados},
+  booktitle = NIPS12,
+  editor    = NIPS12ed,
+  publisher = {IEEE Press},
+  pages     = {13--24},
+  year      = {2002}
+}
+
+@InProceedings{Bengio+Takeuchi+Kanamori-2002,
+  author =       "Y. Bengio and I. Takeuchi and K. Kanamori",
+  title =        "The Challenge of Non-Linear Regression on Large Datasets with Asymmetric Heavy Tails",
+  publisher =    "American Statistical Association publ.",
+  booktitle =    JSM02,
+  year =         "2002",
+  pages = "193--205"
+}
+
+@inproceedings{Bengio+Takeuchi+Kanamori-2002-short,
+  title     = {The Challenge of Non-Linear Regression on Large Datasets with Asymmetric Heavy Tails},
+  author    = {Y. Bengio and I. Takeuchi and K. Kanamori},
+  booktitle = JSM02,
+  year      = {2002}
+}
+
+@InProceedings{Collobert+Bengio+Bengio-2002,
+  author =       "R. Collobert and S. Bengio and Y. Bengio",
+  title =        "A Parallel Mixture of {SVM}s for Very Large Scale Problems",
+  booktitle =    NIPS14,
+  editor =       NIPS14ed,
+  pages =        "633--640",
+  year =         "2002",
+}
+
+@inproceedings{Bhattacharya+Getoor+Bengio-2004,
+  title     = {Unsupervised Sense Disambiguation Using Bilingual Probabilistic Models},
+  author    = {I. Bhattacharya and L. Getoor and Y. Bengio},
+  booktitle = {Conference of the Association for Computational Linguistics (ACL'04)},
+  year      = {2004}
+}
+@InProceedings{Boufaden+Bengio+Lapalme-2008,
+  author =       "N. Boufaden and Y. Bengio and G. Lapalme",
+  booktitle =    "TALN'2004, Traitement Automatique du Langage Naturel",
+  title =        "Approche statistique pour le rep{\'e}rage de mots informatifs dans les textes oraux",
+  year =         "2004",
+}
+@InProceedings{Chapados+Bengio-2006,
+  author =       "N. Chapados and Y. Bengio",
+  booktitle =    AI06,
+  title =        "The K Best-Paths Approach to Approximate Dynamic Programming with Application to Portfolio Optimization",
+  pages =        "491--502",
+  year =         "2006",
+}
+@InProceedings{Rivest+Bengio+Kalaska-2005,
+  author =       "F. Rivest and Y. Bengio and J. Kalaska",
+  editor =       NIPS17ed,
+  booktitle =    NIPS17,
+  title =        "Brain Inspired Reinforcement Learning",
+  publisher =    "MIT Press",
+  address =      "Cambridge, MA",
+  pages =        "1129--1136",
+  year =         "2005",
+}
+
+@InProceedings{Bengio+Grandvalet-NIPS-2004,
+  author =       "Y. Bengio and Y. Grandvalet",
+  editor =       NIPS16ed,
+  booktitle =    NIPS16,
+  title =        "No Unbiased Estimator of the Variance of K-Fold Cross-Validation",
+  publisher =    "MIT Press",
+  address =      "Cambridge, MA",
+  year =         "2004",
+}
+
+@InProceedings{Bengio+Grandvalet-NIPS-2004-short,
+  author =       "Y. Bengio and Y. Grandvalet",
+  booktitle =    NIPS16,
+  title =        "No Unbiased Estimator of the Variance of K-Fold Cross-Validation",
+  publisher =    "MIT Press, Cambridge",
+  year =         "2004",
+}
+
+@Article{Zaccaro-et-al-2005,
+  title   = {Selective Small Molecule Peptidomimetic Ligands of {TrkC} and {TrkA} Receptors Afford Discrete or Complete Neurotrophic Activities},
+  author  = {Zaccaro, Maria Clara and Lee, Hong Boon and Pattarawarapan, Mookda and
+             Xia, Zebin and Caron, Antoine and L'Heureux, Pierre-Jean and Bengio, Yoshua
+             and Burgess, Kevin and Saragovi, H. Uri},
+  journal = {Chemistry \& Biology},
+  volume  = {12},
+  number  = {9},
+  pages   = {1015--1028},
+  year    = {2005}
+}
+
+@article{63a:man,
+  title   = {The variation of certain speculative prices},
+  author  = {B. Mandelbrot},
+  journal = {Journal of Business},
+  volume  = {36},
+  pages   = {394--419},
+  year    = {1963},
+  annote  = {Référence pour les distributions stables en finance}
+}
+
+@article{65a:fam,
+  title   = {The behavior of stock market prices},
+  author  = {E. F. Fama},
+  journal = {Journal of Business},
+  volume  = {38},
+  pages   = {34--105},
+  year    = {1965},
+  annote  = {Autre référence pour les distributions stables en
+             finance}
+}
+
+@article{96a:cor:gon:har,
+  title   = {On the {Lambert} {W} Function},
+  author  = {R. M. Corless and G. H. Gonnet and D. E. G. Hare and
+             D. J. Jeffrey and D. E. Knuth},
+  journal = {Advances in Computational Mathematics},
+  volume  = {5},
+  pages   = {329--359},
+  year    = {1996},
+  annote  = {Sert à résoudre les équations où une variable et son
+             logarithme (ou exponentielle) apparaissent
+             simultanément}
+}
+
+@Book{97b:emb:klu:mik,
+  author =       "P. Embrechts and C. Kl{\"u}ppelberg and T. Mikosch",
+  title =        "Modelling Extremal Events",
+  publisher =    "Springer",
+  year =         "1997",
+  series =       "Applications of Mathematics, Stochastic Modelling and
+                 Applied Probability",
+  annote =       "book on evt: theory, statistical methods for gev",
+}
+
+@article{99a:kan:ser,
+  title   = {Extreme values of phase-type and mixed random
+             variables with parallel-processing examples},
+  author  = {S. Kang and R. F. Serfozo},
+  journal = {Journal of Applied Probability},
+  volume  = {36},
+  pages   = {194--210},
+  year    = {1999},
+  annote  = {limiting distribution of the maximum of r.v. i.i.d
+             from a mixture is determined by the component of the
+             mixture that has a dominant tail}
+}
+
+@techreport{Abdallah+Plumbley-06,
+  title       = {Geometry Dependency Analysis},
+  author      = {Samer Abdallah and Mark Plumbley},
+  institution = {Center for Digital Music, Queen Mary, University of
+                 London},
+  number      = {C4DM-TR06-05},
+  year        = {2006}
+}
+
+@article{Abe+Warmuth92,
+  title   = {On the Computational Complexity of Approximating
+             Distributions by Probabilistic Automata},
+  author  = {N. Abe and M. K. Warmuth},
+  journal = {Machine Learning},
+  volume  = {9},
+  year    = {1992},
+  month   = jul
+}
+
+@article{Abu-Mostafa-hints,
+  title   = {Learning from Hints in Neural Networks},
+  author  = {Y. S. Abu-Mostafa},
+  journal = jcomp,
+  volume  = {6},
+  pages   = {192--198},
+  year    = {1990}
+}
+
+@article{Abu-Mostafa87,
+  title   = {Optical Neural Computers},
+  author  = {Y. S. Abu-Mostafa and D. Psaltis},
+  journal = sciam,
+  volume  = {256},
+  pages   = {88--95},
+  year    = {1987},
+  month   = mar
+}
+
+@article{Abu-Mostafa89,
+  title   = {The {Vapnik}-{Chervonenkis} Dimension: Information
+             versus Complexity in Learning},
+  author  = {Y. S. Abu-Mostafa},
+  journal = nc,
+  volume  = {1},
+  pages   = {312--317},
+  year    = {1989}
+}
+
+@article{abumostafa95,
+  title   = {Hints},
+  author  = {Yaser S. Abu-Mostafa},
+  journal = {Neural Computation},
+  volume  = {7},
+  number  = {4},
+  pages   = {639--671},
+  year    = {1995},
+  month   = jul
+}
+
+@Misc{Ackerman+BenDavid-2008,
+  title  = {Clustering Quality Measures},
+  author = {Margareta Ackerman and Shai Ben-David},
+  note   = {{\em Snowbird Learning Workshop}},
+  year   = {2008}
+}
+
+@article{Ackley85,
+  title   = {A Learning Algorithm for {Boltzmann} Machines},
+  author  = {D. H. Ackley and G. E. Hinton and T. J. Sejnowski},
+  journal = cogsci,
+  volume  = {9},
+  pages   = {147--169},
+  year    = {1985}
+}
+
+@InProceedings{Ackley90,
+  author =       "D. H. Ackley and M. S. Littman",
+  editor =       NIPS2ed,
+  booktitle =    NIPS2,
+  title =        "Generalization and Scaling in Reinforcement Learning",
+  publisher =    "Morgan Kaufmann",
+  address =      "San Mateo, CA",
+  pages =        "550--557",
+  year =         "1990",
+}
+
+@Article{ACM:Rohwer94,
+  author =       "R. Rohwer",
+  title =        "The time dimension of neural network models",
+  journal =      "ACM SIGART Bulletin",
+  volume =       "5",
+  number =       "3",
+  pages =        "36--44",
+  month =        jul,
+  year =         "1994",
+}
+
+@article{AdelsonBergen1985,
+    author={E. H. Adelson and J. R. Bergen},
+    title={Spatiotemporal Energy Models for the Perception of Motion},
+    journal={Journal of the Optical Society of America A},
+    volume=2,
+    number=2,
+    year=1985,
+    pages={284--299},
+}
+
+@Article{Agrawala70,
+  author =       {Ashok Kumar Agrawala},
+  title =        {Learning with a Probabilistic Teacher},
+  journal =      {IEEE Transactions on Information Theory},
+  year =         1970,
+  volume =       16,
+  pages =        {373--379}
+}
+
+@article{Ahalt90,
+  author    = {S. C. Ahalt and A. K. Krishnamurthy and P. Chen and D.
+               E. Melton},
+  title     = {Competitive Learning Algorithms for Vector
+               Quantization},
+  journal   = nn,
+  volume    = {3},
+  pages     = {277--290},
+  year      = {1990},
+}
+
+@InProceedings{Ahmad93,
+  author =       "S. Ahmad and V. Tresp",
+  editor =       NIPS5ed,
+  booktitle =    NIPS5,
+  title =        "Some Solutions to the Missing Feature Problem in
+                 Vision",
+  publisher =    "Morgan Kaufmann Publishers",
+  address =      "San Mateo, CA",
+  year =         "1993",
+}
+
+@inproceedings{Ahmed2008,
+  author    = {Amr Ahmed and Kai Yu and Wei Xu and Yihong Gong and Eric P. Xing},
+  title     = {Training Hierarchical Feed-forward Visual Recognition Models Using Transfer Learning from Pseudo Tasks},
+  booktitle = {Proceedings of the 10th European Conference on Computer Vision (ECCV'08)},
+  pages     = {69--82},
+  year      = {2008},
+}
+
+@article{AitchisonJ1976,
+  author    = {John Aitchison and Colin Aitken},
+  title     = {Multivariate binary discrimination by the kernel method},
+  journal   = {Biometrika},
+  volume    = {63},
+  number    = {3},
+  pages     = {413--420},
+  year      = {1976},
+}
+
+@article{Aizerman64,
+  author    = {Mark A. Aizerman and Emmanuel M. Braverman and Lev I.
+               Rozonoer},
+  title     = {Theoretical Foundations of the Potential Function
+               Method in Pattern Recognition Learning},
+  journal   = {Automation and Remote Control},
+  volume    = {25},
+  pages     = {821--837},
+  year      = {1964},
+}
+
+@Article{Ajtai83,
+  author =       "Miklos Ajtai",
+  title =        "{$\Sigma_1^1$}-formulae on finite structures",
+  journal =      "Annals of Pure and Applied Logic",
+  volume =       "24",
+  number =       "1",
+  pages =        "1--48",
+  year =         "1983",
+}
+
+@article{Akaike74,
+  author    = {H. Akaike},
+  title     = {A New Look at the Statistical Model Identification},
+  journal   = ieeeac,
+  volume    = {AC-19},
+  number    = {6},
+  pages     = {716--728},
+  year      = {1974},
+}
+
+@article{Al-Mashouq-hints,
+  author    = {K. A. Al-Mashouq and I. S. Reed},
+  title     = {Including Hints in Training Neural Nets},
+  journal   = nc,
+  volume    = {3},
+  number    = {4},
+  pages     = {418--430},
+  year      = {1991},
+}
+
+@book{Aleksander:90,
+  author    = {I. Aleksander and H. Morton},
+  title     = {An Introduction to Neural Computing},
+  publisher = {Chapman and Hall},
+  address   = {London},
+  year      = {1990},
+  keywords  = {},
+}
+
+@inproceedings{Aleksander:93,
+  author    = {I. Aleksander and H. Morton},
+  title     = {A Neural State Machine for Iconic Language
+               Representation},
+  editor    = {J. Mira and J. Cabestany and A. Prieto},
+  booktitle = {New Trends in Neural Computation: Proc. of the
+               International Workshop on Artificial Neural Networks
+               IWANN'93},
+  publisher = {Springer},
+  address   = {Berlin, Heidelberg},
+  pages     = {84--89},
+  year      = {1993},
+  keywords  = {},
+}
+
+@inproceedings{Allender96,
+  author    = {Eric Allender},
+  title     = {Circuit Complexity Before the Dawn of the New
+               Millennium},
+  booktitle = {16th Annual Conference on Foundations of Software
+               Technology and Theoretical Computer Science},
+  publisher = {Lecture Notes in Computer Science 1180, Springer
+               Verlag},
+  pages     = {1--18},
+  year      = {1996},
+}
+
+@inproceedings{Alleva93,
+  author    = {F. Alleva and X. Huang and M. Y. Hwang},
+  title     = {An improved search algorithm using incremental
+               knowledge for continuous speech recognition},
+  booktitle = icassp,
+  address   = {Minneapolis, Minnesota},
+  pages     = {307--310},
+  year      = {1993},
+}
+
+@book{Allgower80,
+  author    = {E. L. Allgower and K. Georg},
+  title     = {Numerical Continuation Methods. {A}n Introduction},
+  series    = {Springer Series in Computational Mathematics},
+  number    = {13},
+  publisher = {Springer-Verlag},
+  year      = {1980},
+}
+
+@book{Allgower80-short,
+  author    = {E. L. Allgower and K. Georg},
+  title     = {Numerical Continuation Methods. {A}n Introduction},
+  publisher = {Springer-Verlag},
+  year      = {1980},
+}
+
+@inproceedings{Almeida87,
+  author    = {L. B. Almeida},
+  title     = {A Learning Rule for Asynchronous Perceptrons with
+               Feedback in a Combinatorial Environment},
+  editor    = {M. Caudill and C. Butler},
+  booktitle = icnn,
+  volume    = {2},
+  publisher = {IEEE, New York},
+  address   = {San Diego 1987},
+  pages     = {609--618},
+  year      = {1987},
+}
+
+@inproceedings{Almeida88,
+  author    = {L. B. Almeida},
+  title     = {Backpropagation in Perceptrons with Feedback},
+  editor    = {R. Eckmiller and Ch. von der Malsburg},
+  booktitle = {Neural Computers},
+  publisher = {Springer-Verlag, Berlin},
+  address   = {Neuss 1987},
+  pages     = {199--208},
+  year      = {1988},
+}
+
+@inproceedings{Almuallim+Dietterich-1991,
+  author    = {Almuallim, H. and Dietterich, T. G.},
+  title     = {Learning with many irrelevant features},
+  booktitle = {Proceedings of the Ninth National Conference on Artificial Intelligence},
+  volume    = {2},
+  pages     = {547--552},
+  publisher = {AAAI Press},
+  address   = {Anaheim, California},
+  year      = {1991},
+  url       = {http://citeseer.ist.psu.edu/almuallim91learning.html},
+}
+
+@article{Almuallim+Dietterich-1994,
+    author = "Hussein Almuallim and Thomas G. Dietterich",
+    title = "Learning Boolean Concepts in the Presence of Many Irrelevant Features",
+    journal = "Artificial Intelligence",
+    volume = "69",
+    number = "1--2",
+    pages = "279--305",
+    year = "1994",
+    url = "http://citeseer.ist.psu.edu/almuallim94learning.html"
+}
+
+
+@inproceedings{Alspector87,
+  author    = {J. Alspector and R. B. Allen},
+  title     = {A Neuromorphic {VLSI} Learning System},
+  editor    = {P. Losleben},
+  booktitle = {Advanced Research in VLSI: Proceedings of the 1987
+               Stanford Conference},
+  publisher = {MIT Press, Cambridge},
+  pages     = {313--349},
+  year      = {1987},
+}
+
+@inproceedings{Alspector88,
+  author    = {J. Alspector and R. B. Allen and V. Hu and S.
+               Satyanarayana},
+  title     = {Stochastic Learning Networks and Their Electronic
+               Implementation},
+  editor    = nips87ed,
+  booktitle = nips87,
+  publisher = {American Institute of Physics, New York},
+  address   = {Denver, CO},
+  pages     = {9--21},
+  year      = {1988},
+}
+
+@article{Amari+Wu-99,
+  author    = {S. Amari and S. Wu},
+  title     = {Improving {Support} {Vector} {Machine} classifiers by
+               modifying kernel functions},
+  journal   = {Neural Networks},
+  volume    = {12},
+  pages     = {783--789},
+  year      = {1999},
+}
+
+@Article{amari00adaptive,
+  author =       "{Shun-ichi} Amari and Hyeyoung Park and Kenji Fukumizu",
+  title =        "Adaptive Method of Realizing Natural Gradient Learning
+                 for Multilayer Perceptrons",
+  journal =      "Neural Computation",
+  volume =       "12",
+  number =       "6",
+  pages =        "1399--1409",
+  year =         "2000",
+  URL =          "http://citeseer.ist.psu.edu/amari98adaptive.html",
+}
+
+@article{Amari77,
+  author    = {S. A. Amari},
+  title     = {Dynamics of Pattern Formation in Lateral-Inhibition
+               Type Neural Fields},
+  journal   = biocyb,
+  volume    = {27},
+  pages     = {77--87},
+  year      = {1977},
+}
+
+@article{Amari80,
+  author    = {S. A. Amari},
+  title     = {Topographic Organization of Nerve Fields},
+  journal   = bmbiol,
+  volume    = {42},
+  pages     = {339--364},
+  year      = {1980},
+}
+
+@Article{amari98natural,
+  author =       "{Shun-ichi} Amari",
+  title =        "Natural Gradient Works Efficiently in Learning",
+  journal =      "Neural Computation",
+  volume =       "10",
+  number =       "2",
+  pages =        "251--276",
+  year =         "1998",
+  URL =          "http://citeseer.ist.psu.edu/article/amari98natural.html",
+}
+
+@article{Amari99,
+  author    = {S. Amari and S. Wu},
+  title     = {Improving Support Vector Machine Classifiers by
+               Modifying Kernel Functions},
+  journal   = {Neural Networks},
+  volume    = {12},
+  number    = {6},
+  pages     = {783--789},
+  year      = {1999},
+}
+
+@article{AmariS1997,
+  author    = {{Shun-ichi} Amari and Noboru Murata and Klaus-Robert M{\"u}ller and Michael Finke and Howard Hua Yang},
+  title     = {Asymptotic statistical theory of overtraining and cross-validation},
+  journal   = {IEEE Transactions on Neural Networks},
+  volume    = {8},
+  number    = {5},
+  pages     = {985--996},
+  year      = {1997},
+  keywords  = {regularization},
+}
+
+@InProceedings{amaya01improvement,
+  author =       "Fredy A. Amaya and Jose-Miguel Bened{\'\i}",
+  booktitle =    "Meeting of the Association for Computational
+                 Linguistics",
+  title =        "Improvement of a Whole Sentence Maximum Entropy
+                 Language Model Using Grammatical Features",
+  pages =        "10--17",
+  year =         "2001",
+  URL =          "http://citeseer.nj.nec.com/505752.html",
+}
+
+@InProceedings{BoufadenLapalmeBengio2001,
+  author =       "N. Boufaden and G. Lapalme and Y. Bengio",
+  booktitle =    "Proceedings of the Natural Language Pacific Rim Symposium, NLPRS-01",
+  title =        "Topic segmentation: First Stage of Dialogue-Based Information Extraction Process",
+  year =         "2001",
+}
+
+@article{Amit85a,
+  author    = {D. Amit and H. Gutfreund and H. Sompolinsky},
+  title     = {Spin-Glass Models of Neural Networks},
+  journal   = prA,
+  volume    = {32},
+  pages     = {1007--1018},
+  year      = {1985},
+}
+
+@article{Amit85b,
+  author    = {D. Amit and H. Gutfreund and H. Sompolinsky},
+  title     = {Storing Infinite Numbers of Patterns in a Spin-Glass
+               Model of Neural Networks},
+  journal   = prl,
+  volume    = {55},
+  pages     = {1530--1533},
+  year      = {1985},
+}
+
+@article{Amit87a,
+  author    = {D. Amit and H. Gutfreund and H. Sompolinsky},
+  title     = {Statistical Mechanics of Neural Networks Near
+               Saturation},
+  journal   = annphys,
+  volume    = {173},
+  pages     = {30--67},
+  year      = {1987},
+}
+
+@article{Amit87b,
+  author    = {D. Amit and H. Gutfreund and H. Sompolinsky},
+  title     = {Information Storage in Neural Networks with Low Levels
+               of Activity},
+  journal   = prA,
+  volume    = {35},
+  pages     = {2293--2303},
+  year      = {1987},
+}
+
+@article{Amit88,
+  author    = {D. Amit},
+  title     = {Neural Networks for Counting Chimes},
+  journal   = PNAS,
+  volume    = {85},
+  pages     = {2141--2145},
+  year      = {1988},
+}
+
+@book{Amit89,
+  author    = {D. Amit},
+  title     = {Modelling Brain Function},
+  publisher = {Cambridge University Press},
+  address   = {Cambridge},
+  year      = {1989},
+}
+
+@article{Ammar+Miao-2000,
+  author    = {Hany H. Ammar and Zhouhui Miao},
+  title     = {Parallel Algorithms for the Training Process of a
+               Neural Network-Based System},
+  journal   = {International Journal of High Performance Computing
+               Applications},
+  volume    = {14},
+  number    = {1},
+  pages     = {3--25},
+  year      = {2000},
+  doi       = {10.1177/109434200001400101},
+  URL       = {http://hpc.sagepub.com/cgi/content/abstract/14/1/3},
+  eprint    = {http://hpc.sagepub.com/cgi/reprint/14/1/3.pdf},
+}
+
+@Book{Anderson,
+  author =       "T. Anderson",
+  title =        "An Introduction to Multivariate Statistical
+                 Analysis",
+  publisher =    "John Wiley and Sons",
+  address =      "New York",
+  year =         "1984",
+}
+
+@article{Anderson68,
+  author    = {J. A. Anderson},
+  title     = {A Memory Model Using Spatial Correlation Functions},
+  journal   = kyb,
+  volume    = {5},
+  pages     = {113--119},
+  year      = {1968},
+}
+
+@article{Anderson70,
+  author    = {J. A. Anderson},
+  title     = {Two Models for Memory Organization},
+  journal   = mbio,
+  volume    = {8},
+  pages     = {137--160},
+  year      = {1970},
+}
+
+@book{Hinton+Anderson-81,
+ author = {G. E. Hinton and J. A. Anderson},
+ title = {Parallel models of associative memory},
+ publisher = {Lawrence Erlbaum Assoc.},
+ address = {Hillsdale, NJ},
+ year = 1981,
+}
+
+@incollection{Anderson81,
+  author    = {J. A. Anderson and M. C. Mozer},
+  title     = {Categorization and Selective Neurons},
+  editor    = {G. E. Hinton and J. A. Anderson},
+  booktitle = {Parallel Models of Associative Memory},
+  publisher = {Lawrence Erlbaum},
+  address   = {Hillsdale},
+  pages     = {213--236},
+  year      = {1981},
+}
+
+@article{Anderson86,
+  author    = {D. Z. Anderson},
+  title     = {Coherent Optical Eigenstate Memory},
+  journal   = optlett,
+  volume    = {11},
+  pages     = {56--58},
+  year      = {1986},
+}
+
+@article{Anderson87,
+  author    = {C. H. Anderson and D. C. Van Essen},
+  title     = {Shifter Circuits: {A} Computational Strategy for
+               Dynamic Aspects of Visual Processing},
+  journal   = PNAS,
+  volume    = {84},
+  pages     = {6297--6301},
+  year      = {1987},
+}
+
+@book{Anderson88,
+  editor    = {J. A. Anderson and E. Rosenfeld},
+  title     = {Neurocomputing: Foundations of Research},
+  publisher = {MIT Press},
+  address   = {Cambridge},
+  year      = {1988},
+}
+
+@InProceedings{Anderson89,
+  author =       "S. Anderson and J. W. L. Merrill and R. Port",
+  editor =       "D. Touretzky and G. Hinton and T. Sejnowski",
+  booktitle =    cmss88,
+  title =        "Dynamic Speech Categorization with Recurrent
+                 Networks",
+  publisher =    "Morgan Kaufmann, San Mateo",
+  address =      "Pittsburgh 1988",
+  pages =        "398--406",
+  year =         "1989",
+}
+
+@article{Ando+Zhange-JMLR-2005,
+  author    = {Rie Kubota Ando and Tong Zhang},
+  title     = {A Framework for Learning Predictive Structures from
+               Multiple Tasks and Unlabeled Data},
+  journal   = jmlr,
+  volume    = {6},
+  pages     = {1817--1853},
+  year      = {2005},
+}
+
+@article{Andrieu03,
+  author    = {Christophe Andrieu and Nando de Freitas and Arnaud
+               Doucet and Michael I. Jordan},
+  title     = {An Introduction to {MCMC} for Machine Learning},
+  journal   = {Machine Learning},
+  volume    = {50},
+  number    = {1-2},
+  pages     = {5--43},
+  year      = {2003},
+}
+
+@article{Andrieu2003,
+  author    = {C. Andrieu and N. de Freitas and A. Doucet and M.
+               Jordan},
+  title     = {An introduction to {MCMC} for machine learning},
+  journal   = {Machine Learning},
+  volume    = {50},
+  pages     = {5--43},
+  year      = {2003},
+}
+
+@Article{Angeniol88,
+  author =       "B. Ang{\'e}niol and G. de La Croix Vaubois and J.-Y. Le
+                 Texier",
+  title =        "Self-Organizing Feature Maps and the Travelling
+                 Salesman Problem",
+  journal =      nn,
+  volume =       "1",
+  pages =        "289--293",
+  year =         "1988",
+}
+
+@article{Angluin83,
+  author    = {D. Angluin and C. Smith},
+  title     = {Inductive Inference: Theory and Methods},
+  journal   = {Computing Surveys},
+  volume    = {15},
+  number    = {3},
+  pages     = {237--269},
+  year      = {1983},
+}
+
+@book{Arbib87,
+  author    = {M. A. Arbib},
+  title     = {Brains, Machines, and Mathematics},
+  publisher = {Springer-Verlag},
+  address   = {Berlin},
+  year      = {1987},
+}
+
+@book{ARP94,
+  author    = {{Advanced Research Projects Agency}},
+  title     = {Proceedings of the 1994 {ARPA} Human Language
+               Technology Workshop (Princeton, New Jersey, March
+               1994)},
+  publisher = {Morgan Kaufmann},
+  year      = {1994},
+}
+
+@misc{Asuncion+Newman:2007,
+  author      = {A. Asuncion and D. J. Newman},
+  title       = {{UCI} Machine Learning Repository},
+  institution = {University of California, Irvine, School of
+                 Information and Computer Sciences},
+  year        = {2007},
+  URL         = {http://www.ics.uci.edu/$\sim$mlearn/MLRepository.html},
+}
+
+@article{ashetal04,
+author = "Ash, J. and Berg, M. and Coiera, E.",
+title = "Some unintended consequences of
+information technology in health care: the nature of patient care
+information system-related errors",
+journal = "J Am Med Inform Assoc",
+volume = "11",
+number = 2,
+pages = "104--112",
+year = 2004,
+}
+
+@article{ashetal07,
+author = "Ash, J. and Sittig, D. and Dykstra, R. and Guappone, K. and
+Carpenter, J. and Seshadri, V.",
+title = "Categorizing the unintended sociotechnical consequences of
+computerized provider order entry",
+journal = "Int J Med Inform",
+volume = 76,
+number = "Suppl1",
+pages = "21--27",
+year = 2007,
+}
+
+@inproceedings{Atal83,
+  author    = {B. S. Atal},
+  title     = {Efficient coding of {LPC} parameters by temporal
+               decomposition},
+  booktitle = icassp,
+  address   = {Boston, MA},
+  pages     = {81--84},
+  year      = {1983},
+}
+
+@phdthesis{Athaide95,
+  author    = {C. R. Athaide},
+  title     = {Likelihood estimation and state estimation for
+               nonlinear state space models},
+  school    = {Graduate Group in Managerial Science and Applied
+               Economics, University of Pennsylvania},
+  address   = {Philadelphia, PA},
+  year      = {1995},
+}
+
+@Book{Atherton-75,
+  author =       "D. P. Atherton",
+  title =        "Nonlinear Control Engineering",
+  publisher =    "Van Nostrand Reinhold",
+  address =      "Wokingham (England)",
+  year =         "1975",
+}
+
+@article{atkeson96locally,
+  author    = {C. G. Atkeson and A. W. Moore and S. Schaal},
+  title     = {Locally Weighted Learning for Control},
+  journal   = {Artificial Intelligence Review},
+  volume    = {11},
+  pages     = {75--113},
+  year      = {1997},
+}
+
+@InProceedings{Aubert94,
+  author =       "X. Aubert and C. Dugast and H. Ney and V. Steinbiss",
+  booktitle =    icassp,
+  title =        "Large vocabulary continuous speech recognition of
+                 {Wall} {Street} {Journal} data",
+  address =      "Adelaide, Australia",
+  pages =        "129--132",
+  year =         "1994",
+}
+
+@inproceedings{Auer-96,
+  author    = {Peter Auer and Mark Herbster and Manfred K. Warmuth},
+  title     = {Exponentially Many Local Minima for Single Neurons},
+  editor    = NIPS8ed,
+  booktitle = NIPS8,
+  publisher = {MIT Press, Cambridge, MA},
+  pages     = {315--322},
+  year      = {1996},
+}
+
+@inproceedings{auer97,
+  author    = {Peter Auer},
+  title     = {On learning from multi-instance examples: Empirical
+               evaluation of a theoretical approach},
+  booktitle = {Proc. 14th International Conference on Machine
+               Learning},
+  publisher = {Morgan Kaufmann},
+  pages     = {21--29},
+  year      = {1997},
+}
+
+@inproceedings{b-cdmvqfa-97,
+  author    = {Jonathan Baxter},
+  title     = {The canonical distortion measure for vector
+               quantization and function approximation},
+  booktitle = {Proc. 14th International Conference on Machine
+               Learning},
+  publisher = {Morgan Kaufmann},
+  pages     = {39--47},
+  year      = {1997},
+}
+
+@incollection{Bach-2007,
+  author    = {Francis Bach},
+  title     = {Active learning for misspecified generalized linear
+               models},
+  editor    = NIPS19ed,
+  booktitle = NIPS19,
+  publisher = {MIT Press},
+  address   = {Cambridge, MA},
+  pages     = {},
+  year      = {2007},
+}
+
+@article{Bachmann87,
+  author    = {C. M. Bachmann and L. N. Cooper and A. Dembo and O.
+               Zeitouni},
+  title     = {A Relaxation Model for Memory with High Storage
+               Density},
+  journal   = PNAS,
+  volume    = {84},
+  pages     = {7529--7531},
+  year      = {1987},
+}
+
+@mastersthesis{Bachrach88,
+  author    = {J. Bachrach},
+  title     = {Learning to Represent State},
+  school    = {University of Massachusetts},
+  address   = {Amherst},
+  year      = {1988},
+}
+
+@article{Back-nc91,
+  author    = {A. D. Back and A. C. Tsoi},
+  title     = {{FIR} and {IIR} Synapses: {A} New Neural Network
+               Architecture for Time Series Modeling},
+  journal   = nc,
+  volume    = {3},
+  number    = {3},
+  pages     = {375--385},
+  year      = {1991},
+}
+
+@InCollection{Bahadur61,
+  author =       "R. R. Bahadur",
+  editor =       "H. Solomon",
+  booktitle =    "Studies in Item Analysis and Prediction",
+  title =        "A representation of the joint distribution of
+                 responses to n dichotomous items",
+  publisher =    "Stanford University Press, California",
+  pages =        "158--168",
+  year =         "1961",
+}
+
+@inproceedings{bahl77,
+  author    = {L. R. Bahl and J. K. Baker and R. L. Mercer},
+  title     = {Perplexity: a measure of difficulty of speech
+               recognition tasks},
+  booktitle = {94th Meeting of the Acoustical Society of America},
+  address   = {Miami},
+  month     = dec,
+  year      = {1977},
+}
+
+@article{Bahl83,
+  author    = {L. R. Bahl and F. Jelinek and R. L. Mercer},
+  title     = {A Maximum Likelihood Approach to Continuous Speech
+               Recognition},
+  journal   = ieeetpami,
+  volume    = {5},
+  number    = {2},
+  pages     = {179--190},
+  month     = mar,
+  year      = {1983},
+}
+
+@InProceedings{Bahl86,
+  author =       "Lalit Bahl and Peter Brown and Peter {deSouza} and Robert Mercer",
+  booktitle =    icassp,
+  title =        "Maximum mutual information estimation of hidden {Markov}
+                 parameters for speech recognition",
+  address =      "Tokyo, Japan",
+  pages =        "49--52",
+  year =         "1986",
+}
+
+@article{Bahl87,
+  author    = {L. R. Bahl and P. Brown and P. V. {de Souza} and R. L.
+               Mercer},
+  title     = {Speech recognition with continuous-parameter hidden
+               {Markov} models},
+  journal   = {Computer, Speech and Language},
+  volume    = {2},
+  pages     = {219--234},
+  year      = {1987},
+}
+
+@inproceedings{Bahl88,
+  author    = {L. R. Bahl and P. Brown and P. V. de Souza and R. L.
+               Mercer},
+  title     = {Speech recognition with continuous-parameter hidden
+               {Markov} models},
+  booktitle = icassp,
+  address   = {New York, NY},
+  pages     = {40--43},
+  year      = {1988},
+}
+
+@article{Bailey-Simon-60,
+  author    = {Robert A. Bailey and Leroy Simon},
+  title     = {Two Studies in Automobile Insurance Ratemaking},
+  journal   = {ASTIN Bulletin},
+  volume    = {1},
+  number    = {4},
+  pages     = {192--217},
+  year      = {1960},
+}
+
+@incollection{Baker75,
+  author    = {J. K. Baker},
+  title     = {Stochastic modeling for automatic speech
+               understanding},
+  editor    = {D. R. Reddy},
+  booktitle = {Speech Recognition},
+  publisher = {Academic Press},
+  address   = {New York},
+  pages     = {521--542},
+  year      = {1975},
+}
+
+@book{Baker77,
+  author    = {C. T. H. Baker},
+  title     = {The numerical treatment of integral equations},
+  publisher = {Clarendon Press},
+  address   = {Oxford},
+  year      = {1977},
+}
+
+@inproceedings{Baker98,
+  author    = {D. Baker and A. {McCallum}},
+  title     = {Distributional Clustering of Words for Text
+               Classification},
+  booktitle = {SIGIR'98},
+  year      = {1998},
+}
+
+@inproceedings{baker98berkeley,
+  author    = {Collin F. Baker and Charles J. Fillmore and John B.
+               Lowe},
+  title     = {The {Berkeley} {FrameNet} Project},
+  editor    = {Christian Boitet and Pete Whitelock},
+  booktitle = {Proceedings of the Thirty-Sixth Annual Meeting of the
+               {Association} for {Computational} {Linguistics} and
+               Seventeenth International Conference on Computational
+               Linguistics},
+  publisher = {Morgan Kaufmann Publishers},
+  address   = {San Francisco, California},
+  pages     = {86--90},
+  year      = {1998},
+}
+
+@InProceedings{Bakis76,
+  author =       "R. Bakis",
+  booktitle =    "91st Meeting of the Acoustical Society of America",
+  title =        "Continuous Speech Recognition via Centisecond Acoustic
+                 States",
+  month =        apr,
+  year =         "1976",
+}
+
+@article{bakker03,
+  author    = {Bart Bakker and Tom Heskes},
+  title     = {Task clustering and gating for {B}ayesian multitask
+               learning},
+  journal   = jmlr,
+  volume    = {4},
+  pages     = {83--99},
+  year      = {2003},
+  publisher = {MIT Press},
+  address   = {Cambridge, MA, USA},
+  ISSN      = {1533-7928},
+}
+
+@book{Baldi-Brunak-98,
+  author    = {Pierre Baldi and Soren Brunak},
+  title     = {Bioinformatics, the Machine Learning Approach},
+  publisher = {MIT Press},
+  year      = {1998},
+}
+
+@article{Baldi89,
+  author    = {Pierre Baldi and Kurt Hornik},
+  title     = {Neural Networks and Principal Component Analysis:
+               Learning from Examples Without Local Minima},
+  journal   = nn,
+  volume    = {2},
+  pages     = {53--58},
+  year      = {1989},
+}
+
+@Article{Baldi94,
+  author =       "P. Baldi and Y. Chauvin and T. Hunkapiller and M.
+                 {McClure}",
+  title =        "Hidden {Markov} models of biological primary sequence
+                 information",
+  journal =      "Proc. Nat. Acad. Sci. (USA)",
+  volume =       "91",
+  number =       "3",
+  pages =        "1059--1063",
+  year =         "1994",
+}
+
+@article{Ballard81,
+  author    = {D. H. Ballard},
+  title     = {Generalizing the Hough Transform to Detect Arbitrary
+               Shapes},
+  journal   = {Pattern Recognition},
+  volume    = {13},
+  number    = {2},
+  pages     = {111--122},
+  year      = {1981},
+}
+
+@inproceedings{Baluja97,
+  author    = {S. Baluja},
+  title     = {Genetic Algorithms and Explicit Search Statistics},
+  editor    = NIPS9ed,
+  booktitle = NIPS9,
+  publisher = {MIT Press},
+  address   = {Cambridge, MA},
+  pages     = {},
+  year      = {1997},
+}
+
+@article{Bar-Shalom78,
+  author    = {Y. Bar-Shalom},
+  title     = {Tracking methods in a multi-target environment},
+  journal   = {IEEE Trans. on Aut. Control},
+  volume    = {23},
+  pages     = {618--626},
+  year      = {1978},
+}
+
+@book{Bar-Shalom93,
+  author    = {Y. Bar-Shalom and {X.-R.} Li},
+  title     = {Estimation and Tracking},
+  publisher = {Artech House},
+  address   = {Boston, MA},
+  year      = {1993},
+}
+
+@inproceedings{Barber+Williams-nips9,
+  author    = {D. Barber and C. K. I. Williams},
+  title     = {Gaussian Processes for {Bayesian} Classification via
+               Hybrid Monte Carlo},
+  editor    = NIPS9ed,
+  booktitle = NIPS9,
+  publisher = {MIT Press, Cambridge, MA},
+  pages     = {340--346},
+  year      = {1997},
+}
+
+@inproceedings{Bareiss87,
+  author    = {E. R. Bareiss and B. Porter},
+  title     = {Protos: An Exemplar-Based Learning Apprentice},
+  booktitle = {Proceedings of the 4th International Workshop on
+               Machine Learning},
+  publisher = {Morgan Kaufmann},
+  address   = {Irvine, CA},
+  pages     = {12--23},
+  year      = {1987},
+}
+
+@article{Barhen89,
+  author    = {J. Barhen and S. Gulati and M. Zak},
+  title     = {Neural Learning of Constrained Nonlinear
+               Transformations},
+  journal   = computer,
+  pages     = {67--76},
+  month     = jun,
+  year      = {1989},
+}
+
+@article{Nykamp+Ringach-2002,
+ author = {D. Q. Nykamp and D. L. Ringach},
+ title = {Full identification of a linear-nonlinear system via cross-correlation analysis},
+ journal = {Journal of Vision},
+ volume = 2,
+ number = 1,
+ pages = {1--11},
+ year = 2002,
+}
+
+@article{Wilson+Cowan-72,
+ author = {Hugh R. Wilson and Jack D. Cowan},
+ title = {Excitatory and inhibitory interactions in localized populations of model neurons},
+ journal = {Biophysical Journal},
+ volume = 12,
+ pages = {1--24},
+ year = 1972,
+}
+
+@article{Barlow89,
+  author    = {H. B. Barlow},
+  title     = {Unsupervised Learning},
+  journal   = nc,
+  volume    = {1},
+  pages     = {295--311},
+  year      = {1989},
+}
+
+@article{Barlow-2001,
+    address = {Cambridge, UK.},
+    author = {H. Barlow},
+    issn = {0954-898X},
+    journal = {Network: Computation in Neural Systems},
+    month = aug,
+    number = {3},
+    pages = {241--253},
+    title = {Redundancy reduction revisited},
+    url = {http://view.ncbi.nlm.nih.gov/pubmed/11563528},
+    volume = {12},
+    year = {2001},
+}
+
+@inproceedings{Barron+Barron88,
+  author    = {A. R. Barron and R. L. Barron},
+  title     = {Statistical learning networks: {A} unifying view},
+  editor    = {E. Wegman},
+  booktitle = {Computing Science and Statistics, Proc. 20th Symp.
+               Interface},
+  publisher = {Amer. Statist. Assoc.},
+  address   = {Washington, DC},
+  pages     = {192--203},
+  year      = {1988},
+}
+
+@inproceedings{Barron89,
+  author    = {A. R. Barron},
+  title     = {Statistical properties of artificial neural networks},
+  booktitle = {Proc. of the 28th conf. on Decision and Control},
+  address   = {Tampa, Florida},
+  pages     = {280--285},
+  year      = {1989},
+}
+
+@incollection{Barron91,
+  author =       "Andrew R.~Barron",
+  title =        "Complexity Regularization with Application to Artificial Neural Networks",
+  booktitle =      "Nonparametric Functional Estimation and Related Topics",
+  pages =        "561--576",
+  editor = "G.~Roussas",
+  year =         "1991",
+  publisher = "Kluwer Academic Publishers"
+}
+
+
+@Article{Bartal95,
+  author =       "Yair Bartal and Jie Lin and Robert E. Uhrig",
+  title =        "Nuclear Power Plant Transient Diagnostics Using
+                 Artificial Neural Networks that Allow ``don't know''
+                 Classifications",
+  journal =      "Nuclear Technology",
+  volume =       "110",
+  pages =        "436--449",
+  month =        jun,
+  year =         "1995",
+}
+
+@article{Bartlett+Uhrig92,
+  author  = {E. B. Bartlett and R. E. Uhrig},
+  title   = {Nuclear Power Plant Status Diagnostics Using an
+             Artificial Neural Network},
+  journal = {Nuclear Technology},
+  volume  = {97},
+  month   = mar,
+  year    = {1992},
+}
+
+@Article{Bartlett46,
+  author =       "M. S. Bartlett",
+  title =        "On the theoretical specification and sampling
+                 properties of autocorrelated time series",
+  journal =      "J. Royal Stat. Soc. B",
+  volume =       "8",
+  pages =        "27--41",
+  year =         "1946",
+}
+
+@article{Bartlett92,
+  author  = {P. L. Bartlett and T. Downs},
+  title   = {Using Random Weights to train Multilayer Networks of
+             Hard-Limiting Units},
+  journal = ieeetrnn,
+  volume  = {3},
+  number  = {2},
+  pages   = {202--210},
+  year    = {1992},
+}
+
+@techreport{Barto-tr91,
+  author      = {A. G. Barto and S. Bradtke and S. P. Singh},
+  title       = {Real-Time Learning and {Control} Using Asynchronous
+                 Dynamic Programming},
+  institution = {Univ. of Massachusetts (Computer Science)},
+  number      = {91-57},
+  address     = {Amherst MA},
+  year        = {1991},
+}
+
+@article{Barto81,
+  author  = {A. G. Barto and R. S. Sutton and P. S. Brouwer},
+  title   = {Associative Search Network: Reinforcement Learning
+             Associative Memory},
+  journal = {Biological Cybernetics},
+  volume  = {40},
+  year    = {1981},
+}
+
+@article{Barto83,
+  author  = {A. G. Barto and R. S. Sutton and C. W. Anderson},
+  title   = {Neuronlike Adaptive Elements That Can Solve Difficult
+             Learning Control Problems},
+  journal = ieeesmc,
+  volume  = {13},
+  year    = {1983},
+}
+
+@article{Barto85,
+  author  = {A. G. Barto and P. Anandan},
+  title   = {Pattern Recognizing Stochastic Learning Automata},
+  journal = ieeesmc,
+  volume  = {15},
+  pages   = {360--375},
+  year    = {1985},
+}
+
+@inproceedings{Barto87,
+  author    = {A. G. Barto and M. I. Jordan},
+  editor    = {M. Caudill and C. Butler},
+  title     = {Gradient Following Without Back-Propagation in Layered
+               Networks},
+  booktitle = icnn,
+  volume    = {2},
+  pages     = {629--636},
+  year      = {1987},
+  publisher = {IEEE, New York},
+  address   = {San Diego 1987},
+}
+
+@incollection{Barto91,
+  author    = {A. G. Barto and R. S. Sutton and C. J. C. H. Watkins},
+  editor    = {M. Gabriel and J. W. Moore},
+  title     = {Learning and Sequential Decision Making},
+  booktitle = {Learning and Computational Neuroscience},
+  year      = {1991},
+  publisher = {MIT Press},
+  address   = {Cambridge},
+}
+
+@InCollection{Barto92,
+  author =       "A. G. Barto",
+  editor =       "W. T. Miller and R. S. Sutton and P. J. Werbos",
+  booktitle =    "Neural Networks for Control",
+  title =        "Connectionist learning for control: an overview",
+  publisher =    "MIT Press",
+  year =         "1992",
+}
+
+@TechReport{Barto_tr91,
+  author =       "A. G. Barto and S. Bradtke and S. P. Singh",
+  title =        "Real-Time Learning and {Control} Using Asynchronous
+                 Dynamic Programming",
+  number =       "91-57",
+  institution =  "Univ. of Massachusetts (Computer Science)",
+  address =      "Amherst MA",
+  year =         "1991",
+}
+
+@article{bassiouni95,
+  author  = {M. A. Bassiouni and A. Mukherjee},
+  title   = {Efficient Decoding of Compressed Data},
+  journal = {Journal of the American Society for Information
+             Science},
+  volume  = {46},
+  number  = {1},
+  pages   = {1--8},
+  year    = {1995},
+}
+
+@article{Basu94,
+  author  = {A. Basu and E. B. Bartlett},
+  title   = {Detecting Faults in a Nuclear Power Plant by Using
+             Dynamic Node Architecture Artificial Neural Networks},
+  journal = {Nuclear Science and Engineering},
+  volume  = {116},
+  month   = apr,
+  year    = {1994},
+}
+
+@article{battiti-89,
+  author  = {R. Battiti},
+  title   = {Accelerated Backpropagation Learning: Two Optimization
+             Methods},
+  journal = {Complex Systems},
+  volume  = {3},
+  pages   = {331--342},
+  year    = {1989},
+}
+
+@InProceedings{battiti-masulli-90,
+  author =       "R. Battiti and F. Masulli",
+  booktitle =    "Proceedings of International Neural Network Conference
+                 (INNC 90, Paris)",
+  title =        "{BFGS} optimization for faster and automated
+                 supervised learning",
+  pages =        "757--760",
+  year =         "1990",
+}
+
+@Article{Battiti92,
+  author =       "R. Battiti",
+  title =        "First- and Second-Order Methods for Learning: Between
+                 Steepest Descent and {Newton}'s Method",
+  journal =      "Neural Computation",
+  volume =       "4",
+  type =         "Review",
+  number =       "2",
+  pages =        "141--166",
+  year =         "1992",
+}
+
+@Article{battiti:1994:ieeetnn,
+  author =       "R. Battiti",
+  title =        "Using Mutual Information for Selecting Features in
+                 Supervised Neural Net Learning",
+  journal =      "{IEEE} Transactions on Neural Networks",
+  volume =       "5",
+  number =       "4",
+  pages =        "537--550",
+  year =         "1994",
+}
+
+@article{Baudat+Anouar-2000,
+    author = {G. Baudat and F. Anouar},
+    title = {Generalized Discriminant Analysis Using a Kernel Approach},
+    journal = {Neural Computation},
+    volume = {12},
+    number = {10},
+    year = {2000},
+    issn = {0899-7667},
+    pages = {2385--2404},
+    doi = {10.1162/089976600300014980},
+    publisher = {MIT Press},
+    address = {Cambridge, MA, USA},
+}
+
+@Article{Baum66,
+  author =       "L. E. Baum and T. Petrie",
+  title =        "Statistical Inference for Probabilistic Functions of
+                 Finite State {Markov} Chains",
+  journal =      "Ann. Math. Stat.",
+  volume =       "37",
+  pages =        "1554--1563",
+  year =         "1966",
+}
+
+@article{Baum67,
+  author  = {L. E. Baum and J. Eagon},
+  title   = {An inequality with applications to statistical
+             prediction for functions of {Markov} processes and to a
+             model of ecology},
+  journal = {Bull. Amer. Math. Soc.},
+  volume  = {73},
+  pages   = {360--363},
+  year    = {1967},
+}
+
+@Article{Baum70,
+  author =       "L. E. Baum and T. Petrie and G. Soules and N. Weiss",
+  title =        "A maximization technique occurring in the statistical
+                 analysis of probabilistic functions of {Markov}
+                 chains",
+  journal =      "Ann. Math. Stat.",
+  volume =       "41",
+  pages =        "164--171",
+  year =         "1970",
+}
+
+@article{Baum72,
+  author  = {L. E. Baum},
+  title   = {An inequality and associated maximization technique in
+             statistical estimation for probabilistic functions of a
+             {Markov} process},
+  journal = {Inequalities},
+  volume  = {3},
+  pages   = {1--8},
+  year    = {1972},
+}
+
+@inproceedings{Baum86,
+  author    = {E. B. Baum},
+  editor    = {J. S. Denker},
+  title     = {Towards Practical ``Neural'' Computation for
+               Combinatorial Optimization Problems},
+  booktitle = snowbird,
+  pages     = {53--58},
+  year      = {1986},
+  publisher = {American Institute of Physics, New York},
+  address   = {Snowbird 1986},
+}
+
+@inproceedings{Baum88,
+  author    = {E. B. Baum and F. Wilczek},
+  editor    = nips87ed,
+  title     = {Supervised Learning of Probability Distributions by
+               Neural Networks},
+  booktitle = nips87,
+  pages     = {52--61},
+  year      = {1988},
+  publisher = {American Institute of Physics, New York},
+  address   = {Denver, CO},
+}
+
+@article{Baum89,
+  author  = {E. B. Baum and D. Haussler},
+  title   = {What Size Net Gives Valid Generalization?},
+  journal = nc,
+  volume  = {1},
+  pages   = {151--160},
+  year    = {1989},
+}
+
+@article{BaumNote,
+  author  = {E. B. Baum},
+  title   = {Review of {J}. {S}. {Judd}'s book {\em {Neural}
+             {Network} {Design} and the {Complexity} of
+             {Learning}}},
+  journal = ieeetrnn,
+  volume  = {2},
+  number  = {1},
+  pages   = {181--182},
+  year    = {1991},
+}
+
+@Article{baxter00,
+  author =       "Jonathan Baxter",
+  title =        "A Model of Inductive Bias Learning",
+  journal =      "J. Artif. Intell. Res. (JAIR)",
+  volume =       "12",
+  pages =        "149--198",
+  year =         "2000",
+}
+
+@inproceedings{baxter95a,
+  author    = {Jonathan Baxter},
+  title     = {Learning Internal Representations},
+  booktitle = colt95,
+  pages     = {311--320},
+  year      = {1995},
+  publisher = {ACM Press},
+  address   = {Santa Cruz, California},
+  url       = {http://citeseer.ist.psu.edu/baxter95learning.html},
+}
+
+@unpublished{baxter95b,
+  author = {Jonathan Baxter},
+  title  = {The Canonical Metric for Vector Quantization},
+  note   = {submitted to Information and Computation},
+  year   = {1995},
+}
+
+@inproceedings{baxter96,
+  author    = {Jonathan Baxter},
+  editor    = NIPS8ed,
+  title     = {Learning Model Bias},
+  booktitle = NIPS8,
+  volume    = {8},
+  pages     = {169--175},
+  year      = {1996},
+  publisher = {MIT Press},
+  address   = {Cambridge, MA},
+}
+
+@Article{baxter97,
+  author =       "Jonathan Baxter",
+  title =        "A {Bayesian}/information theoretic model of learning to
+                 learn via multiple task sampling",
+  journal =      "Machine Learning",
+  volume =       "28",
+  pages =        "7--40",
+  year =         "1997",
+}
+
+@article{baxter97a,
+  author  = {Jonathan Baxter},
+  title   = {A {Bayesian}/Information theoretic model of learning to
+             learn via multiple task sampling},
+  journal = {Machine Learning},
+  volume  = {28},
+  pages   = {7--40},
+  year    = {1997},
+}
+
+@InProceedings{Becker89,
+  author =       "S. Becker and Y. {LeCun}",
+  editor =       "D. Touretzky and G. Hinton and T. Sejnowski",
+  booktitle =    cmss88,
+  title =        "Improving the Convergence of Back-Propagation Learning
+                 with Second Order Methods",
+  publisher =    "Morgan Kaufmann, San Mateo",
+  address =      "Pittsburgh 1988",
+  pages =        "29--37",
+  year =         "1989",
+}
+
+@InProceedings{Belkin+al-2004,
+  author =       "Mikhail Belkin and Irina Matveeva and Partha Niyogi",
+  editor =       "John Shawe-Taylor and Yoram Singer",
+  booktitle =    colt04,
+  title =        "Regularization and Semi-supervised Learning on Large
+                 Graphs",
+  publisher =    "Springer",
+  pages =        "624--638",
+  year =         "2004",
+}
+
+@inproceedings{Belkin+Niyogi-2002,
+  author    = {Mikhail Belkin and Partha Niyogi},
+  editor    = NIPS14ed,
+  title     = {Laplacian Eigenmaps and Spectral Techniques for
+               Embedding and Clustering},
+  booktitle = NIPS14,
+  year      = {2002},
+  publisher = {MIT Press},
+  address   = {Cambridge, MA},
+  original  = {orig/AA42.ps},
+}
+
+@techreport{Belkin+Niyogi-2002-01,
+  author      = {Mikhail Belkin and Partha Niyogi},
+  title       = {Laplacian Eigenmaps for Dimensionality Reduction and
+                 Data Representation},
+  institution = {University of Chicago, Computer Science},
+  number      = {TR-2002-01},
+  year        = {2002},
+}
+
+@TechReport{Belkin+Niyogi-2002-ss,
+  author =       "Mikhail Belkin and Partha Niyogi",
+  title =        "Semi-supervised learning on manifolds",
+  number =       "TR-2002-12",
+  institution =  "University of Chicago, Computer Science",
+  year =         "2002",
+}
+
+@article{Belkin+Niyogi-2003,
+  author  = {Mikhail Belkin and Partha Niyogi},
+  title   = {Laplacian Eigenmaps for Dimensionality Reduction and
+             Data Representation},
+  journal = {Neural Computation},
+  volume  = {15},
+  number  = {6},
+  pages   = {1373--1396},
+  year    = {2003},
+}
+
+@inproceedings{Belkin+Niyogi-nips2003,
+  author    = {Mikhail Belkin and Partha Niyogi},
+  editor    = NIPS15ed,
+  title     = {Using Manifold Structure for Partially Labeled
+               Classification},
+  booktitle = NIPS15,
+  year      = {2003},
+  publisher = {{MIT} Press},
+  address   = {Cambridge, MA},
+}
+
+@article{BelkinM2006,
+  author    = {Belkin, Mikhail   and Niyogi, Partha   and Sindhwani, Vikas  },
+  title     = {Manifold Regularization: A Geometric Framework for Learning from Labeled and Unlabeled Examples},
+  journal   = jmlr,
+  volume    = {7},
+  pages     = {2399--2434},
+  year      = {2006},
+  publisher = {MIT Press},
+  address   = {Cambridge, MA, USA},
+  issn      = {1533-7928}
+}
+
+@article{Bell-Sejnowski95,
+  author  = {Anthony J. Bell and Terrence J. Sejnowski},
+  title   = {An information maximisation approach to blind
+             separation and blind deconvolution},
+  journal = {Neural Computation},
+  volume  = {7},
+  number  = {6},
+  pages   = {1129--1159},
+  year    = {1995},
+}
+
+@inproceedings{Bellagarda+Nahamoo89,
+  author    = {J. R. Bellegarda and D. Nahamoo},
+  title     = {Tied Mixture Continuous Parameter Models for Large
+               Vocabulary Isolated Speech Recognition},
+  booktitle = icassp,
+  pages     = {13--16},
+  year      = {1989},
+  address   = {Glasgow, Scotland},
+}
+
+@InProceedings{Bellegarda97,
+  author =       "J. R. Bellegarda",
+  booktitle =    "Proceedings of Eurospeech 97",
+  title =        "A latent semantic analysis framework for large-span
+                 language modeling",
+  address =      "Rhodes, Greece",
+  pages =        "1451--1454",
+  year =         "1997",
+}
+
+@book{Bellman57,
+  author    = {R. E. Bellman},
+  title     = {Dynamic Programming},
+  year      = {1957},
+  publisher = {Princeton University Press},
+  address   = {NJ},
+}
+
+@book{Bellman61,
+  author    = {R. Bellman},
+  title     = {Adaptive Control Processes: {A} Guided Tour},
+  year      = {1961},
+  publisher = {Princeton University Press},
+  address   = {New Jersey},
+}
+
+@book{Bellman74,
+  author    = {R. Bellman},
+  title     = {Introduction to Matrix Analysis},
+  edition   = {2nd},
+  year      = {1974},
+  publisher = {McGraw-Hill},
+  address   = {New York, NY},
+}
+
+@InProceedings{ben-david03,
+  author =       "Shai Ben-David and Reba Schuller",
+  booktitle =    colt03,
+  title =        "Exploiting Task Relatedness for Multiple Task
+                 Learning",
+  crossref =     "colt03",
+  pages =        "567--580",
+  year =         "2003",
+}
+
+@inproceedings{BenDucVin01,
+  author    = {Yoshua Bengio and R\'ejean Ducharme and Pascal
+               Vincent},
+  editor    = NIPS13ed,
+  title     = {A Neural Probabilistic Language Model},
+  booktitle = NIPS13,
+  pages     = {932--938},
+  year      = {2001},
+  publisher = {MIT Press},
+}
+
+@inproceedings{BenDucVin01-small,
+  author    = {Yoshua Bengio and R\'ejean Ducharme and Pascal
+               Vincent},
+  editor    = {Todd K. Leen and Thomas G. Dietterich and Volker
+               Tresp},
+  title     = {A Neural Probabilistic Language Model},
+  booktitle = {Advances in NIPS 13},
+  pages     = {932--938},
+  year      = {2001},
+  publisher = {MIT Press},
+}
+
+@inproceedings{BenDucVin01-short,
+  author    = {Y. Bengio and R. Ducharme and P. Vincent},
+  title     = {A Neural Probabilistic Language Model},
+  booktitle = {Adv. Neural Inf. Proc. Sys. 13},
+  pages     = {932--938},
+  year      = {2001},
+}
+
+@techreport{Bengio+al-2004,
+  author      = {Yoshua Bengio and Olivier Delalleau and Nicolas {Le Roux}},
+  title       = {Efficient Non-Parametric Function Induction in
+                 Semi-Supervised Learning},
+  institution = {D\'epartement d'informatique et recherche
+                 op\'erationnelle, Universit\'e de Montr\'eal},
+  number      = {1247},
+  year        = {2004},
+}
+
+@incollection{Bengio+al-2005,
+  author    = {Yoshua Bengio and Nicolas {Le Roux} and Pascal Vincent and
+               Olivier Delalleau and Patrice Marcotte},
+  editor    = NIPS18ed,
+  title     = {Convex Neural Networks},
+  booktitle = NIPS18,
+  pages     = {123--130},
+  year      = {2006},
+  publisher = {MIT Press},
+  address   = {Cambridge, MA},
+}
+
+@incollection{Bengio+al-2005-small,
+  author    = {Yoshua Bengio and Nicolas {Le Roux} and Pascal Vincent
+               and Olivier Delalleau and Patrice Marcotte},
+  title     = {Convex Neural Networks},
+  booktitle = {NIPS 18},
+  pages     = {123--130},
+  year      = {2006},
+  publisher = {MIT Press},
+  address   = {Cambridge, MA},
+}
+
+@InCollection{Bengio+al-spectral-2006-short,
+  author =       "Yoshua Bengio and Olivier Delalleau and Nicolas {Le
+                 Roux} and Jean-Fran{\c{c}}ois Paiement and Pascal Vincent
+                 and Marie Ouimet",
+  editor =       "Isabelle Guyon and Steve Gunn and Masoud Nikravesh and
+                 Lotfi Zadeh",
+  booktitle =    "Feature Extraction, Foundations and Applications",
+  title =        "Spectral Dimensionality Reduction",
+  publisher =    "Springer",
+  year =         "2006",
+}
+
+@inproceedings{Bengio+Bengio-NIPS99,
+  author    = {Yoshua Bengio and Samy Bengio},
+  editor    = NIPS12ed,
+  title     = {Modeling High-Dimensional Discrete Data with
+               Multi-Layer Neural Networks},
+  booktitle = NIPS12,
+  pages     = {400--406},
+  year      = {2000},
+  publisher = {MIT Press},
+}
+
+@article{Bengio+Bengio-trnn2000,
+  author  = {S. Bengio and Y. Bengio},
+  title   = {Taking on the Curse of Dimensionality in Joint
+             Distributions Using Neural Networks},
+  journal = {IEEE Transactions on Neural Networks, special issue on
+             Data Mining and Knowledge Discovery},
+  volume  = {11},
+  number  = {3},
+  pages   = {550--557},
+  year    = {2000},
+  url     = {http://www.iro.umontreal.ca/~lisa/pointeurs/jdm.pdf},
+}
+
+@article{Bengio+Bengio-trnn2000-small,
+  author  = {S. Bengio and Y. Bengio},
+  title   = {Taking on the Curse of Dimensionality in Joint
+             Distributions Using Neural Networks},
+  journal = {IEEE Trans. Neural Networks},
+  volume  = {11},
+  number  = {3},
+  pages   = {550--557},
+  year    = {2000},
+  url     = {http://www.iro.umontreal.ca/~lisa/pointeurs/jdm.pdf},
+}
+
+@article{Bengio+Chapados2003,
+  author  = {Yoshua Bengio and Nicolas Chapados},
+  title   = {Extensions to Metric-Based Model Selection},
+  journal = jmlr,
+  volume  = {3},
+  pages   = {1209--1227},
+  month   = mar,
+  year    = {2003},
+  note    = {Special Issue on Feature Selection},
+}
+
+@techreport{Bergstra-TR2008,
+  author      = {James Bergstra and Yoshua Bengio and Jerome Louradour},
+  title       = {Image Classification with Biologically Motivated Neuron Models},
+  institution = {Dept. IRO, Universit\'e de Montr\'eal},
+  number      = {---},
+  year        = {2008},
+}
+
+@article{Bergstra-2009,
+  author =       "James Bergstra and Yoshua Bengio and Jerome Louradour",
+  title =        "Suitability of Complex Cell Models for Object Categorization",
+  journal = {Computational Neuroscience},
+  note = "submitted",
+  year = 2008,
+}
+
+@TechReport{Bengio+Frasconi94a,
+  author =       "Y. Bengio and P. Frasconi",
+  title =        "An {EM} Approach to Learning Sequential Behavior",
+  number =       "DSI 11/94",
+  institution =  "Universit\`a di Firenze",
+  year =         "1994",
+}
+
+@article{Bengio-nc-2004,
+ author = {Yoshua Bengio and Olivier Delalleau and Nicolas {Le Roux} and Jean-Fran{\c{c}}ois Paiement and Pascal Vincent and Marie Ouimet},
+ title = {Learning eigenfunctions links spectral embedding and kernel {PCA}},
+ journal = {Neural Computation},
+ volume = 16,
+ number = 10,
+ year = 2004,
+ pages = {2197--2219},
+}
+
+@article{Bengio-nc-2004-small,
+ author = {Yoshua Bengio and Olivier Delalleau and Nicolas {Le Roux} and Jean-Fran{\c{c}}ois Paiement and Pascal Vincent and Marie Ouimet},
+ title = {{\small{Learning eigenfunctions links spectral embedding and kernel {PCA}}}},
+ journal = {Neural Comp.},
+ volume = {16(10)},
+ year = 2004,
+ pages = {2197--2219},
+}
+
+@article{Bengio+Grandvalet-JMLR-2004,
+  author  = {Yoshua Bengio and Yves Grandvalet},
+  title   = {No Unbiased Estimator of the Variance of {K}-Fold
+             Cross-Validation},
+  journal = jmlr,
+  volume  = {5},
+  pages   = {1089--1105},
+  year    = {2004},
+}
+
+@TechReport{Bengio+Grandvalet-TR-2003,
+  author =       "Yoshua Bengio and Yves Grandvalet",
+  title =        "No Unbiased Estimator of the Variance of {K}-Fold
+                 Cross-Validation",
+  number =       "TR-2003-1234",
+  institution =  "Universit\'e de Montr\'eal, dept. IRO",
+  year =         "2003",
+}
+
+@incollection{Bengio+Lecun-chapter2007,
+  author    = {Yoshua Bengio and Yann {LeCun}},
+  editor    = {L. Bottou and O. Chapelle and D. DeCoste and J.
+               Weston},
+  title     = {Scaling Learning Algorithms towards {AI}},
+  booktitle = {Large Scale Kernel Machines},
+  year      = {2007},
+  publisher = {MIT Press},
+}
+
+@incollection{Bengio+Lecun-chapter2007-small,
+  author    = {Y. Bengio and Y. {LeCun}},
+  title     = {Scaling Learning Algorithms towards {AI}},
+  booktitle = {Large Scale Kernel Machines},
+  year      = {2007},
+}
+
+@inproceedings{Bengio+LeCun94b,
+  author    = {Yoshua Bengio and Yann {LeCun}},
+  title     = {Word Normalization For On-Line Handwritten Word
+               Recognition},
+  booktitle = ICPR94,
+  pages     = {409--413},
+  year      = {1994},
+}
+
+@article{Bengio+Monperrus+Larochelle-2006,
+  author  = {Yoshua Bengio and Martin Monperrus and Hugo
+             Larochelle},
+  title   = {Nonlocal Estimation of Manifold Structure},
+  journal = {Neural Computation},
+  volume  = {18},
+  number  = {10},
+  pages   = {2509--2528},
+  year    = {2006},
+}
+
+@inproceedings{Bengio+Monperrus-2005,
+  author    = {Yoshua Bengio and Martin Monperrus},
+  editor    = NIPS17ed,
+  title     = {Non-Local Manifold Tangent Learning},
+  booktitle = NIPS17,
+  pages     = {129--136},
+  year      = {2005},
+  publisher = {{MIT} Press},
+  url       = {http://www.iro.umontreal.ca/~lisa/pointeurs/tangent\_learner\_nips2004.pdf},
+}
+
+@inproceedings{Bengio+Senecal-2003-small,
+  author    = {Yoshua Bengio and Jean-S\'ebastien Sen\'ecal},
+  title     = {Quick Training of Probabilistic Neural Nets by
+               Importance Sampling},
+  booktitle = {Proceedings of AISTATS 2003},
+  year      = {2003},
+}
+
+@TechReport{Bengio+Vincent+Paiement-TR2003,
+  author =       "Yoshua Bengio and Pascal Vincent and Jean-Fran{\c{c}}ois
+                 Paiement",
+  title =        "Learning Eigenfunctions of Similarity: Linking
+                 Spectral Clustering and Kernel {PCA}",
+  number =       "1232",
+  institution =  "D\'epartement d'informatique et recherche
+                 op\'erationnelle, Universit\'e de Montr\'eal",
+  year =         "2003",
+  URL =          "http://www.iro.umontreal.ca/~lisa/pointeurs/TR1232.pdf",
+}
+
+@techreport{Bengio-decision-trees-TR-2007,
+  author      = {Yoshua Bengio and Olivier Delalleau and Clarence
+                 Simard},
+  title       = {Trees do not Generalize to New Variations},
+  institution = {D\'epartement d'informatique et recherche
+                 op\'erationnelle, Universit\'e de Montr\'eal},
+  number      = {},
+  year        = {2007},
+}
+
+@TechReport{Bengio-decision-trees07,
+  author =       "Yoshua Bengio and Olivier Delalleau and Clarence
+                 Simard",
+  title =        "Decision Trees do not Generalize to New Variations",
+  number =       "1304",
+  institution =  "Universit\'e de Montr\'eal, Dept. IRO",
+  year =         "2007",
+  url =          "http://www.iro.umontreal.ca/~lisa/pointeurs/bengio+al-tr1304.pdf",
+}
+
+%The following entry is deprecated: it is a duplicate of the preceding tech report.
+%There was only one .tex file using it, and that file has been modified.
+@techreport{Bengio-Trees-TR2007,
+  author      = {Yoshua Bengio and Olivier Delalleau and Clarence
+                 Simard},
+  title       = {Decision Trees do not Generalize to New Variations},
+  institution = {Dept. IRO, Universit\'e de Montr\'eal},
+  number      = {1304},
+  year        = {2007},
+  url         = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio+al-tr1304.pdf},
+}
+
+@article{Bengio-hmms99,
+  author  = {Yoshua Bengio},
+  title   = {Markovian Models for Sequential Data},
+  journal = {Neural Computing Surveys},
+  volume  = {2},
+  pages   = {129--162},
+  year    = {1999},
+}
+
+@article{bengio-hyper-NC00,
+  author  = {Yoshua Bengio},
+  title   = {Gradient-Based Optimization of Hyperparameters},
+  journal = {Neural Computation},
+  volume  = {12},
+  number  = {8},
+  pages   = {1889--1900},
+  year    = {2000},
+}
+
+@techreport{bengio-hyper-TR98,
+  author      = {Yoshua Bengio},
+  title       = {Continuous Optimization of Hyper-Parameters for
+                 Non-{IID} Data},
+  institution = {D\'epartement d'informatique et recherche
+                 op\'erationnelle, Universit\'e de Montr\'eal},
+  note        = {unpublished manuscript},
+  year        = {1998},
+}
+
+@article{Bengio-Hyper-Weight-Decay-nips,
+  author  = {Simon Latendresse and Yoshua Bengio},
+  title   = {Linear Regression and the Optimization of
+             Hyper-Parameters},
+  journal = {submitted to NIPS'99},
+  year    = {1999},
+}
+
+@techreport{Bengio-Hyper-Weight-Decay-TR,
+  author      = {Yoshua Bengio and Simon Latendresse},
+  title       = {Soft Variable Selection with Numerical Optimization of
+                 Weight Decays},
+  institution = {D\'epartement d'informatique et recherche
+                 op\'erationnelle, Universit\'e de Montr\'eal},
+  note        = {in preparation},
+  year        = {1999},
+}
+
+@Article{Bengio-ijns97,
+  author =       "Yoshua Bengio",
+  title =        "Using a Financial Training Criterion Rather than a
+                 Prediction Criterion",
+  journal =      "International Journal of Neural Systems",
+  year =         "1997",
+  volume =       {8},
+  number =       {4},
+  note =         "Special issue on noisy time-series",
+  pages =        {433--443},
+  URL =          "http://www.iro.umontreal.ca/~lisa/pointeurs/profitcost.ps",
+}
+
+@article{Bengio-IEEETRNN-2001,
+  author  = {Yoshua Bengio and Vincent-Philippe Lauzon and R\'ejean
+             Ducharme},
+  title   = {Experiments on the Application of {IOHMM}s to Model
+             Financial Returns Series},
+  journal = ieeetrnn,
+  volume  = {12},
+  number  = {1},
+  pages   = {113--123},
+  year    = {2001},
+}
+
+@inproceedings{Bengio-Larochelle-NLMP-NIPS-2006,
+  author    = {Yoshua Bengio and Hugo Larochelle and Pascal Vincent},
+  editor    = NIPS18ed,
+  title     = {Non-Local Manifold Parzen Windows},
+  booktitle = NIPS18,
+  pages     = {115--122},
+  year      = {2006},
+  publisher = {MIT Press},
+}
+
+@techreport{Bengio-Larochelle-NLMP-TR-2005,
+  author      = {Yoshua Bengio and Hugo Larochelle},
+  title       = {Non-Local Manifold Parzen Windows},
+  institution = {D\'epartement d'informatique et recherche
+                 op\'erationnelle, Universit\'e de Montr\'eal},
+  number      = {1264},
+  year        = {2005},
+}
+
+%This submission was rejected; the work was later accepted to NIPS as Bengio-localfailure-NIPS-2006
+@inproceedings{Bengio-localfailure-icml-2005,
+  author    = {Yoshua Bengio and Olivier Delalleau and Nicolas {Le
+               Roux}},
+  title     = {The Curse of Dimensionality for Local Kernel
+               Machines},
+  booktitle = {submitted to ICML 2005},
+  year      = {2005},
+}
+
+@incollection{Bengio-localfailure-NIPS-2006,
+  author    = {Yoshua Bengio and Olivier Delalleau and Nicolas {Le Roux}},
+  editor    = NIPS18ed,
+  title     = {The Curse of Highly Variable Functions for Local
+               Kernel Machines},
+  booktitle = NIPS18,
+  pages     = {107--114},
+  year      = {2006},
+  publisher = {{MIT} Press},
+  address   = {Cambridge, MA},
+}
+
+@incollection{Bengio-localfailure-NIPS-2006-small,
+  author    = {Yoshua Bengio and Olivier Delalleau and Nicolas {Le Roux}},
+  title     = {The Curse of Highly Variable Functions for Local
+               Kernel Machines},
+  booktitle = {NIPS 18},
+  pages     = {107--114},
+  year      = {2006},
+  publisher = {{MIT} Press},
+  address   = {Cambridge, MA},
+}
+
+@inproceedings{Bengio-localfailure-snowbird-2005,
+  author    = {Yoshua Bengio and Olivier Delalleau and Nicolas {Le
+               Roux}},
+  title     = {The Curse of Dimensionality for Local Kernel
+               Machines},
+  booktitle = {The Learning Workshop},
+  year      = {2005},
+  address   = {Snowbird, Utah},
+}
+
+@inproceedings{HonglakLee-2007,
+  author    = {Honglak Lee and Alexis Battle and Rajat Raina and Andrew Ng},
+  editor    = NIPS19ed,
+  title     = {Efficient sparse coding algorithms},
+  booktitle = NIPS19,
+  pages     = {801--808},
+  year      = {2007},
+  publisher = {MIT Press},
+}
+
+@inproceedings{Bengio-nips-2006-small,
+  author    = {Y. Bengio and P. Lamblin and D. Popovici and
+               H. Larochelle},
+  title     = {Greedy Layer-Wise Training of Deep Networks},
+  booktitle = {Advances in NIPS 19},
+  year      = {2007},
+}
+
+@inproceedings{Bengio-nips-2006-short,
+  author    = {Y. Bengio and P. Lamblin and D. Popovici and
+               H. Larochelle},
+  title     = {Greedy Layer-Wise Training of Deep Networks},
+  booktitle = {Adv. Neural Inf. Proc. Sys. 19},
+  pages     = {153--160},
+  year      = {2007},
+}
+
+@InProceedings{Bengio-nips2004,
+  author =       "Yoshua Bengio and Jean-Fran{\c{c}}ois Paiement and Pascal
+                 Vincent and Olivier Delalleau and Nicolas {Le Roux} and
+                 Marie Ouimet",
+  editor =       NIPS16ed,
+  booktitle =    NIPS16,
+  title =        "Out-of-Sample Extensions for {LLE}, {Isomap}, {MDS},
+                 {Eigenmaps}, and {Spectral} {Clustering}",
+  publisher =    "MIT Press",
+  year =         "2004",
+}
+
+% Alias key: carries the same bibliographic data as Bengio-nips2004.
+@inproceedings{Bengio-nips2003,
+  author    = {Yoshua Bengio and Jean-Fran{\c{c}}ois Paiement and Pascal Vincent and Olivier Delalleau and Nicolas {Le Roux} and Marie Ouimet},
+  title     = {Out-of-Sample Extensions for {LLE}, {Isomap}, {MDS}, {Eigenmaps}, and {Spectral} {Clustering}},
+  editor    = NIPS16ed,
+  booktitle = NIPS16,
+  publisher = {MIT Press},
+  year      = {2004},
+}
+
+@incollection{Bengio-NIPS2007,
+  author    = {Yoshua Bengio and Pascal Lamblin and Dan Popovici and Hugo Larochelle},
+  title     = {Greedy Layer-Wise Training of Deep Networks},
+  editor    = NIPS19ed,
+  booktitle = NIPS19,
+  pages     = {153--160},
+  publisher = {MIT Press},
+  year      = {2007},
+}
+
+@inproceedings{Bengio-nnlm2001,
+  author    = {Yoshua Bengio and R{\'e}jean Ducharme and Pascal Vincent},
+  title     = {A Neural Probabilistic Language Model},
+  editor    = NIPS13ed,
+  booktitle = NIPS13,
+  pages     = {933--938},
+  publisher = {{MIT} Press},
+  year      = {2001},
+  url       = {http://www.iro.umontreal.ca/~lisa/pointeurs/nips00-lm.ps},
+}
+
+@article{Bengio-nnlm2003,
+  author  = {Yoshua Bengio and R{\'e}jean Ducharme and Pascal Vincent and Christian Jauvin},
+  title   = {A Neural Probabilistic Language Model},
+  journal = jmlr,
+  volume  = {3},
+  pages   = {1137--1155},
+  year    = {2003},
+}
+
+@article{Bengio-nnlm2003-small,
+  author  = {Y. Bengio and R. Ducharme and P. Vincent and C. Jauvin},
+  title   = {A Neural Probabilistic Language Model},
+  journal = {JMLR},
+  volume  = {3},
+  pages   = {1137--1155},
+  year    = {2003},
+}
+
+% Not a published article: the submission status lives in `note`, not `journal`.
+@unpublished{Bengio-NonStat-Hyper-ML,
+  author = {Yoshua Bengio and Charles Dugas},
+  title  = {Learning Simple Non-Stationarities with Hyper-Parameters},
+  note   = {Submitted to Machine Learning},
+  year   = {1999},
+}
+
+@article{Bengio-prel92,
+  author  = {Y. Bengio and M. Gori and R. {De Mori}},
+  title   = {Learning the Dynamic Nature of Speech with Back-propagation for Sequences},
+  journal = prel,
+  volume  = {13},
+  number  = {5},
+  pages   = {375--385},
+  year    = {1992},
+  note    = {(Special issue on Artificial Neural Networks)},
+}
+
+% Volume, number and pages match the abbreviated entry Bengio-2009-short for the same article.
+@article{Bengio-2008,
+  author  = {Yoshua Bengio},
+  title   = {Learning Deep Architectures for {AI}},
+  journal = {Foundations and Trends in Machine Learning},
+  volume  = {2},
+  number  = {1},
+  pages   = {1--127},
+  year    = {2009},
+}
+
+@article{Bengio-2009-short,
+  author  = {Y. Bengio},
+  title   = {Learning Deep Architectures for {AI}},
+  journal = {Foundations \& Trends in Mach. Learn.},
+  volume  = {2},
+  number  = {1},
+  pages   = {1--127},
+  year    = {2009},
+}
+
+@techreport{Bengio-TR1312-small,
+  author      = {Yoshua Bengio},
+  title       = {Learning Deep Architectures for {AI}},
+  institution = {U. Montr\'eal, dept. IRO},
+  number      = {1312},
+  year        = {2007},
+}
+
+@inproceedings{Bengio-transducers-98,
+  author    = {Y. Bengio and S. Bengio and J. F. Isabelle and Y. Singer},
+  title     = {Shared Context Probabilistic Transducers},
+  editor    = NIPS10ed,
+  booktitle = NIPS10,
+  pages     = {409--415},
+  publisher = {MIT Press},
+  year      = {1998},
+}
+
+@article{Bengio-trnn92,
+  author  = {Y. Bengio and R. {De Mori} and G. Flammia and R. Kompe},
+  title   = {Global Optimization of a Neural Network-Hidden {Markov} Model Hybrid},
+  journal = ieeetrnn,
+  volume  = {3},
+  number  = {2},
+  pages   = {252--259},
+  year    = {1992},
+}
+
+@article{Bengio-trnn93,
+  author  = {Y. Bengio and P. Simard and P. Frasconi},
+  title   = {Learning Long-Term Dependencies with Gradient Descent is Difficult},
+  journal = ieeetrnn,
+  volume  = {5},
+  number  = {2},
+  pages   = {157--166},
+  year    = {1994},
+  url     = {http://www.iro.umontreal.ca/~lisa/pointeurs/ieeetrnn94.pdf},
+  OPTnote = {(Special Issue on Recurrent Neural Networks)},
+}
+
+@article{Bengio-trnn96,
+  author  = {Y. Bengio and P. Frasconi},
+  title   = {Input/{Output} {HMM}s for Sequence Processing},
+  journal = {IEEE Transactions on Neural Networks},
+  volume  = {7},
+  number  = {5},
+  pages   = {1231--1249},
+  year    = {1996},
+}
+
+@techreport{Bengio2003,
+  author      = {Christopher Kermorvant and Yoshua Bengio},
+  title       = {Extracting Hidden Sense Probabilities from Bitexts},
+  institution = {Universit{\'e} de Montr{\'e}al},
+  number      = {1231},
+  year        = {2003},
+}
+
+@inproceedings{Bengio89b,
+  author    = {Y. Bengio and P. Cosi and R. Cardin and R. {De Mori}},
+  title     = {Use of multi-layered networks for coding speech with phonetic features},
+  editor    = NIPS1ed,
+  booktitle = NIPS1,
+  pages     = {224--231},
+  publisher = {Morgan Kaufmann},
+  address   = {San Mateo, CA},
+  year      = {1989},
+}
+
+@phdthesis{Bengio91,
+  author  = {Yoshua Bengio},
+  title   = {Artificial Neural Networks and their Application to Sequence Recognition},
+  school  = {McGill University, (Computer Science)},
+  address = {Montreal, Qc., Canada},
+  year    = {1991},
+}
+
+@inproceedings{bengio91x,
+  author     = {Y. Bengio and R. {De Mori} and G. Flammia and R. Kompe},
+  title      = {Global Optimization of a Neural Network - Hidden Markov Model Hybrid},
+  booktitle  = ijcnn,
+  volume     = {2},
+  pages      = {789--794},
+  year       = {1991},
+  OPTaddress = {Seattle WA},
+}
+
+@article{Becker92,
+  author  = {Sue Becker and Geoffrey Hinton},
+  title   = {A self-organizing neural network that discovers surfaces in random-dot stereograms},
+  journal = {Nature},
+  volume  = {355},
+  pages   = {161--163},
+  year    = {1992},
+}
+ 
+@article{Bengio93,
+  author  = {Yoshua Bengio},
+  title   = {A Connectionist Approach to Speech Recognition},
+  journal = {International Journal on Pattern Recognition and Artificial Intelligence},
+  volume  = {7},
+  number  = {4},
+  pages   = {647--668},
+  year    = {1993},
+  note    = {special issue entitled Advances in Pattern Recognition Systems using Neural Networks},
+}
+
+@inproceedings{Bengio93e,
+  author    = {S. Bengio and Y. Bengio and J. Cloutier and J. Gecsei},
+  title     = {Generalization of a Parametric Learning Rule},
+  editor    = {S. Gielen and B. Kappen},
+  booktitle = {Proceedings of the International Conference on Artificial Neural Networks 1993},
+  pages     = {502},
+  publisher = {Springer-Verlag},
+  address   = {Amsterdam, The Netherlands},
+  year      = {1993},
+}
+
+@article{bengio:1999:nc,
+  author  = {S. Bengio and Y. Bengio and J. Robert and G. B\'elanger},
+  title   = {Stochastic Learning of Strategic Equilibria for Auctions},
+  journal = {Neural Computation},
+  volume  = {11},
+  number  = {5},
+  pages   = {1199--1209},
+  year    = {1999},
+}
+
+@article{bottou+al:1999,
+  author  = {L. Bottou and P. Haffner and P. G. Howard and P. Simard and Y. Bengio},
+  title   = {High quality document image compression with {DjVu}},
+  journal = {Journal of Electronic Imaging},
+  volume  = {7},
+  number  = {3},
+  pages   = {410--425},
+  year    = {1998},
+}
+
+@article{bengio+al:1998,
+  author  = {Y. Bengio and F. Gingras and B. Goulard and J.-M. Lina},
+  title   = {Gaussian Mixture Densities for Classification of Nuclear Power Plant Data},
+  journal = {Computers and Artificial Intelligence},
+  volume  = {17},
+  number  = {2--3},
+  pages   = {189--209},
+  year    = {1998},
+  note    = {Special issue on Intelligent Technologies for Electric and Nuclear Power Plants},
+}
+
+@article{GingrasBengio:1998,
+  author  = {F. Gingras and Y. Bengio},
+  title   = {Handling Asynchronous or Missing Financial Data with Recurrent Networks},
+  journal = {International Journal of Computational Intelligence and Organizations},
+  volume  = {1},
+  number  = {3},
+  pages   = {154--163},
+  year    = {1998},
+}
+
+@article{BengioS95,
+  author  = {S. Bengio and Y. Bengio and J. Cloutier},
+  title   = {On the search for new learning rules for {ANN}s},
+  journal = {Neural Processing Letters},
+  volume  = {2},
+  number  = {4},
+  pages   = {26--30},
+  year    = {1995},
+}
+
+@article{BengioMori89,
+  author  = {Y. Bengio and R. {De Mori}},
+  title   = {Use of multilayer networks for the recognition of phonetic features and phonemes},
+  journal = {Computational Intelligence},
+  volume  = {5},
+  pages   = {134--141},
+  year    = {1989},
+}
+
+@techreport{BengioTR1178,
+  author      = {Yoshua Bengio and R\'ejean Ducharme and Pascal Vincent},
+  title       = {A Neural Probabilistic Language Model},
+  institution = {Dept. IRO, Universit\'e de Montr\'eal},
+  number      = {1178},
+  year        = {2002},
+}
+
+@techreport{BengioTR1215,
+  author      = {Yoshua Bengio},
+  title       = {New Distributed Probabilistic Language Models},
+  institution = {Dept. IRO, Universit\'e de Montr\'eal},
+  number      = {1215},
+  year        = {2002},
+}
+
+@book{Bengio_book96,
+  author    = {Yoshua Bengio},
+  title     = {Neural Networks for Speech and Sequence Processing},
+  publisher = {International Thomson Computer Press},
+  year      = {1996},
+}
+
+@inproceedings{Bengio_icnn93,
+  author    = {Y. Bengio and P. Frasconi and P. Simard},
+  title     = {The problem of learning long-term dependencies in recurrent networks},
+  booktitle = icnn,
+  pages     = {1183--1195},
+  publisher = {IEEE Press},
+  address   = {San Francisco},
+  year      = {1993},
+  note      = {(invited paper)},
+}
+
+@article{Bengio_trnn94,
+  author  = {Y. Bengio and P. Simard and P. Frasconi},
+  title   = {Learning Long-Term Dependencies with Gradient Descent is Difficult},
+  journal = ieeetrnn,
+  volume  = {5},
+  number  = {2},
+  pages   = {157--166},
+  year    = {1994},
+  note    = {Special Issue on Recurrent Neural Networks, March 94},
+}
+
+@book{Benveniste90,
+  author    = {A. Benveniste and M. Metivier and P. Priouret},
+  title     = {Adaptive Algorithms and Stochastic Approximations},
+  publisher = {Springer-Verlag},
+  address   = {Berlin, New York},
+  year      = {1990},
+}
+
+@book{Berger85,
+  author    = {J. Berger},
+  title     = {Statistical Decision Theory and {Bayesian} Analysis},
+  publisher = {Springer},
+  year      = {1985},
+}
+
+@misc{berger97improved,
+  author = {A. Berger},
+  title  = {The improved iterative scaling algorithm: {A} gentle introduction},
+  year   = {1997},
+  url    = {http://citeseer.ist.psu.edu/berger97improved.html},
+  text   = {Berger, A. (1997). The improved iterative scaling algorithm: A gentle introduction. http://www.cs.cmu.edu/afs/cs/user/aberger/www/ps/scaling.ps.},
+}
+
+@article{Berkes-Wiskott-2005,
+  author  = {Berkes, Pietro and Wiskott, Laurenz},
+  title   = {Slow Feature Analysis Yields a Rich Repertoire of Complex Cell Properties},
+  journal = {Journal of Vision},
+  volume  = {5},
+  number  = {6},
+  pages   = {579--602},
+  month   = jul,
+  year    = {2005},
+  issn    = {1534-7362},
+}
+
+@article{Beurle56,
+  author  = {R. L. Beurle},
+  title   = {Properties of a Mass of Cells Capable of Regenerating Pulses},
+  journal = PTRSL,
+  volume  = {240},
+  pages   = {55--94},
+  year    = {1956},
+}
+
+@inproceedings{Beyer+al-1999,
+  author    = {Kevin S. Beyer and Jonathan Goldstein and Raghu Ramakrishnan and Uri Shaft},
+  title     = {When Is ``Nearest Neighbor'' Meaningful?},
+  booktitle = {Proceedings of the 7th International Conference on Database Theory},
+  pages     = {217--235},
+  publisher = {Springer-Verlag},
+  year      = {1999},
+  isbn      = {3-540-65452-6},
+}
+
+@techreport{Bianchini-rbf,
+  author      = {M. Bianchini and P. Frasconi and M. Gori},
+  title       = {Learning without Local Minima in Radial Basis Function Networks},
+  institution = {Universit\`a di Firenze},
+  year        = {1992},
+  OPTannote   = {},
+}
+
+@article{Bianchini-trnn94,
+  author  = {M. Bianchini and M. Gori and M. Maggini},
+  title   = {On the Problem of Local Minima in Recurrent Neural Networks},
+  journal = ieeetrnn,
+  volume  = {5},
+  number  = {2},
+  pages   = {167--177},
+  year    = {1994},
+  OPTnote = {(Special Issue on Recurrent Neural Networks)},
+}
+
+% `year` holds only the four-digit year; the revision history is kept in `note`.
+@techreport{bickel+ritov95,
+  author      = {P. J. Bickel and Y. Ritov},
+  title       = {Inference in hidden {Markov} models {I}: local asymptotic normality in the stationary case},
+  institution = {Statistics Department, University of California, Berkeley},
+  number      = {383},
+  year        = {1995},
+  note        = {February 1994, revised April 1995},
+}
+
+@article{Bienenstock82,
+  author  = {E. L. Bienenstock and L. N. Cooper and P. W. Munro},
+  title   = {Theory for the Development of Neuron Selectivity: Orientation Specificity and Binocular Interaction in Visual Cortex},
+  journal = jneuro,
+  volume  = {2},
+  year    = {1982},
+}
+
+% The citation key keeps its historical spelling so existing citations still resolve.
+@article{BierdermanI1987,
+  author     = {Irving Biederman},
+  title      = {Recognition-by-Components: {A} Theory of Human Image Understanding},
+  journal    = {Psychological Review},
+  volume     = {94},
+  number     = {2},
+  pages      = {115--147},
+  publisher  = {American Psychological Association, Inc.},
+  year       = {1987},
+  added-by   = {Daniel Acevedo},
+  date-added = {Thu Oct 24 12:45:17 2002},
+  project    = {genetic},
+  theme      = {perception and vr and tech and natural and medicine and art},
+}
+
+@inproceedings{Bilbro89a,
+  author    = {G. Bilbro and R. Mann and T. K. Miller and W. E. Snyder and D. E. {Van den Bout} and M. White},
+  title     = {Optimization by Mean Field Annealing},
+  editor    = NIPS1ed,
+  booktitle = NIPS1,
+  pages     = {91--98},
+  publisher = {Morgan Kaufmann},
+  address   = {San Mateo, CA},
+  year      = {1989},
+}
+
+@inproceedings{Bilbro89b,
+  author    = {G. L. Bilbro and W. Snyder},
+  title     = {Range Image Restoration Using Mean Field Annealing},
+  editor    = NIPS1ed,
+  booktitle = NIPS1,
+  pages     = {594--601},
+  publisher = {Morgan Kaufmann},
+  address   = {San Mateo, CA},
+  year      = {1989},
+}
+
+@article{Binder86,
+  author  = {K. Binder and A. P. Young},
+  title   = {Spin Glasses: Experimental Facts, Theoretical Concepts, and Open Questions},
+  journal = rmp,
+  volume  = {58},
+  pages   = {801--976},
+  year    = {1986},
+}
+
+@book{Binder88,
+  author    = {K. Binder and D. W. Heermann},
+  title     = {Monte Carlo Simulation in Statistical Physics},
+  publisher = {Springer-Verlag},
+  address   = {Berlin},
+  year      = {1988},
+}
+
+@book{bishop-book2006,
+  author    = {Christopher M. Bishop},
+  title     = {Pattern Recognition and Machine Learning},
+  publisher = {Springer},
+  year      = {2006},
+}
+
+@book{bishop-book95,
+  author    = {Christopher Bishop},
+  title     = {Neural Networks for Pattern Recognition},
+  publisher = {Oxford University Press},
+  address   = {London, UK},
+  year      = {1995},
+}
+
+@article{bishop92,
+  author  = {Christopher Bishop},
+  title   = {Exact calculation of the {Hessian} matrix for the multi-layer perceptron},
+  journal = {Neural Computation},
+  volume  = {4},
+  number  = {4},
+  pages   = {494--501},
+  year    = {1992},
+}
+
+@article{bishop95training,
+  author  = {Christopher M. Bishop},
+  title   = {Training with Noise is Equivalent to {Tikhonov} Regularization},
+  journal = {Neural Computation},
+  volume  = {7},
+  number  = {1},
+  pages   = {108--116},
+  year    = {1995},
+}
+
+@article{Blackscholes73,
+  author  = {F. Black and M. Scholes},
+  title   = {The Pricing of Options and Corporate Liabilities},
+  journal = {Journal of Political Economy},
+  volume  = {81},
+  number  = {3},
+  pages   = {637--654},
+  year    = {1973},
+}
+
+@article{Blakemore70,
+  author  = {C. Blakemore and G. F. Cooper},
+  title   = {Development of the Brain Depends on the Visual Environment},
+  journal = nature,
+  volume  = {228},
+  pages   = {477--478},
+  year    = {1970},
+}
+
+@incollection{Blitzer-nips17,
+  author    = {John Blitzer and Kilian Weinberger and Lawrence Saul and Fernando Pereira},
+  title     = {Hierarchical Distributed Representations for Statistical Language Modeling},
+  editor    = NIPS17ed,
+  booktitle = NIPS17,
+  publisher = {MIT Press},
+  address   = {Cambridge, MA},
+  year      = {2005},
+}
+
+% Same paper as Blitzer-nips17 and Blitzer2005; the volume macros follow those entries.
+@inproceedings{Blitzer05,
+  author    = {John Blitzer and Kilian Weinberger and Lawrence Saul and Fernando Pereira},
+  title     = {Hierarchical Distributed Representations for Statistical Language Modeling},
+  editor    = NIPS17ed,
+  booktitle = NIPS17,
+  publisher = {MIT Press},
+  address   = {Cambridge, MA},
+  year      = {2005},
+}
+
+@inproceedings{Blitzer2005,
+  author    = {J. Blitzer and K. Q. Weinberger and L. K. Saul and F. C. N. Pereira},
+  title     = {Hierarchical Distributed Representations for Statistical Language Modeling},
+  editor    = NIPS17ed,
+  booktitle = NIPS17,
+  publisher = {{MIT} Press},
+  year      = {2005},
+}
+
+@article{Block62,
+  author  = {H. D. Block},
+  title   = {The Perceptron: {A} Model for Brain Functioning},
+  journal = rmp,
+  volume  = {34},
+  year    = {1962},
+}
+
+@inproceedings{Blum+Rivest,
+  author    = {A. Blum and R. L. Rivest},
+  title     = {Training a 3-node Neural Net is {NP}-Complete},
+  editor    = NIPS1ed,
+  booktitle = NIPS1,
+  pages     = {494--501},
+  publisher = {Morgan Kaufmann},
+  address   = {San Mateo, CA},
+  year      = {1989},
+}
+
+@inproceedings{blum01learning,
+  author    = {Avrim Blum and Shuchi Chawla},
+  title     = {Learning from Labeled and Unlabeled Data Using Graph Mincuts},
+  booktitle = {Proc. 18th International Conf. on Machine Learning},
+  pages     = {19--26},
+  publisher = {Morgan Kaufmann},
+  address   = {San Francisco, CA},
+  year      = {2001},
+}
+  %URL =          "citeseer.ist.psu.edu/blum01learning.html",
+
+@inproceedings{blum98combining,
+  author    = {Avrim Blum and Tom Mitchell},
+  title     = {Combining Labeled and Unlabeled Data with Co-training},
+  booktitle = colt98,
+  pages     = {92--100},
+  publisher = {Morgan Kaufmann Publishers},
+  year      = {1998},
+}
+  %URL =          "citeseer.ist.psu.edu/blum98combining.html",
+
+@inproceedings{blum98combining-small,
+  author    = {Avrim Blum and Tom Mitchell},
+  title     = {Combining Labeled and Unlabeled Data with Co-training},
+  booktitle = {COLT'98},
+  pages     = {92--100},
+  year      = {1998},
+}
+  %URL =          "citeseer.ist.psu.edu/blum98combining.html",
+
+@inproceedings{blum99,
+  author    = {A. Blum and A. Kalai and J. Langford},
+  title     = {Beating the hold-out: Bounds for k-fold and progressive cross-validation},
+  booktitle = colt99,
+  pages     = {203--208},
+  year      = {1999},
+}
+
+@inproceedings{Blumer86,
+  author    = {A. Blumer and A. Ehrenfeucht and D. Haussler and M. Warmuth},
+  title     = {Classifying Learnable Geometric Concepts with the Vapnik-Chervonenkis Dimension},
+  booktitle = {Proceedings of the Eighteenth Annual ACM Symposium on Theory of Computing},
+  pages     = {273--282},
+  publisher = {ACM},
+  address   = {Berkeley, CA},
+  year      = {1986},
+}
+
+@article{Blumer87,
+  author  = {A. Blumer and A. Ehrenfeucht and D. Haussler and M. Warmuth},
+  title   = {Occam's razor},
+  journal = {Information Processing Letters},
+  volume  = {24},
+  pages   = {377--380},
+  year    = {1987},
+}
+
+@article{Blumstein79,
+  author  = {S. E. Blumstein and K. N. Stevens},
+  title   = {Acoustic invariance in speech production: Evidence from measurements of the spectral characteristics of stop consonants},
+  journal = {Journal of the Acoustical Society of America},
+  volume  = {66},
+  number  = {4},
+  pages   = {1001--1018},
+  year    = {1979},
+}
+
+@article{Bohm96,
+  author  = {G. Bohm},
+  title   = {New approaches in molecular structure prediction},
+  journal = {Biophys. Chem.},
+  volume  = {59},
+  pages   = {1--32},
+  year    = {1996},
+}
+
+@article{Bohr88,
+  author  = {H. Bohr and J. Bohr and S. Brunak and R. M. J. Cotterill and B. Lautrup and L. Norskov and O. H. Olsen and S. B. Petersen},
+  title   = {Protein Secondary Structure and Homology by Neural Networks: The $\alpha$-Helices in Rhodopsin},
+  journal = febsl,
+  volume  = {241},
+  pages   = {223--228},
+  year    = {1988},
+}
+
+@inproceedings{bollacker98,
+  author    = {Kurt D. Bollacker and Joydeep Ghosh},
+  title     = {A Supra-Classifier Architecture for Scalable Knowledge Reuse},
+  editor    = ICML98ed,
+  booktitle = ICML98,
+  pages     = {64--72},
+  publisher = ICML98publ,
+  address   = {San Francisco, CA, USA},
+  year      = {1998},
+}
+
+@inproceedings{BonillaE2007,
+  author    = {Edwin V. Bonilla and Felix V. Agakov and Christopher K. I. Williams},
+  title     = {Kernel Multi-task Learning using Task-specific Features},
+  booktitle = {Proceedings of AISTATS 2007},
+  year      = {2007},
+}
+
+@article{Bonomo94,
+  author  = {M. Bonomo and R. Garcia},
+  title   = {Can a well-fitted equilibrium asset-pricing model produce mean reversion?},
+  journal = {Journal of Applied Econometrics},
+  volume  = {9},
+  pages   = {19--29},
+  year    = {1994},
+}
+
+@article{bordes-09,
+  author  = {Bordes, Antoine and Bottou, L{\'e}on and Gallinari, Patrick},
+  title   = {SGD-QN: Careful Quasi-Newton Stochastic Gradient Descent},
+  journal = {Journal of Machine Learning Research},
+  volume  = {10},
+  pages   = {1737--1754},
+  month   = jul,
+  year    = {2009},
+}
+
+% Edited volume: the original title text reads "edited by Marc H. Bornstein", so he is recorded as editor.
+@book{Bornstein-critical-87,
+  editor    = {Bornstein, Marc H.},
+  title     = {Sensitive Periods in Development: Interdisciplinary Perspectives},
+  publisher = {Lawrence Erlbaum Associates},
+  address   = {Hillsdale, NJ},
+  year      = {1987},
+}
+
+
+@article{boser-92,
+  author  = {B. Boser and E. Sackinger and J. Bromley and Y. {LeCun} and L. Jackel},
+  title   = {An analog neural network processor with programmable topology},
+  journal = {IEEE Journal of Solid-State Circuits},
+  volume  = {26},
+  number  = {12},
+  pages   = {2017--2025},
+  month   = dec,
+  year    = {1991},
+}
+
+@inproceedings{Boser92,
+  author    = {Bernhard E. Boser and Isabelle M. Guyon and Vladimir N. Vapnik},
+  title     = {A training algorithm for optimal margin classifiers},
+  booktitle = {Fifth Annual Workshop on Computational Learning Theory},
+  pages     = {144--152},
+  publisher = {ACM},
+  address   = {Pittsburgh},
+  year      = {1992},
+  isbn      = {0-89791-497-X},
+  doi       = {10.1145/130385.130401},
+}
+
+@incollection{bottou-bousquet-2008,
+  author    = {Bottou, L\'{e}on and Bousquet, Olivier},
+  title     = {The Tradeoffs of Large Scale Learning},
+  editor    = NIPS20ed,
+  booktitle = NIPS20,
+  volume    = {20},
+  publisher = {MIT Press},
+  address   = {Cambridge, MA},
+  year      = {2008},
+  url       = {http://leon.bottou.org/papers/bottou-bousquet-2008},
+}
+
+@techreport{Bottou+96,
+  author      = {L{\'e}on Bottou and Yoshua Bengio and Yann A. {Le Cun}},
+  title       = {Document Analysis with Generalized Transduction},
+  institution = {AT\&T Laboratories},
+  address     = {Holmdel, New-Jersey},
+  number      = {HA6156000-960701-01TM},
+  month       = jul,
+  year        = {1996},
+}
+
+@article{Bottou+LeCun05,
+  author  = {L{\'e}on Bottou and Yann {LeCun}},
+  title   = {Graph Transformer Networks for Image Recognition},
+  journal = {Bulletin of the International Statistical Institute},
+  year    = {2005},
+}
+
+@techreport{bottou-1996a,
+  author      = {L{\'e}on Bottou and Yoshua Bengio and Yann {Le Cun}},
+  title       = {Document Analysis with Transducers},
+  type        = {Technical Memorandum},
+  institution = {AT\&T Labs},
+  number      = {960701-01-TM},
+  month       = jun,
+  year        = {1996},
+}
+
+% Published in the NIPS 16 volume (MIT Press, 2004), the one Bengio-nips2004 also cites for year 2004.
+@inproceedings{bottou-lecun-04b,
+  author    = {L{\'e}on Bottou and Yann {LeCun}},
+  title     = {Large-Scale On-Line Learning},
+  editor    = NIPS16ed,
+  booktitle = NIPS16,
+  publisher = {MIT Press},
+  year      = {2004},
+  original  = {orig/bottou-lecun-04b.ps.gz},
+}
+
+@incollection{bottou-mlss-2004,
+  author    = {L{\'e}on Bottou},
+  title     = {Stochastic Learning},
+  editor    = {Olivier Bousquet and Ulrike von Luxburg},
+  booktitle = {Advanced Lectures on Machine Learning},
+  series    = {Lecture Notes in Artificial Intelligence},
+  number    = {3176},
+  pages     = {146--168},
+  publisher = {Springer Verlag},
+  address   = {Berlin},
+  year      = {2004},
+  url       = {http://leon.bottou.org/papers/bottou-mlss-2004},
+}
+
+@article{Bottou90,
+  author  = {L. Bottou and F. Fogelman-Souli\'e and P. Blanchet and J. S. Lienard},
+  title   = {Speaker independent isolated digit recognition: multilayer perceptrons vs dynamic time warping},
+  journal = {Neural Networks},
+  volume  = {3},
+  pages   = {453--465},
+  year    = {1990},
+  key     = {bottou},
+}
+
+@inproceedings{Bottou91,
+  author    = {L. Bottou and P. Gallinari},
+  title     = {A Framework for the Cooperation of Learning Algorithms},
+  editor    = NIPS3ed,
+  booktitle = NIPS3,
+  pages     = {781--788},
+  address   = {Denver, CO},
+  year      = {1991},
+}
+
+@article{Bottou92,
+  author  = {L. Bottou and V. Vapnik},
+  title   = {Local Learning Algorithms},
+  journal = nc,
+  volume  = {4},
+  number  = {6},
+  pages   = {888--900},
+  year    = {1992},
+  key     = {Bottou92},
+}
+
+@inproceedings{Bottou94,
+  author    = {L. Bottou and C. Cortes and J. S. Denker and H. Drucker and I. Guyon and L. D. Jackel and Y. {LeCun} and U. A. Muller and E. Sackinger and P. Simard and V. Vapnik},
+  title     = {Comparison of classifier methods: a case study in handwritten digit recognition},
+  booktitle = {International Conference on Pattern Recognition},
+  address   = {Jerusalem, Israel},
+  year      = {1994},
+}
+
+@inproceedings{Bottou97,
+  author    = {L{\'e}on Bottou and Yoshua Bengio and Yann {LeCun}},
+  title     = {Global Training of Document Processing Systems using Graph Transformer Networks},
+  booktitle = cvpr97,
+  pages     = {490--494},
+  publisher = {IEEE},
+  address   = {Puerto Rico},
+  year      = {1997},
+}
+
+@incollection{Bottou98,
+  author    = {L{\'e}on Bottou},
+  title     = {Online Algorithms and Stochastic Approximations},
+  editor    = {David Saad},
+  booktitle = {Online Learning in Neural Networks},
+  publisher = {Cambridge University Press},
+  address   = {Cambridge, UK},
+  year      = {1998},
+}
+
+@phdthesis{Bottou_these91,
+  author = {L{\'e}on Bottou},
+  title  = {Une approche th{\'e}orique de l'apprentissage connexionniste; applications {\`a} la reconnaissance de la parole},
+  school = {Universit{\'e} de Paris XI},
+  year   = {1991},
+}
+
+@inproceedings{BouchardG2004,
+  author    = {Guillaume Bouchard and Bill Triggs},
+  title     = {The Tradeoff Between Generative and Discriminative Classifiers},
+  booktitle = {IASC International Symposium on Computational Statistics (COMPSTAT)},
+  pages     = {721--728},
+  address   = {Prague},
+  month     = aug,
+  year      = {2004},
+  keywords  = {LEAR, LAVA},
+}
+  %URL =          "http://lear.inrialpes.fr/pubs/2004/BT04",
+
+% The ISBN and DOI recorded for this paper belong to the IEEE ICMLA 2007 proceedings, not ICML 2007.
+@inproceedings{BouchardG2007,
+  author    = {Guillaume Bouchard},
+  title     = {Bias-Variance Tradeoff in Hybrid Generative-Discriminative Models},
+  booktitle = {Sixth International Conference on Machine Learning and Applications ({ICMLA} 2007)},
+  pages     = {124--129},
+  publisher = {IEEE Computer Society},
+  address   = {Washington, DC, USA},
+  year      = {2007},
+  isbn      = {0-7695-3069-9},
+  doi       = {10.1109/ICMLA.2007.23},
+}
+ %doi = {http://dx.doi.org/10.1109/ICMLA.2007.23},
+
+@article{Bourlard-cspla89,
+  author  = {H. Bourlard and C. Wellekens},
+  title   = {Speech Pattern Discrimination and Multi-Layered Perceptrons},
+  journal = cspla,
+  volume  = {3},
+  pages   = {1--19},
+  year    = {1989},
+}
+
+@article{Bourlard-pami90,
+  author  = {H. Bourlard and C. Wellekens},
+  title   = {Links between Hidden {Markov} Models and Multilayer Perceptrons},
+  journal = ieeetpami,
+  volume  = {12},
+  pages   = {1167--1178},
+  year    = {1990},
+}
+
+@article{Bourlard88,
+  author  = {H. Bourlard and Y. Kamp},
+  title   = {Auto-Association by Multilayer Perceptrons and Singular Value Decomposition},
+  journal = biocyb,
+  volume  = {59},
+  pages   = {291--294},
+  year    = {1988},
+}
+
+@book{Bourlard93,
+  author    = {H. Bourlard and N. Morgan},
+  title     = {Connectionist Speech Recognition. {A} Hybrid Approach},
+  series    = {The Kluwer international series in engineering and computer science},
+  volume    = {247},
+  publisher = {Kluwer Academic Publishers},
+  address   = {Boston},
+  year      = {1993},
+}
+
+@Article{Bourlard_cspla89,
+  author =       "H. Bourlard and C. Wellekens",
+  title =        "Speech Pattern Discrimination and Multi-Layered
+                 Perceptrons",
+  journal =      cspla,
+  volume =       "3",
+  pages =        "1--19",
+  year =         "1989",
+  OPTnote =      "",
+}
+
+@InCollection{Bourrely89,
+  author =       "J. Bourrely",
+  booktitle =    "Hypercube and distributed computers",
+  title =        "Parallelization of a Neural Learning Algorithm on a
+                 Hypercube",
+  publisher =    "Elsevier Science Publishing, North Holland",
+  pages =        "219--229",
+  year =         "1989",
+}
+
+@inproceedings{Bouveyron-Chipman-2007,
+  author    = "C. Bouveyron and H. Chipman",
+  title     = "Visualization and classification of graph-structured data: the case of the {E}nron dataset",
+  booktitle = ijcnn,
+  year      = "2007",
+  pages     = "1506--1511"
+}
+
+@book{Box73,
+  author    = {G. E. P. Box and G. C. Tiao},
+  title     = {Bayesian inference in statistical analysis},
+  year      = 1973,
+  publisher = {Addison-Wesley}
+}
+
+@Book{BoxJenkins,
+  author =       "G. E. P. Box and G. M. Jenkins",
+  title =        "Time Series Analysis: Forecasting and Control",
+  publisher =    "Holden-Day",
+  address =      "San Francisco",
+  year =         "1970",
+}
+
+@book{Boyd04,
+  author    = {Stephen Boyd and Lieven Vandenberghe},
+  title     = {Convex Optimization},
+  year      = 2004,
+  publisher = {Cambridge University Press},
+  address   = {New York, NY, USA},
+  isbn      = {0-521-83378-7}
+}
+
+@incollection{Bradley+Bagnell-2009,
+ title = {Differentiable Sparse Coding},
+ author = {David M. Bradley and J. Andrew Bagnell},
+ editor =       NIPS21ed,
+ booktitle =    NIPS21,
+ pages = {113--120},
+ publisher = {NIPS Foundation},
+ year = {2009}
+}
+
+@phdthesis{Bradley-thesis,
+  author = {David Bradley},
+  title  = {Learning in Modular Systems},
+  year   = 2009,
+  school = {The Robotics Institute, Carnegie Mellon University}
+}
+
+@article{Brady-ieeecas89,
+  author  = {M. L. Brady and R. Raghavan and J. Slawny},
+  title   = {Back-Propagation Fails to Separate Where
+             Perceptrons Succeed},
+  journal = ieeetcas,
+  year    = 1989,
+  volume  = 36,
+  pages   = {665--674}
+}
+
+@article{Brady89,
+  author  = {M. L. Brady and R. Raghavan and J. Slawny},
+  title   = {Back-Propagation fails to Separate Where
+             Perceptrons Succeed},
+  journal = {IEEE Transactions on Circuits and Systems},
+  year    = 1989,
+  volume  = 36,
+  number  = 5,
+  pages   = {665--674}
+}
+
+@inproceedings{Bramson90,
+  author    = {M. J. Bramson and R. G. Hoptroff},
+  title     = {Forecasting the Economic Cycle:
+               a Neural Network Approach},
+  booktitle = {Workshop on Neural Networks for
+               Statistical and Economic Data},
+  year      = 1990,
+  address   = {Dublin}
+}
+
+@inproceedings{Brand2003,
+  author    = {M. Brand},
+  title     = {Charting a manifold},
+  booktitle = NIPS15,
+  editor    = NIPS15ed,
+  year      = 2003,
+  pages     = {961--968},
+  publisher = {{MIT} Press}
+}
+
+@article{Brand99,
+  author  = {Matthew Brand},
+  title   = {Structure Learning in Conditional Probability
+             Models via an Entropic Prior and Parameter Extinction},
+  journal = {Neural Computation},
+  year    = 1999,
+  volume  = 11,
+  number  = 5,
+  pages   = {1155--1182}
+}
+
+@inproceedings{Brandt88,
+  author    = {R. D. Brandt and Y. Wang and
+               A. J. Laub and S. K. Mitra},
+  title     = {Alternative Networks for Solving the Travelling
+               Salesman Problem and the List-Matching Problem},
+  booktitle = icnn,
+  year      = 1988,
+  volume    = 2,
+  pages     = {333--340},
+  publisher = {IEEE, New York},
+  address   = {San Diego 1988}
+}
+
+@inproceedings{BreglerC1994,
+  author    = {Christoph Bregler and Stephen M. Omohundro},
+  title     = {Surface Learning with Applications to Lipreading},
+  booktitle = NIPS6,
+  editor    = NIPS6ed,
+  year      = 1994,
+  pages     = {43--50},
+  publisher = {Morgan Kaufmann Publishers, Inc.}
+}
+
+
+@article{Breiman-96,
+  author  = {L. Breiman},
+  title   = {Heuristics of instability and stabilization
+             in model selection},
+  journal = {The Annals of Statistics},
+  year    = 1996,
+  volume  = 24,
+  number  = 6,
+  pages   = {2350--2383}
+}
+
+@article{breiman-stability-96,
+  author  = {L. Breiman},
+  title   = {Heuristics of Instability and Stabilization
+             in Model Selection},
+  journal = {Annals of Statistics},
+  year    = 1996,
+  volume  = 24,
+  number  = 6,
+  pages   = {2350--2383}
+}
+
+@article{Breiman01,
+  author  = {Leo Breiman},
+  title   = {Random Forests},
+  journal = {Machine Learning},
+  year    = 2001,
+  volume  = 45,
+  number  = 1,
+  pages   = {5--32}
+}
+
+@book{Breiman84,
+  author    = {L. Breiman and J. H. Friedman and
+               R. A. Olshen and C. J. Stone},
+  title     = {Classification and Regression Trees},
+  year      = 1984,
+  publisher = {Wadsworth International Group},
+  address   = {Belmont, CA}
+}
+
+@TechReport{Breiman96,
+  author =       "L. Breiman",
+  title =        "Bias, Variance, and Arcing Classifiers",
+  number =       "460",
+  institution =  "Statistics Department, University of California",
+  address =      "Berkeley, CA 94720",
+  month =        apr,
+  year =         "1996",
+}
+
+@incollection{Bridle+Cox91,
+  author    = {J. S. Bridle and S. J. Cox},
+  title     = {{RECNORM}: simultaneous normalisation and classification
+               applied to speech recognition},
+  booktitle = NIPS3,
+  editor    = NIPS3ed,
+  year      = 1991,
+  pages     = {234--240},
+  publisher = {Morgan Kaufmann}
+}
+
+@InCollection{Bridle89,
+  author =       "J. Bridle",
+  editor =       "F. Fogelman-Souli{\'e} and J. H{\'e}rault",
+  booktitle =    "Neuro-computing: Algorithms, Architectures, and
+                 Applications",
+  title =        "Probabilistic interpretation of feedforward
+                 classification network outputs, with relationships to
+                 statistical pattern recognition",
+  publisher =    "Springer-Verlag",
+  address =      "New York",
+  year =         "1989",
+}
+
+@incollection{Bridle89-nips,
+  author    = {J. S. Bridle},
+  title     = {Training Stochastic Model Recognition Algorithms
+               as Networks can lead to Maximum Mutual
+               Information Estimation of Parameters},
+  booktitle = NIPS2,
+  editor    = NIPS2ed,
+  year      = 1990,
+  pages     = {211--217},
+  publisher = {Morgan Kaufmann}
+}
+
+@article{Bridle90,
+  author  = {J. S. Bridle},
+  title   = {Alphanets: a Recurrent `Neural' Network Architecture with
+             a Hidden {Markov} Model Interpretation},
+  journal = spcomm,
+  year    = 1990,
+  volume  = 9,
+  number  = 1,
+  pages   = {83--92}
+}
+
+@incollection{Bridle90b,
+  author    = {J. S. Bridle},
+  title     = {Training Stochastic Model Recognition Algorithms
+               as Networks can lead to Maximum Mutual
+               Information Estimation of Parameters},
+  booktitle = NIPS2,
+  editor    = NIPS2ed,
+  year      = 1990,
+  pages     = {211--217},
+  publisher = {Morgan Kaufmann}
+}
+
+@incollection{Bromley-siamese93,
+  author    = {J. Bromley and J. Benz and L. Bottou and
+               I. Guyon and L. Jackel and Y. {LeCun} and
+               C. Moore and E. Sackinger and R. Shah},
+  title     = {Signature verification using a siamese
+               time delay neural network},
+  booktitle = {Advances in Pattern Recognition Systems
+               using Neural Network Technologies},
+  year      = 1993,
+  pages     = {669--687},
+  publisher = {World Scientific, Singapore}
+}
+
+@incollection{Bromley93,
+  author    = {J. Bromley and J. Benz and L. Bottou and
+               I. Guyon and L. Jackel and Y. {LeCun} and
+               C. Moore and E. Sackinger and R. Shah},
+  title     = {Signature verification using a siamese
+               time delay neural network},
+  booktitle = {Advances in Pattern Recognition Systems
+               using Neural Network Technologies},
+  year      = 1993,
+  pages     = {669--687},
+  publisher = {Series in Machine Perception and Artificial Intelligence,
+               World Scientific, Singapore}
+}
+
+@article{broomhead-lowe-88,
+  key     = {Broomhead},
+  author  = {D. Broomhead and D. Lowe},
+  title   = {Multivariable functional interpolation
+             and adaptive networks},
+  journal = {Complex Systems},
+  year    = 1988,
+  volume  = 2,
+  pages   = {321--355}
+}
+
+@techreport{Brown-Hinton-PoHMM-2000,
+  author      = {Andrew Brown and Geoffrey Hinton},
+  title       = {Products of Hidden Markov Models},
+  institution = {Gatsby Unit, University College London},
+  number      = {GCNU TR 2000-004},
+  year        = 2000
+}
+
+@book{Brown86,
+  author    = {Lawrence D. Brown},
+  title     = {Fundamentals of Statistical Exponential Families},
+  year      = 1986,
+  volume    = 9,
+  publisher = {Inst. of Math. Statist. Lecture
+               Notes Monograph Series}
+}
+
+@article{Brown92,
+  author  = {P. F. Brown and V. J. Della Pietra and
+             P. V. DeSouza and J. C. Lai and R. L. Mercer},
+  title   = {Class-based {\it n}-gram models of natural language},
+  journal = {Computational Linguistics},
+  year    = 1992,
+  volume  = 18,
+  pages   = {467--479}
+}
+
+@phdthesis{BrownPhD,
+  author = {P. Brown},
+  title  = {The Acoustic-Modeling problem in
+            Automatic Speech Recognition},
+  year   = 1987,
+  school = {Dept. of Computer Science,
+            Carnegie-Mellon University}
+}
+
+@inproceedings{Bruce-94,
+  author    = {Rebecca Bruce and Janyce Wiebe},
+  title     = {A new approach to sense identification},
+  booktitle = {{ARPA} Workshop on Human Language Technology},
+  year      = 1994,
+  address   = {Plainsboro, {NJ}}
+}
+
+@InProceedings{Brugnara92,
+  author =       "F. Brugnara and R. DeMori and D. Giuliani and M.
+                 Omologo",
+  booktitle =    icassp,
+  title =        "A family of parallel hidden Markov models",
+  publisher =    "IEEE",
+  address =      "New York, NY, USA",
+  pages =        "377--380",
+  year =         "1992",
+}
+
+@Article{Brunak89,
+  author =       "S. Brunak and B. Lautrup",
+  title =        "Liniedeling med et Neuralt Netv{\ae}rk",
+  journal =      SAML,
+  volume =       "14",
+  pages =        "55--74",
+  year =         "1989",
+}
+
+@book{Brunak90,
+  author    = {S. Brunak and B. Lautrup},
+  title     = {Neural Networks: Computers with Intuition},
+  year      = 1990,
+  publisher = {World Scientific},
+  address   = {Singapore}
+}
+
+@article{Brunak91,
+  author  = {S. Brunak and J. Engelbrecht and S. Knudsen},
+  title   = {Prediction of human {mRNA} donor and acceptor
+             sites from the {DNA} sequence},
+  journal = {J. Molec. Biol.},
+  year    = 1991,
+  volume  = 220,
+  pages   = {49--65}
+}
+
+@book{Bryson69,
+  author    = {A. E. Bryson and Y.-C. Ho},
+  title     = {Applied Optimal Control},
+  year      = 1969,
+  publisher = {Blaisdell},
+  address   = {New York}
+}
+
+@Article{BT-the-fitting-1974,
+  author =       "A. E. Beaton and J. W. Tukey",
+  title =        "The fitting of power series, meaning polynomials,
+                 illustrated on band-spectroscopic data",
+  journal =      "Technometrics",
+  volume =       "16",
+  pages =        "147--185",
+  year =         "1974",
+}
+
+@article{Buia-Tiesinga-2006,
+  author  = "Calin Buia and Paul Tiesinga",
+  title   = "Attentional modulation of firing rate and synchrony in a model cortical network",
+  journal = "J. Computational Neuroscience",
+  year    = "2006",
+  volume  = "20",
+  pages   = "247--264"
+}
+
+@TechReport{buhlmann97,
+  author =       "P. Buhlmann and A. J. Wyner",
+  title =        "Variable Length Markov Chains",
+  number =       "479",
+  institution =  "Statistics Department, University of California,
+                 Berkeley",
+  month =        jan,
+  year =         "1997",
+}
+
+@article{Buhmann87,
+  author  = {J. Buhmann and K. Schulten},
+  title   = {Noise-Driven Temporal Association in Neural Networks},
+  journal = eul,
+  year    = 1987,
+  volume  = 4,
+  pages   = {1205--1209}
+}
+
+@inproceedings{Buhmann88,
+  author    = {J. Buhmann and K. Schulten},
+  title     = {Storing Sequences of Biased Patterns in
+               Neural Networks with Stochastic Dynamics},
+  booktitle = {Neural Computers},
+  editor    = {R. Eckmiller and Ch. von der Malsburg},
+  year      = 1988,
+  pages     = {231--242},
+  publisher = {Springer-Verlag, Berlin},
+  address   = {Neuss 1987}
+}
+
+@article{Buntine94,
+  author  = {W. Buntine},
+  title   = {Operations for Learning with Graphical Models},
+  journal = {Journal of Artificial Intelligence Research},
+  year    = 1994,
+  volume  = 2,
+  pages   = {159--225}
+}
+
+@inproceedings{Burges92,
+  author    = {C. Burges and O. Matan and Y. {LeCun} and J. Denker
+               and L. Jackel and C. Stenard and C. Nohl and J. Ben},
+  title     = {Shortest Path Segmentation: {A} Method for Training
+               a Neural Network to Recognize character Strings},
+  booktitle = ijcnn,
+  year      = 1992,
+  volume    = 3,
+  pages     = {165--172},
+  address   = {Baltimore}
+}
+
+@Article{Burges93,
+  author =       "C. J. C. Burges and J. I. Ben and J. S. Denker and Y.
+                 {LeCun} and C. R. Nohl",
+  title =        "Off Line Recognition of Handwritten Postal Words Using
+                 Neural Networks",
+  journal =      "International Journal of Pattern Recognition and
+                 Artificial Intelligence",
+  volume =       "7",
+  number =       "4",
+  pages =        "689--704",
+  year =         "1993",
+}
+
+@Article{burges98,
+  author =       "C. J. C. Burges",
+  title =        "A Tutorial on {Support} {Vector} {Machines} for
+                 Pattern Recognition",
+  journal =      "Data Mining and Knowledge Discovery",
+  volume =       "2",
+  number =       "2",
+  pages =        "121--167",
+  year =         "1998",
+}
+
+@incollection{Burges99Geometry,
+  author    = {C. J. C. Burges},
+  title     = {Geometry and invariance in kernel based methods},
+  booktitle = {Advances in Kernel Methods ---
+               Support Vector Learning},
+  editor    = {B. {Sch\"olkopf} and C. J. C. Burges and A. J. Smola},
+  year      = 1999,
+  pages     = {89--116},
+  publisher = {MIT Press},
+  address   = {Cambridge, MA}
+}
+
+@article{Burr83,
+  author  = {D. J. Burr},
+  title   = {Designing a handwriting reader},
+  journal = ieeetpami,
+  year    = 1983,
+  month   = sep,
+  volume  = 5,
+  number  = 5,
+  pages   = {554--559}
+}
+
+@inproceedings{Burr88,
+  author    = {D. J. Burr},
+  title     = {An Improved Elastic Net Method for
+               the Travelling Salesman Problem},
+  booktitle = icnn,
+  year      = 1988,
+  volume    = 1,
+  pages     = {69--76},
+  publisher = {IEEE, New York},
+  address   = {San Diego 1988}
+}
+
+@article{Burrows94,
+  author  = {J. H. Burrows and J. Peck},
+  title   = {On-Line Condition Monitoring of Rotating
+             Equipment Using Neural Networks},
+  journal = {ISA Transactions},
+  year    = 1994,
+  volume  = 33,
+  pages   = {159--164}
+}
+
+@inproceedings{Burrows95,
+  author    = {J. H. Burrows and R. Doucet},
+  title     = {Machine Condition Monitoring Using Artificial
+               Neural Networks to Process Vibration Data
+               Obtained from Maintenance Monitoring Equipment},
+  booktitle = {Proceedings of COMADEM'95},
+  year      = 1995,
+  address   = {Kingston, Ontario, Canada}
+}
+
+@Article{Byrne87,
+  author =       "J. H. Byrne",
+  title =        "Cellular analysis of associative learning",
+  journal =      "Physiological Reviews",
+  volume =       "67",
+  pages =        "329--439",
+  year =         "1987",
+}
+
+@InCollection{Byrne89,
+  author =       "J. H. Byrne and K. J. Gingrich and D. A. Baxter",
+  editor =       "R. D. Hawkins and G. H. Bower",
+  booktitle =    "Computational Models of Learning in Simple Neural
+                 Systems",
+  title =        "Computational capabilities of single neurons:
+                 relationship to simple forms of associative and
+                 nonassociative learning in {\it Aplysia}",
+  publisher =    "Academic Press",
+  pages =        "31--63",
+  year =         "1989",
+}
+
+@inproceedings{Cacciatore-nips94,
+  author    = {T. W. Cacciatore and Steven J. Nowlan},
+  title     = {Mixtures of Controllers for Jump
+               Linear and Non-linear Plants},
+  booktitle = NIPS6,
+  editor    = NIPS6ed,
+  year      = 1994,
+  publisher = {Morgan Kaufmann},
+  address   = {San Mateo, CA}
+}
+
+@article{Cai94,
+  author  = {J. Cai},
+  title   = {A Markov model of unconditional variance in {ARCH}},
+  journal = {Journal of Business and Economic Statistics},
+  year    = 1994
+}
+
+@inproceedings{Cai+al-2007,
+  author    = "Cai, Deng and He, Xiaofei and Han, Jiawei",
+  title     = "Semi-supervised Discriminant Analysis",
+  booktitle = ICCV07,
+  year      = "2007",
+  pages     = "1--7"
+}
+
+@article{Caianiello61,
+  author  = {E. R. Caianiello},
+  title   = {Outline of a Theory of Thought and Thinking Machines},
+  journal = jtb,
+  year    = 1961,
+  volume  = 1,
+  pages   = {204--235}
+}
+
+@article{Campbell+Kulikowski-1966,
+  author  = "F. W. Campbell and J. J. Kulikowski",
+  title   = "Orientational selectivity of the human visual system",
+  journal = "Journal of Physiology",
+  address = {London},
+  year    = "1966",
+  pages   = {437--445}
+}
+
+@article{Campbell+al-1969,
+    title = {The Spatial Selectivity of the Visual Cells of the Cat},
+    author = {F. W. Campbell and G. F. Cooper and C. Enroth-Cugell},
+    journal = {Journal of Physiology},
+    address = "London",
+    pages = {223--235},
+    volume = {203},
+    year = {1969},
+    biburl = {http://www.bibsonomy.org/bibtex/2cfcc4bc8437b72761251fb2b9e7eb106/schaul},
+    description = {idsia},
+}
+
+@InCollection{CandelaJ2006,
+  author =       "J. Qui{\~n}onero-Candela and C. E. Rasmussen and F. Sinz
+                 and O. Bousquet and B. Sch{\"o}lkopf",
+  booktitle =    "Machine learning challenges: Evaluating predictive
+                 uncertainty, visual object classification, and
+                 recognising textual entailment",
+  title =        "Evaluating Predictive Uncertainty Challenge",
+  publisher =    "Springer",
+  address =      "Heidelberg, Germany",
+  pages =        "1--27",
+  month =        apr,
+  year =         "2006",
+  series =       "Lecture Notes in Computer Science: 3944",
+  URL =          "http://www.springerlink.com/(yxluatzjo3gnpl45323wjs45)/app/home/contribution.asp?referrer=parent&backto=issue,1,25;journal,2,3638;linkingpublicationresults,1:105633,1",
+  abstract =     "This Chapter presents the PASCAL1 Evaluating
+                 Predictive Uncertainty Challenge, introduces the
+                 contributed Chapters by the participants who obtained
+                 outstanding results, and provides a discussion with
+                 some lessons to be learnt. The Challenge was set up to
+                 evaluate the ability of Machine Learning algorithms to
+                 provide good ``probabilistic predictions'', rather than
+                 just the usual ``point predictions'' with no measure of
+                 uncertainty, in regression and classification problems.
+                 Participants had to compete on a number of regression
+                 and classification tasks, and were evaluated by both
+                 traditional losses that only take into account point
+                 predictions and losses we proposed that evaluate the
+                 quality of the probabilistic predictions.",
+  OPTeditor =    "Qui{\~n}onero-Candela, J. and Dagan, I. and Magnini, B.
+                 and d'Alch{\'e}-Buc, F.",
+}
+
+@article{candeswakin08,
+author = "Cand{\`e}s, E. J. and Wakin, M. B.",
+title = "An introduction to compressive sampling",
+journal = "IEEE Signal Processing Magazine",
+volume = 25, number = 2, pages = "21--30",
+year = 2008,
+}
+
+@article{Candes+Tao-2005,
+ author = {E. J. Cand{\`e}s and T. Tao},
+ title = {Decoding by linear programming},
+ journal = {{IEEE} Transactions on Information Theory},
+ volume = 51,
+ number = 12,
+ pages = {4203--4215},
+ year = 2005,
+}
+
+@article{Canning88,
+  author  = {A. Canning and E. Gardner},
+  title   = {Partially Connected Models of Neural Networks},
+  journal = jpa,
+  year    = 1988,
+  volume  = 21,
+  pages   = {3275--3284}
+}
+
+@article{carandini:1994,
+    author = {Matteo Carandini and David J. Heeger},
+    title = {Summation and Division by Neurons in Primate Visual Cortex},
+    journal = {Science},
+    volume={264},
+    number={5163},
+    month = may,
+    year = {1994},
+    pages = {1333--1336},
+}
+
+@inproceedings{Cardie-1993,
+    author = "Claire Cardie",
+    title = "Using Decision Trees to Improve Case-Based Learning",
+    booktitle = "Proceedings of the Tenth International Conference on Machine Learning",
+    publisher = "Morgan Kaufmann",
+    pages = "25--32",
+    year = "1993",
+    url = "http://citeseer.ist.psu.edu/cardie93using.html"
+}
+
+@article{Carpenter87a,
+  author  = {G. A. Carpenter and S. Grossberg},
+  title   = {A Massively Parallel Architecture for a Self-Organizing
+             Neural Pattern Recognition Machine},
+  journal = cvgip,
+  year    = 1987,
+  volume  = 37,
+  pages   = {54--115}
+}
+
+@article{Carpenter87b,
+  author  = {G. A. Carpenter and S. Grossberg},
+  title   = {{ART2}: Self-Organization of Stable Category Recognition
+             Codes for Analog Input Patterns},
+  journal = applopt,
+  year    = 1987,
+  volume  = 26,
+  pages   = {4919--4930}
+}
+
+@article{Carpenter88,
+  author  = {G. A. Carpenter and S. Grossberg},
+  title   = {The {ART} of Adaptive Pattern Recognition
+             by a Self-Organizing Neural Network},
+  journal = computer,
+  year    = 1988,
+  month   = mar,
+  pages   = {77--88}
+}
+
+@inproceedings{Carrasco94,
+  author    = {R. C. Carrasco and J. Oncina},
+  title     = {Learning regular grammars by means
+               of a state merging method},
+  booktitle = {Grammatical Inference and Applications
+               Proc. of the 2nd International Colloquium
+               on Grammatical Inference ICGI94},
+  year      = 1994,
+  month     = sep,
+  publisher = {Lecture Notes in Artificial Intelligence 862},
+  address   = {Alicante (Spain)}
+}
+
+@article{Carter94,
+  author  = {C. K. Carter and R. Kohn},
+  title   = {On Gibbs sampling for state space models},
+  journal = {Biometrika},
+  year    = 1994,
+  volume  = 81,
+  pages   = {541--553}
+}
+
+@inproceedings{Caruana-2001,
+  author    = {Rich Caruana},
+  title     = {A Non-Parametric {EM}-Style Algorithm
+               for Imputing Missing Values},
+  booktitle = aistats01,
+  year      = 2001,
+  publisher = {Society for Artificial Intelligence and Statistics}
+}
+
+@inproceedings{caruana06:empirical,
+  author    = {R. Caruana and A. Niculescu-Mizil},
+  title     = {An Empirical Comparison of
+               Supervised Learning Algorithms},
+  booktitle = ICML06,
+  editor    = ICML06ed,
+  publisher = ICML06publ,
+  year      = 2006
+}
+
+@inproceedings{caruana93a,
+  author    = {Rich Caruana},
+  title     = {Multitask Connectionist Learning},
+  booktitle = {Proceedings of the 1993 Connectionist
+               Models Summer School},
+  year      = 1993,
+  pages     = {372--379}
+}
+
+@inproceedings{caruana93a-small,
+  author    = {Rich Caruana},
+  title     = {Multitask Connectionist Learning},
+  booktitle = {Proceedings of the 1993 Connectionist
+               Models Summer School},
+  year      = 1993,
+  pages     = {372--379}
+}
+  %url =          "http://citeseer.ist.psu.edu/32984.html",
+
+@inproceedings{caruana95,
+  author    = {Rich Caruana},
+  title     = {Learning Many Related Tasks at the
+               Same Time With Backpropagation},
+  booktitle = NIPS7,
+  editor    = NIPS7ed,
+  year      = 1995,
+  pages     = {657--664},
+  publisher = {MIT Press},
+  address   = {Cambridge, MA}
+}
+
+@InProceedings{caruana96,
+  author =       "Rich Caruana and Shumeet Baluja and Tom Mitchell",
+  editor =       NIPS8ed,
+  booktitle =    NIPS8,
+  title =        "Using the Future to ``Sort Out'' the Present: Rankprop
+                 and Multitask Learning for Medical Risk Evaluation",
+  publisher =    "MIT Press",
+  address =      "Cambridge, MA",
+  pages =        "959--965",
+  year =         "1996",
+}
+
+@inproceedings{caruana96c,
+  author    = {Rich Caruana},
+  title     = {Algorithms and Applications for Multitask Learning},
+  booktitle = {International Conference on Machine Learning},
+  year      = 1996,
+  pages     = {87--95}
+}
+
+@article{caruana97a,
+  author    = {Rich Caruana},
+  title     = {Multitask Learning},
+  journal   = {Machine Learning},
+  year      = 1997,
+  volume    = 28,
+  number    = 1,
+  pages     = {41--75},
+  publisher = {Kluwer Academic Publishers},
+  address   = {Hingham, MA, USA}
+}
+
+@article{Casdagli89,
+  author  = {M. Casdagli},
+  title   = {Nonlinear Prediction of Chaotic Time Series},
+  journal = physicaD,
+  year    = 1989,
+  volume  = 35,
+  pages   = {335--356}
+}
+
+@book{Casella+Berger-2001,
+  author    = "George Casella and Roger Berger",
+  title     = "Statistical Inference",
+  year      = "2001",
+  publisher = "Duxbury Press"
+}
+
+
+@article{Cashman+Pouliot90,
+  author  = {N. R. Cashman and Y. Pouliot},
+  title   = {{EBV} {Ig}-like domains},
+  journal = {Nature},
+  year    = 1990,
+  volume  = 343,
+  pages   = {319}
+}
+
+@ARTICLE{CataltepeZ1999,
+    author = {Zehra Cataltepe and Yaser S. Abu-Mostafa and Malik Magdon-Ismail},
+    title = {No free lunch for early stopping},
+    journal = {Neural Computation},
+    year = {1999},
+    volume = {11},
+    pages = {995--1009}
+}
+
+@inproceedings{Cater87,
+  author    = {J. P. Cater},
+  title     = {Successfully Using Peak Learning Rates of 10
+               (and Greater) in Back-Propagation Networks
+               with the Heuristic Learning Algorithm},
+  booktitle = icnn,
+  editor    = {M. Caudill and C. Butler},
+  year      = 1987,
+  volume    = 2,
+  pages     = {645--651},
+  publisher = {IEEE, New York},
+  address   = {San Diego 1987}
+}
+
+@book{Caudill89,
+  author    = {M. Caudill},
+  title     = {Neural Networks Primer},
+  year      = 1989,
+  publisher = {Miller Freeman},
+  address   = {San Francisco}
+}
+
+@manual{CC01a,
+  author = {Chih-Chung Chang and Chih-Jen Lin},
+  title  = {{LIBSVM}: a library for support vector machines},
+  year   = 2001,
+  note   = {Software available at
+            \verb+http://www.csie.ntu.edu.tw/~cjlin/libsvm+}
+}
+
+@article{cemgil+kappen+barber-2006,
+  author  = {A. T. Cemgil and H. J. Kappen and D. Barber},
+  title   = {A Generative Model for Music Transcription},
+  journal = {IEEE Transactions on Audio,
+             Speech and Language Processing},
+  year    = 2006,
+  volume  = 14,
+  number  = 2,
+  pages   = {679--694}
+}
+
+@inproceedings{Cevikalp+al-2008,
+    title = {Semi-Supervised Dimensionality Reduction Using Pairwise Equivalence Constraints},
+    author = {Hakan Cevikalp and Jakob J. Verbeek and Fr{\'e}d{\'e}ric Jurie and Alexander Kl{\"a}ser},
+    booktitle = {VISAPP},
+    editor = {Alpesh Ranchordas and Helder Ara{\'u}jo},
+    pages = {489--496},
+    publisher = {INSTICC - Institute for Systems and Technologies of Information, Control and Communication},
+    url = {http://dblp.uni-trier.de/db/conf/visapp/visapp2008-1.html#CevikalpVJK08},
+    year = {2008},
+    biburl = {http://www.bibsonomy.org/bibtex/21afc498c02543e97ff5bd4f6b107e16e/dblp},
+    description = {dblp},
+    isbn = {978-989-8111-21-0},
+    date = {2008-04-07},
+    keywords = {dblp}
+}
+
+@inproceedings{CGY96,
+  author    = {Ingemar J. Cox and Joumana Ghosn
+               and Peter N. Yianilos},
+  title     = {Feature-Based Face Recognition
+               Using Mixture-Distance},
+  booktitle = cvpr96,
+  year      = 1996,
+  pages     = {209--216}
+}
+
+@Article{CHAID-BVS-91,
+  author =       "D. Biggs and B. de Ville and E. Suen",
+  title =        "A method of choosing multiway partitions for
+                 classification and decision trees",
+  journal =      "Journal of Applied Statistics",
+  volume =       "18",
+  number =       "1",
+  pages =        "49--62",
+  year =         "1991",
+}
+
+@InCollection{CHAID-HK-82,
+  author =       "D. M. Hawkins and G. V. Kass",
+  booktitle =    "Topics in Applied Multivariate Analysis",
+  title =        "Automatic Interaction Detection",
+  publisher =    "Cambridge, Cambridge University Press",
+  pages =        "269--302",
+  year =         "1982",
+}
+
+@article{CHAID-original-80,
+  author  = {G. V. Kass},
+  title   = {An Exploratory Technique for Investigating
+             Large Quantities of Categorical Data},
+  journal = {Applied Statistics},
+  year    = 1980,
+  volume  = 29,
+  number  = 2,
+  pages   = {119--127}
+}
+
+@inproceedings{Chapados2002,
+  author    = {N. Chapados and Y. Bengio and P. Vincent and
+               J. Ghosn and C. Dugas and I. Takeuchi and L. Meng},
+  title     = {Estimating Car Insurance Premia: a Case
+               Study in High-Dimensional Data Inference},
+  booktitle = NIPS14,
+  editor    = NIPS14ed,
+  year      = 2002,
+  pages     = {1369--1376},
+  publisher = {{MIT} Press},
+  address   = {Cambridge, MA}
+}
+
+@inproceedings{Chapados2002-short,
+  author    = {N. Chapados and Y. Bengio and P. Vincent and
+               J. Ghosn and C. Dugas and I. Takeuchi and L. Meng},
+  title     = {Estimating Car Insurance Premia: a Case
+               Study in High-Dimensional Data Inference},
+  booktitle = NIPS14,
+  year      = 2002,
+  publisher = {{MIT} Press}
+}
+
+@inproceedings{Chapelle+al-2003,
+  author    = {O. Chapelle and J. Weston and B. Sch{\"o}lkopf},
+  title     = {Cluster kernels for semi-supervised learning},
+  booktitle = NIPS15,
+  editor    = NIPS15ed,
+  year      = 2003,
+  pages     = "585--592",
+  publisher = {{MIT} Press},
+  address   = {Cambridge, MA}
+}
+
+@inproceedings{Chapelle-nips2003,
+  author    = {O. Chapelle and B. Sch{\"o}lkopf and J. Weston},
+  title     = {Semi-supervised learning through principal directions
+               estimation},
+  booktitle = NIPS15,
+  editor    = NIPS15ed,
+  publisher = {{MIT} Press},
+  year      = {2003},
+}
+
+@InProceedings{Chapelle2001,
+  author =       "Olivier Chapelle and Jason Weston and L{\'e}on Bottou
+                 and Vladimir Vapnik",
+  editor =       NIPS13ed,
+  booktitle =    NIPS13,
+  title =        "Vicinal Risk Minimization",
+  pages =        "416--422",
+  year =         "2001",
+}
+
+@InProceedings{chapelle2001iin,
+  author =       "O. Chapelle and B. Sch{\"o}lkopf",
+  title =        "Incorporating invariances in nonlinear support vector
+                 machines",
+  editor =       NIPS14ed,
+  booktitle =    NIPS14,
+  volume =       "14",
+  year =         "2001",
+}
+
+@Article{Chapelle99,
+  author =       "O. Chapelle and P. Haffner and V. Vapnik",
+  title =        "{SVM}s for Histogram-Based Image Classification",
+  journal =      "IEEE Transactions on Neural Networks",
+  volume =       "10",
+  number =       "5",
+  pages =        "1055--1064",
+  year =         "1999",
+  note =         "special issue on Support Vectors",
+}
+
+@Article{ChapelleVapnikBengio2001,
+  author =       "O. Chapelle and V. Vapnik and Y. Bengio",
+  title =        "Model Selection for Small-Sample Regression",
+  journal =      "Machine Learning",
+  volume =       "48",
+  number =       "1",
+  pages =        "9--23",
+  year =         "2002",
+}
+
+@Article{Willski-2002,
+  author =       "A. S. Willsky",
+  title =        "Multiresolution {Markov} models for signal and image processing",
+  journal =      "Proceedings of the IEEE",
+  volume =       "90",
+  number =       "8",
+  pages =        "1396--1458",
+  year =         "2002",
+}
+
+@Article{Felzenszwalb+Huttenlocher-2004,
+  author =       "Pedro F. Felzenszwalb and Daniel P. Huttenlocher",
+  title =        "Efficient Graph-Based Image Segmentation",
+  journal =      "Intl. Journal of Computer Vision",
+  volume =       "59",
+  number =       "2",
+  pages =        "167--181",
+  year =         "2004",
+}
+
+@inproceedings{Lombaert-2005,
+  author =       "Herve Lombaert and Yiyong Sun and Leo Grady and Chenyang Xu",
+  title =        "A Multilevel Banded Graph Cuts Method for Fast Image Segmentation",
+  booktitle =    ICCV05,
+  volume =       "1",
+  pages =        "259--265",
+  year =         "2005",
+}
+
+@Article{Boykov+Kolmogorov-2004,
+  author =       "Y. Boykov and V. Kolmogorov",
+  title =        "An experimental comparison of min-cut/max-flow algorithms for energy minimization in vision",
+  journal =      ieeetpami,
+  volume =       "26",
+  number =       "9",
+  pages =        "1124--1137",
+  year =         "2004",
+}
+
+@inproceedings{chapelleetal06,
+  author    = {Chapelle, O. and Chi, M. and Zien, A.},
+  title     = {A continuation method for semi-supervised {SVMs}},
+  booktitle = ICML06,
+  editor    = ICML06ed,
+  publisher = ICML06publ,
+  year      = {2006},
+}
+
+@inproceedings{ChapelleO2005,
+   author = {Olivier Chapelle and Alexander Zien},
+   title = {Semi-Supervised Classification by Low Density Separation},
+   year = {2005},
+   pages = {57--64},
+   month = jan,
+   editor = {Cowell, R. and Ghahramani, Z.},
+   booktitle = {Proceedings of the Tenth International Workshop on Artificial Intelligence and Statistics ({AISTATS} 2005)},
+   location = {Barbados},
+}
+   %URL = {http://www.gatsby.ucl.ac.uk/aistats/aistats2005_eproc.pdf}
+
+@book{Chapelle-2006,
+ editor = {Olivier Chapelle and Bernhard Sch{\"o}lkopf and Alexander Zien},
+ title =    "Semi-Supervised Learning",
+ publisher =    "{MIT} Press",
+ year =         "2006",
+}
+
+@TechReport{Charniak99,
+  author =       "Eugene Charniak",
+  title =        "A Maximum-Entropy-Inspired Parser",
+  number =       "CS-99-12",
+  institution =  "Brown University",
+  year =         "1999",
+  URL =          "http://citeseer.nj.nec.com/charniak99maximumentropyinspired.html",
+}
+
+@misc{Chatpatanasiri-2008,
+  author = "Ratthachat Chatpatanasiri",
+  title  = "Spectral Methods for Linear and Non-Linear Semi-Supervised Dimensionality Reduction",
+  note   = "Submitted for publication",
+  url    = "http://www.citebase.org/abstract?id=oai:arXiv.org:0804.0924",
+  year   = "2008",
+}
+
+@InProceedings{Chauvin89,
+  author =       "Y. Chauvin",
+  editor =       NIPS1ed,
+  booktitle =    NIPS1,
+  title =        "A Back-Propagation Algorithm with Optimal Use of
+                 Hidden Units",
+  publisher =    "Morgan Kaufmann",
+  address =      "San Mateo, CA",
+  pages =        "519--526",
+  year =         "1989",
+}
+
+@InProceedings{Chauvin90,
+  author =       "Y. Chauvin",
+  editor =       NIPS2ed,
+  booktitle =    NIPS2,
+  title =        "Dynamic behavior of constrained back-propagation
+                 networks",
+  publisher =    "Morgan Kaufmann",
+  address =      "San Mateo, CA",
+  pages =        "642--649",
+  year =         "1990",
+}
+
+@inproceedings{Cheeseman88,
+  author    = {P. Cheeseman and J. Kelly and M. Self and J. Stutz and
+               W. Taylor and D. Freeman},
+  title     = {{AutoClass}: {A} {Bayesian} Classification System},
+  booktitle = {Proceedings of the Fifth International Conference on
+               Machine Learning},
+  address   = {The University of Michigan, Ann Arbor},
+  month     = jun,
+  year      = {1988},
+}
+
+@Article{Chelba-Jelinek-2000,
+  author =       "Ciprian Chelba and Frederick Jelinek",
+  title =        "Structured Language Modeling",
+  journal =      "Computer Speech and Language",
+  volume =       "14",
+  number =       "4",
+  pages =        "282--332",
+  year =         "2000",
+}
+
+@Article{Chen+Goodman99,
+  author =       "Stanley F. Chen and Joshua T. Goodman",
+  title =        "An Empirical Study of Smoothing Techniques for
+                 Language Modeling",
+  journal =      "Computer Speech and Language",
+  volume =       "13",
+  number =       "4",
+  pages =        "359--393",
+  year =         "1999",
+}
+
+@article{Chen+Murray2003,
+  author  = {Hsin Chen and Alan F. Murray},
+  title   = {A Continuous Restricted {Boltzmann} Machine with an
+             Implementable Training Algorithm},
+  journal = {IEE Proceedings of Vision, Image and Signal
+             Processing},
+  volume  = {150},
+  number  = {3},
+  pages   = {153--158},
+  year    = {2003},
+}
+
+@phdthesis{chen95basispursuit,
+  author = {S. Chen},
+  title  = {Basis Pursuit},
+  school = {Department of Statistics, Stanford University},
+  year   = {1995},
+}
+
+@TechReport{Chen98,
+  author =       "Stanley F. Chen and Joshua T. Goodman",
+  title =        "An Empirical Study of Smoothing Techniques for
+                 Language Modeling",
+  number =       "TR-10-98",
+  institution =  "Computer Science Group, Harvard University",
+  year =         "1998",
+}
+
+@Article{ChenS2000,
+  author =       "Stanley F. Chen and Ronald Rosenfeld",
+  title =        "A Survey of Smoothing Techniques for {ME} Models",
+  journal =      "IEEE Transactions on Speech and Audio Processing",
+  volume =       "8",
+  number =       "1",
+  month =        jan,
+  year =         "2000",
+}
+
+@techreport{Chen+Kotani-2005,
+  author =      "Chen, Fan and Kotani, Kazunori",
+  title =       "Facial Expression Recognition by Supervised {ICA} with Selective Prior",
+  ISSN =        "09135685",
+  institution = "The Institute of Electronics, Information and Communication Engineers",
+  year =        "2005",
+  number =      "462",
+  pages =       "27--32",
+  URL =         "http://ci.nii.ac.jp/naid/110004064718/en/",
+}
+
+@Article{ChenX1989,
+  author={Chen, X. R. and Krishnaiah, P. R. and Liang, W. W.},
+  title={Estimation of multivariate binary density using orthogonal functions},
+  journal={Journal of Multivariate Analysis},
+  year=1989,
+  volume={31},
+  number={2},
+  pages={178--186},
+  month=nov,
+}
+
+@inproceedings{Chigier88,
+  author    = {B. Chigier and R. A. Brennan},
+  title     = {Broad Class Network Generation Using a Combination of
+               Rules and Statistics for Speaker Independent Continuous
+               Speech},
+  booktitle = icassp,
+  address   = {New York, NY},
+  pages     = {449--452},
+  year      = {1988},
+}
+
+@incollection{Chipman-NIPS2006,
+  author    = {H. A. Chipman and E. I. George and R. E. McCulloch},
+  title     = {Bayesian Ensemble Learning},
+  booktitle = NIPS19,
+  editor    = NIPS19ed,
+  publisher = {MIT Press},
+  address   = {Cambridge, MA},
+  year      = {2007},
+}
+
+@article{Chipman-2008,
+  author =       "H. A. Chipman and E. I. George and R. E. McCulloch",
+  title =        "Bayesian Ensemble Learning",
+  journal =      "Annals of Applied Statistics",
+  year =         "2008",
+  note =         "under revision",
+}
+
+@inproceedings{ChopraS2005,
+  author    = {Sumit Chopra and Raia Hadsell and Yann {LeCun}},
+  title     = {Learning a Similarity Metric Discriminatively, with
+               Application to Face Verification},
+  booktitle = cvpr05,
+  publisher = {IEEE Press},
+  year      = {2005},
+  original  = {orig/chopra-05.ps.gz},
+}
+
+@inproceedings{Choueka-1998,
+  author    = {Y. Choueka},
+  title     = {Looking for needles in a haystack},
+  booktitle = {RIAO 88, User-oriented Content-based Text and Image
+               Handling},
+  volume    = {1},
+  pages     = {609--623},
+  year      = {1988},
+}
+
+@article{Chow62,
+  author  = {C. K. Chow},
+  title   = {A recognition method using neighbor dependence},
+  journal = {IRE Trans. Elec. Comp.},
+  volume  = {EC-11},
+  pages   = {683--690},
+  month   = oct,
+  year    = {1962},
+}
+
+@inproceedings{Chrisman92AAAI,
+  author    = {Lonnie Chrisman},
+  title     = {Reinforcement Learning with Perceptual Aliasing: The
+               Perceptual Distinctions Approach},
+  booktitle = AAAI-92,
+  pages     = {183--188},
+  year      = {1992},
+}
+
+@inproceedings{Chung+al-1998,
+  author    = {Yi-Ming Chung and William M. Pottenger and Bruce R.
+               Schatz},
+  title     = {Automatic subject indexing using an associative neural
+               network},
+  booktitle = {DL '98: Proceedings of the third ACM conference on
+               Digital libraries},
+  pages     = {59--68},
+  publisher = {ACM Press},
+  address   = {New York, NY, USA},
+  year      = {1998},
+  ISBN      = {0-89791-965-3},
+  location  = {Pittsburgh, Pennsylvania, United States},
+}
+
+@Book{Chung-97,
+  author =       "F. Chung",
+  title =        "Spectral graph theory",
+  series =       "{CBMS} Regional Conference Series",
+  volume =       "92",
+  publisher =    "American Mathematical Society",
+  year =         "1997",
+}
+
+@article{Churchill89,
+  author  = {G. A. Churchill},
+  title   = {A stochastic model for heterogeneous {DNA} sequences},
+  journal = {Bull. Mathematical Biology},
+  volume  = {51},
+  pages   = {79--94},
+  year    = {1989},
+}
+
+@Book{Chvatal83,
+  author =       "V. Chv{\'a}tal",
+  title =        "Linear Programming",
+  publisher =    "W. H. Freeman",
+  address =      "New York",
+  year =         "1983",
+}
+
+@article{Cleeremans89,
+  author  = {A. Cleeremans and D. Servan-Schreiber and J. L.
+             McClelland},
+  title   = {Finite State Automata and Simple Recurrent Networks},
+  journal = nc,
+  volume  = {1},
+  pages   = {372--381},
+  year    = {1989},
+}
+
+@InCollection{Clifford-1990,
+  author = {Peter Clifford},
+  title = {{Markov} Random Fields in Statistics},
+  editor = {Geoffrey Grimmett and Dominic Welsh},
+  booktitle = {Disorder in Physical Systems: A Volume in Honour
+               of John M. Hammersley},
+  pages = {19--32},
+  publisher = {Oxford University Press},
+  year = 1990,
+}
+
+@book{CLM,
+  author    = {J. Y. Campbell and A. W. Lo and A. C. MacKinlay},
+  title     = {The Econometrics of Financial Markets},
+  publisher = {Princeton University Press},
+  address   = {Princeton},
+  year      = {1997},
+}
+
+@Book{CND04,
+  author =       "{Congr{\'e}gation de Notre-Dame}",
+  title =        "La cuisine raisonn{\'e}e",
+  publisher =    "Fides",
+  year =         "2004",
+  ISBN =         "2-7621-2083-7",
+}
+
+@InProceedings{Cloutier96,
+  author =       "J. Cloutier and E. Cosatto and S. Pigeon and F. R.
+                 Boyer and P. Y. Simard",
+  booktitle =    "Fifth International Conference on Microelectronics for
+                 Neural Networks and Fuzzy Systems",
+  title =        "{VIP}: an {FPGA}-based processor for image processing
+                 and neural networks",
+  year =         "1996",
+  note =         "submitted",
+}
+
+@Manual{CMFortran,
+  key =          "TMC",
+  title =        "{CM} Fortran. Programming Guide",
+  organization = "Thinking Machines Corporation",
+  address =      "Cambridge, MA",
+  edition =      "1.1",
+  month =        jan,
+  year =         "1991",
+}
+
+@article{Cohen83,
+  author  = {M. A. Cohen and S. Grossberg},
+  title   = {Absolute Stability of Global Pattern Formation and
+             Parallel Memory Storage by Competitive Neural
+             Networks},
+  journal = ieeesmc,
+  volume  = {13},
+  pages   = {815--826},
+  year    = {1983},
+}
+
+@article{Cohen86,
+  author  = {M. S. Cohen},
+  title   = {Design of a New Medium for Volume Holographic
+             Information Processing},
+  journal = applopt,
+  volume  = {25},
+  pages   = {2228--2294},
+  year    = {1986},
+}
+
+@article{Cohen89,
+  author  = {J. R. Cohen},
+  title   = {Application of an auditory model to speech
+             recognition},
+  journal = {Journal of the Acoustical Society of America},
+  volume  = {85},
+  number  = {6},
+  pages   = {2623--2629},
+  year    = {1989},
+}
+
+@phdthesis{Cohn-PhD,
+  author = {D. Cohn},
+  title  = {Separating Formal Bounds from Practical Performance in
+            Learning Systems},
+  school = {University of Washington},
+  year   = {1992},
+}
+
+@InProceedings{Cohn95,
+  author =       "David Cohn and Zoubin Ghahramani and Michael I.
+                 Jordan",
+  editor =       NIPS7ed,
+  booktitle =    NIPS7,
+  title =        "Active learning with statistical models",
+  publisher =    "{MIT} Press",
+  address =      "Cambridge, MA",
+  year =         "1995",
+  pages =        "705--712",
+}
+
+@InProceedings{Cohn95-small,
+  author =       "David Cohn and Zoubin Ghahramani and Michael I.
+                 Jordan",
+  editor =       NIPS7ed,
+  booktitle =    "Advances in NIPS 7",
+  title =        "Active learning with statistical models",
+  publisher =    "{MIT} Press",
+  address =      "Cambridge, MA",
+  year =         "1995",
+}
+
+@InProceedings{Cohn95-short,
+  author =       "D. Cohn and Z. Ghahramani and M. I. Jordan",
+  booktitle =    "Adv. Neural Inf. Proc. Sys. 7",
+  title =        "Active learning with statistical models",
+  year =         "1995",
+  pages =        "705--712",
+}
+
+@InProceedings{Cole+Hou88,
+  author =       "R. A. Cole and L. Hou",
+  booktitle =    icassp,
+  title =        "Segmentation and Broad Classification of Continuous
+                 Speech",
+  address =      "New York, NY",
+  pages =        "453--456",
+  year =         "1988",
+}
+
+@Book{Cole96,
+  author =       "R. A. Cole and J. Mariani and H. Uszkoreit and A.
+                 Zaenen and V. Zue",
+  title =        "Survey of the State of the Art in Human Language
+                 Technology",
+  publisher =    "Cambridge University Press",
+  URL =          "http://www.cse.ogi.edu/CSLU/HLTsurvey/HLTsurvey.html",
+  year =         "1996",
+}
+
+@techreport{Coleman+Wu-1994,
+  author      = {Thomas F. Coleman and Zhijun Wu},
+  title       = {Parallel continuation-based global optimization for
+                 molecular conformation and protein folding},
+  institution = {Cornell University, Dept. of Computer Science},
+  year        = {1994},
+}
+
+@TechReport{Coleman+Wu-1994-short,
+  author =       "T. F. Coleman and Z. Wu",
+  title =        "Parallel continuation-based global optimization for
+                 molecular conformation and protein folding",
+  institution =  "Cornell University, Dept. of Computer Science",
+  year =         "1994",
+}
+
+@TechReport{Collins89,
+  author =       "E. Collins and S. Ghosh and C. Scofield",
+  title =        "An application of a multiple neural network learning
+                 system to emulation of mortgage underwriting
+                 judgements",
+  institution =  "Nestor Inc.",
+  address =      "Providence, RI",
+  year =         "1989",
+}
+
+@inproceedings{Collins96,
+  author    = {M. Collins},
+  title     = {A new statistical parser based on bigram lexical
+               dependencies},
+  booktitle = {34th Annual Meeting of the {ACL}},
+  pages     = {184--191},
+  year      = {1996},
+}
+
+@inproceedings{Collins97,
+  author    = {M. Collins},
+  title     = {Three generative, lexicalized models for statistical
+               parsing},
+  booktitle = {35th Annual Meeting of the {ACL}},
+  address   = {Madrid, Spain},
+  pages     = {16--23},
+  year      = {1997},
+}
+
+@phdthesis{Collins99,
+  author = {M. Collins},
+  title  = {Head-driven statistical models for natural language
+            parsing},
+  school = {University of Pennsylvania},
+  year   = {1999},
+}
+
+@InProceedings{Collobert-2006,
+  author =       "R. Collobert and F. Sinz and J. Weston and L. Bottou",
+  booktitle =    "Proceedings of the 23rd International Conference on
+                 Machine Learning",
+  title =        "Trading Convexity for Scalability",
+  pages =        "201--208",
+  year =         "2006",
+}
+
+@PhdThesis{Collobert04,
+  author =       "R. Collobert",
+  title =        "Large Scale Machine Learning",
+  school =       "Universit{\'e} de Paris VI, LIP6",
+  year =         "2004",
+}
+
+@article{Collobert2002,
+  author  = {R. Collobert and S. Bengio and Y. Bengio},
+  title   = {Parallel Mixture of {SVM}s for Very Large Scale
+             Problems},
+  journal = {Neural Computation},
+  volume  = {14},
+  number  = {5},
+  pages   = {1105--1114},
+  year    = {2002},
+}
+
+@InProceedings{Collobert2004,
+  author =       "Ronan Collobert and Samy Bengio",
+  booktitle =    ICML04,
+  editor =       ICML04ed,
+  publisher =    ICML04publ,
+  title =        "Links between perceptrons, {MLP}s and {SVM}s",
+  address =      "New York, NY, USA",
+  year =         "2004",
+  location =     "Banff, Alberta, Canada",
+  isbn =         "1-58113-828-5",
+  pages =        "23",
+  doi =          "10.1145/1015330.1015415",
+}
+
+@InProceedings{CollobertR2008,
+  author =       "Ronan Collobert and Jason Weston",
+  booktitle =    ICML08,
+  editor =       ICML08ed,
+  publisher =    ICML08publ,
+  title =        "A Unified Architecture for Natural Language
+                 Processing: Deep Neural Networks with Multitask
+                 Learning",
+  year =         "2008",
+  pages =        "160--167",
+}
+  %url =          "http://www.kyb.tuebingen.mpg.de/bs/people/weston/papers/unified\-nlp.pdf",
+
+@inproceedings{CollobertR2008-small,
+  author    = {R. Collobert and J. Weston},
+  title     = {A Unified Architecture for Natural Language
+               Processing: Deep Neural Networks with Multitask
+               Learning},
+  booktitle = {ICML 2008},
+  year      = {2008},
+}
+
+@InProceedings{CollobertR2008-short,
+  author =       "R. Collobert and J. Weston",
+  booktitle =    "Int. Conf. Mach. Learn. 2008",
+  title =        "A Unified Architecture for Natural Language
+                 Processing: Deep Neural Networks with Multitask
+                 Learning",
+  pages =        "160--167",
+  year =         "2008",
+}
+
+@article{Comon94,
+  author  = {Pierre Comon},
+  title   = {Independent component analysis - a new concept?},
+  journal = {Signal Processing},
+  volume  = {36},
+  pages   = {287--314},
+  year    = {1994},
+}
+
+@inproceedings{ConfAI:Grove:linprog,
+  author    = {Adam J. Grove and Dale Schuurmans},
+  title     = {Boosting in the limit: Maximizing the margin of
+               learned ensembles},
+  booktitle = {Proceedings of the Fifteenth National Conference on
+               Artificial Intelligence},
+  year      = {1998},
+}
+
+@InProceedings{ConfAI:Maclin:adaboost,
+  author =       "Richard Maclin and David Opitz",
+  booktitle =    "Proceedings of the Fourteenth National Conference on
+                 Artificial Intelligence",
+  title =        "An empirical evaluation of Bagging and Boosting",
+  pages =        "546--551",
+  year =         "1997",
+}
+
+@inproceedings{ConfLT:Freund:gametheorie,
+  author    = {Yoav Freund and Robert E. Schapire},
+  title     = {Game theory, on-line prediction and Boosting},
+  booktitle = {Proceedings of the Ninth Annual Conference on
+               Computational Learning Theory},
+  pages     = {325--332},
+  year      = {1996},
+}
+
+@inproceedings{ConfML:Dietterich:adaboost+prun,
+  author    = {D. Margineantu and Thomas G. Dietterich},
+  title     = {Pruning Adaptive Boosting},
+  booktitle = {Machine Learning: Proceedings of Fourteenth
+               International Conference},
+  pages     = {211--218},
+  publisher = {ACM},
+  year      = {1997},
+}
+
+@inproceedings{ConfML:Freund:AdaBoostCompar,
+  author    = {Yoav Freund and Robert E. Schapire},
+  title     = {Experiments with a new Boosting algorithm},
+  booktitle = {Machine Learning: Proceedings of Thirteenth
+               International Conference},
+  pages     = {148--156},
+  publisher = {ACM},
+  address   = {USA},
+  year      = {1996},
+}
+
+@inproceedings{ConfML:Freund:margins,
+  author    = {Robert E. Schapire and Yoav Freund and Peter Bartlett
+               and Wee Sun Lee},
+  title     = {Boosting the margin: {A} new explanation for the
+               effectiveness of voting methods},
+  booktitle = {Machine Learning: Proceedings of Fourteenth
+               International Conference},
+  pages     = {322--330},
+  year      = {1997},
+}
+
+@InProceedings{ConfML:Quinlan:AdaBoost-C45,
+  author =       "J. Ross Quinlan",
+  booktitle =    "Proceedings of the Thirteenth National Conference on
+                 Artificial Intelligence",
+  title =        "Bagging, Boosting and {C4.5}",
+  pages =        "725--730",
+  year =         "1996",
+}
+
+@inproceedings{ConfML:Schapire:outputcodes,
+  author    = {Robert E. Schapire},
+  title     = {Using output codes to boost multiclass learning
+               problems},
+  booktitle = {Machine Learning: Proceedings of the Fourteenth
+               International Conference},
+  year      = {1997},
+}
+
+@article{Coolen88,
+  author  = {A. C. C. Coolen and C. C. A. M. Gielen},
+  title   = {Delays in Neural Networks},
+  journal = eul,
+  volume  = {7},
+  pages   = {281--285},
+  year    = {1988},
+}
+
+@Book{cooper+meyer-1960,
+  author =       "Grosvenor Cooper and Leonard B. Meyer",
+  title =        "The Rhythmic Structure of Music",
+  publisher =    "The Univ. of Chicago Press",
+  address =      "Chicago",
+  year =         "1960",
+  keywords =     "describe, music",
+  origin =       "Kielian-Gilbert",
+  own =          "IU Library",
+}
+
+@incollection{Cooper73,
+  author    = {L. N. Cooper},
+  title     = {A Possible Organization of Animal Memory and
+               Learning},
+  booktitle = {Collective Properties of Physical Systems},
+  editor    = {B. Lundqvist and S. Lundqvist},
+  pages     = {252--264},
+  publisher = {Academic Press},
+  address   = {New York},
+  year      = {1973},
+}
+
+@incollection{Cooper87,
+  author    = {C. L. Scofield and D. L. Reilly and C. Elbaum and L.
+               N. Cooper},
+  title     = {Pattern class degeneracy in an unrestricted storage
+               density memory},
+  booktitle = {Conference on Neural Information Processing Systems -
+               Natural and Synthetic},
+  publisher = {IEEE},
+  year      = {1987},
+}
+
+@Article{Corana87,
+  author =       "A. Corana and M. Marchesi and C. Martini and S.
+                 Ridella",
+  title =        "Minimizing Multimodal Functions of Continuous
+                 Variables with the Simulated Annealing Algorithm",
+  journal =      acmtms,
+  volume =       "13",
+  number =       "3",
+  pages =        "262--280",
+  month =        sep,
+  year =         "1987",
+  OPTnote =      "",
+}
+
+@Article{Corana87a,
+  author =       "A. Corana and M. Marchesi and C. Martini and S.
+                 Ridella",
+  title =        "Minimizing Multimodal Functions of Continuous
+                 Variables with the Simulated Annealing Algorithm",
+  journal =      acmtms,
+  volume =       "13",
+  number =       "3",
+  pages =        "262--280",
+  month =        sep,
+  year =         "1987",
+}
+
+@article{Cortes04,
+  author    = {C. Cortes and P. Haffner and M. Mohri},
+  title     = {Rational Kernels: Theory and Algorithms},
+  journal   = jmlr,
+  volume    = {5},
+  pages     = {1035--1062},
+  year      = {2004},
+  OPTnumber = {},
+}
+
+@article{Cortes87,
+  author  = {C. Cortes and A. Krogh and J. A. Hertz},
+  title   = {Hierarchical Associative Networks},
+  journal = jpa,
+  volume  = {20},
+  pages   = {4449--4455},
+  year    = {1987},
+}
+
+@inproceedings{Cortes89,
+  author    = {C. Cortes and J. A. Hertz},
+  title     = {A Network System for Image Segmentation},
+  booktitle = ijcnn,
+  volume    = {1},
+  pages     = {121--127},
+  publisher = {IEEE, New York},
+  address   = {Washington 1989},
+  year      = {1989},
+}
+
+@article{Cortes95,
+  author  = {Corinna Cortes and Vladimir Vapnik},
+  title   = {Support Vector Networks},
+  journal = {Machine Learning},
+  volume  = {20},
+  pages   = {273--297},
+  year    = {1995},
+}
+
+@inproceedings{Cortesetal95a,
+  author    = {C. Cortes and H. Drucker and D. Hoover and V. Vapnik},
+  title     = {Capacity and Complexity Control in Predicting the
+               Spread Between Borrowing and Lending Interest Rates},
+  booktitle = {Proc. 1st Intl. Conf. on Knowledge Discovery and Data
+               Mining},
+  address   = {Montreal (Canada)},
+  pages     = {51--56},
+  year      = {1995},
+}
+
+@inproceedings{Cortesetal95b,
+  author    = {C. Cortes and L. D. Jackel and W. P. Chiang},
+  title     = {Limits on Learning Machine Accuracy Imposed by Data
+               Quality},
+  booktitle = {Proc. 1st Intl. Conf. on Knowledge Discovery and Data
+               Mining},
+  address   = {Montreal (Canada)},
+  pages     = {57--62},
+  year      = {1995},
+}
+
+@inproceedings{Cosi-92,
+  author    = {P. Cosi and P. Frasconi and M. Gori and N. Griggio},
+  title     = {Phonetic Recognition Experiments with Recurrent Neural
+               Networks},
+  booktitle = {Proc. of the International Conference on Spoken
+               Language},
+  address   = {Banff (Canada)},
+  pages     = {1335--1338},
+  month     = oct,
+  year      = {1992},
+}
+
+@inproceedings{Cosnard+al-1991,
+  author    = {M. Cosnard and J. C. Mignot and H. Paugam-Moisy},
+  title     = {Implementations of Multilayer Neural Networks on
+               Parallel Architectures},
+  booktitle = {Proceedings of the Second International Specialist
+               Seminar on the Design and Application of Parallel
+               Digital Processors, 1991},
+  address   = {Lisbon},
+  pages     = {43--47},
+  month     = apr,
+  year      = {1991},
+}
+
+@article{Cosslett85,
+  author  = {S. R. Cosslett and L-F. Lee},
+  title   = {Serial correlation in discrete variable models},
+  journal = {Journal of Econometrics},
+  volume  = {27},
+  pages   = {79--97},
+  year    = {1985},
+}
+
+@article{Cottrell86,
+  author  = {M. Cottrell and J. C. Fort},
+  title   = {A Stochastic Model of Retinotopy: {A} Self Organizing
+             Process},
+  journal = biocyb,
+  volume  = {53},
+  pages   = {405--411},
+  year    = {1986},
+}
+
+@inproceedings{Cottrell87,
+  author    = {Garrison W. Cottrell and Paul Munro and David Zipser},
+  title     = {Learning Internal Representations from Gray-Scale
+               Images: An Example of Extensional Programming},
+  booktitle = {Ninth Annual Conference of the Cognitive Science
+               Society},
+  pages     = {462--473},
+  publisher = {Lawrence Erlbaum, Hillsdale},
+  address   = {Seattle 1987},
+  year      = {1987},
+}
+
+@Book{Courant51,
+  author =       "R. Courant and D. Hilbert",
+  title =        "Methods of Mathematical Physics",
+  publisher =    "Wiley Interscience",
+  address =      "New York",
+  year =         "1951",
+}
+
+@article{Cover65,
+  author  = {T. M. Cover},
+  title   = {Geometrical and Statistical Properties of Systems of
+             Linear Inequalities with Applications in Pattern
+             Recognition},
+  journal = ieeetec,
+  volume  = {14},
+  pages   = {326--334},
+  year    = {1965},
+}
+
+@article{CoverHart67,
+  author  = {T. M. Cover and P. E. Hart},
+  title   = {Nearest Neighbor Pattern Classification},
+  journal = {IEEE Transactions on Information Theory},
+  volume  = {13},
+  number  = {1},
+  pages   = {21--27},
+  year    = {1967},
+}
+
+@article{Cowan88a,
+  author  = {J. D. Cowan and D. H. Sharp},
+  title   = {Neural Nets and Artificial Intelligence},
+  journal = daed,
+  volume  = {117},
+  pages   = {85--121},
+  year    = {1988},
+}
+
+@article{Cowan88b,
+  author  = {J. D. Cowan and D. H. Sharp},
+  title   = {Neural Nets},
+  journal = qrb,
+  volume  = {21},
+  pages   = {365--427},
+  year    = {1988},
+}
+
+@inproceedings{Cox+Bridle89,
+  author       = {S. Cox and J. S. Bridle},
+  title        = {Unsupervised speaker adaptation by probabilistic
+                  spectrum fitting},
+  booktitle    = {Proc. IEEE Conf. on Acoustics, Speech and Signal
+                  Processing},
+  organization = {British Telecom and RSRE},
+  year         = {1989},
+}
+
+@inproceedings{Cox+Bridle90,
+  author       = {S. Cox and J. S. Bridle},
+  title        = {Simultaneous Speaker Normalisation and Utterance
+                  labelling Using {Bayesian}/Neural Net Techniques},
+  booktitle    = {Proc. IEEE Conf. on Acoustics, Speech and Signal
+                  Processing},
+  organization = {British Telecom and RSRE},
+  year         = {1990},
+}
+
+@Book{CoxCox94,
+  author =       "Trevor F. Cox and Michael A. A. Cox",
+  title =        "Multidimensional Scaling",
+  publisher =    "Chapman \& Hall",
+  address =      "London",
+  year =         "1994",
+}
+
+@Book{Cox+Cox-2000,
+  author =       "T. Cox and M. Cox",
+  title =        "Multidimensional Scaling",
+  publisher =    "Chapman \& Hall",
+  edition =      "Second",
+  address =      "London",
+  year =         "2000",
+}
+
+@inproceedings{Cozman2003,
+  author    = {F. Cozman and I. Cohen and M. Cirelo},
+  title     = {Semi-Supervised Learning of Mixture Models},
+  booktitle = ICML03,
+  editor    = ICML03ed,
+  publisher = ICML03publ,
+  year      = {2003},
+}
+
+@article{Cragg54,
+  author  = {B. G. Cragg and H. N. V. Temperley},
+  title   = {The Organization of Neurones: {A} Cooperative
+             Analogy},
+  journal = EEGCN,
+  volume  = {6},
+  pages   = {85--92},
+  year    = {1954},
+}
+
+@article{Cragg55,
+  author  = {B. G. Cragg and H. N. V. Temperley},
+  title   = {Memory: The Analogy with Ferromagnetic Hysteresis},
+  journal = brain,
+  volume  = {78 II},
+  pages   = {304--316},
+  year    = {1955},
+}
+
+@Article{Craven+Wahba79,
+  author =       "P. Craven and G. Wahba",
+  title =        "Smoothing noisy data with spline functions",
+  journal =      "Numerische Mathematik",
+  volume =       "31",
+  pages =        "377--403",
+  year =         "1979",
+}
+
+@article{Crick89,
+  author  = {F. Crick},
+  title   = {The Recent Excitement About Neural Networks},
+  journal = nature,
+  volume  = {337},
+  pages   = {129--132},
+  year    = {1989},
+}
+
+@article{Crisanti86,
+  author  = {A. Crisanti and D. J. Amit and H. Gutfreund},
+  title   = {Saturation Level of the Hopfield Model for Neural
+             Network},
+  journal = eul,
+  volume  = {2},
+  pages   = {337--341},
+  year    = {1986},
+}
+
+@article{Crisanti87,
+  author  = {A. Crisanti and H. Sompolinsky},
+  title   = {Dynamics of Spin Systems with Randomly Asymmetric
+             Bonds: Langevin Dynamics and a Spherical Model},
+  journal = prA,
+  volume  = {36},
+  pages   = {4922--4939},
+  year    = {1987},
+}
+
+@book{Cristianini+Shawe-Taylor-2000,
+  author    = {Nello Cristianini and John Shawe-Taylor},
+  title     = {An Introduction to Support Vector Machines and other
+               kernel-based learning methods},
+  publisher = {Cambridge University Press},
+  address   = {Cambridge, UK},
+  year      = {2000},
+}
+
+@inproceedings{Cristianini-2002,
+  author    = {N. Cristianini and J. Shawe-Taylor and J. Kandola},
+  title     = {Spectral Kernel Methods for Clustering},
+  editor    = NIPS14ed,
+  booktitle = NIPS14,
+  publisher = {{MIT} Press},
+  address   = {Cambridge, MA},
+  year      = {2002},
+}
+
+@inproceedings{Cristianini02,
+  author    = {N. Cristianini and J. Shawe-Taylor and A. Elisseeff
+               and J. Kandola},
+  title     = {On Kernel-Target Alignment},
+  editor    = NIPS14ed,
+  booktitle = NIPS14,
+  volume    = {14},
+  pages     = {367--373},
+  year      = {2002},
+}
+
+@inproceedings{Cristianini2002,
+  author    = {N. Cristianini and J. Shawe-Taylor and J. Kandola},
+  title     = {Spectral Kernel Methods for Clustering},
+  editor    = NIPS14ed,
+  booktitle = NIPS14,
+  publisher = {MIT Press},
+  address   = {Cambridge, MA},
+  year      = {2002},
+  original  = {orig/AA16.ps},
+}
+
+@article{cucker+grigoriev99,
+  author  = {Felipe Cucker and Dima Grigoriev},
+  title   = {Complexity Lower Bounds for Approximation Algebraic
+             Computation Trees},
+  journal = {Journal of Complexity},
+  volume  = {15},
+  number  = {4},
+  pages   = {499--512},
+  year    = {1999},
+}
+
+@techreport{Cybenko88,
+  author      = {G. Cybenko},
+  title       = {Continuous Valued Neural Networks with Two Hidden
+                 Layers Are Sufficient},
+  institution = {Department of Computer Science, Tufts University},
+  address     = {Medford, MA},
+  year        = {1988},
+}
+
+@article{Cybenko89,
+  author  = {G. Cybenko},
+  title   = {Approximation by Superpositions of a Sigmoidal
+             Function},
+  journal = mcss,
+  volume  = {2},
+  pages   = {303--314},
+  year    = {1989},
+}
+
+@inproceedings{Dahmen2000,
+  author    = {J. Dahmen and D. Keysers and M. Pitz and H. Ney},
+  title     = {Structured covariance matrices for statistical image
+               object recognition},
+  booktitle = {22nd Symposium of the German Association for Pattern
+               Recognition},
+  address   = {Kiel, Germany},
+  year      = {2000},
+}
+
+@InProceedings{Dai95,
+  author =       "H. Dai and J. M. Lina and B. Goulard and J. W. Thomson
+                 and C. K. Scott",
+  booktitle =    "1995 Robotic and Knowledge Based Systems Workshop",
+  title =        "An Expert Diagnostic System Introducing Wavelets
+                 Analysis and Neural Network",
+  address =      "St. Hubert, Canada",
+  OPTpages =     "",
+  year =         "1995",
+}
+
+@InProceedings{darken-moody91,
+  author =       "Christian Darken and John Moody",
+  editor =       NIPS3ed,
+  booktitle =    NIPS3,
+  title =        "Note on learning rate schedules for stochastic
+                 optimization",
+  publisher =    "Morgan Kaufmann",
+  address =      "San Mateo, CA",
+  pages =        "832--838",
+  year =         "1991",
+}
+
+@Article{DarrochJ1972,
+  author =       "J. N. Darroch and D. Ratcliff",
+  title =        "Generalized iterative scaling for log-linear models",
+  journal =      "Annals of Mathematical Statistics",
+  volume =       "43",
+  pages =        "1470--1480",
+  year =         "1972",
+}
+
+@inproceedings{Das-nips93,
+  author    = {S. Das and C. L. Giles and G. Z. Sun},
+  title     = {Using Prior Knowledge in an {NNPDA} to Learn
+               Context-Free Languages},
+  editor    = NIPS5ed,
+  booktitle = NIPS5,
+  publisher = {Morgan Kaufmann},
+  address   = {San Mateo CA},
+  year      = {1993},
+}
+
+@inproceedings{Das-nips94,
+  author    = {S. Das and M. C. Mozer},
+  title     = {A Unified Gradient-Descent/Clustering Architecture for
+               Finite State Machine Induction},
+  editor    = NIPS6ed,
+  booktitle = NIPS6,
+  publisher = {Morgan Kaufmann},
+  year      = {1994},
+}
+
+@Article{daubechies90,
+  author =       "Ingrid Daubechies",
+  title =        "The Wavelet Transform, Time-Frequency Localization and
+                 Signal Analysis",
+  journal =      "IEEE Transactions on Information Theory",
+  volume =       "36",
+  number =       "5",
+  pages =        "961--1005",
+  month =        sep,
+  year =         "1990",
+}
+
+@article{daume09searn,
+  author =       {Hal {Daum\'e III} and John Langford and Daniel Marcu},
+  title =        {Search-based Structured Prediction},
+  journal =      {Machine Learning},
+  volume =       {75},
+  number =       {3},
+  pages =        {297--325},
+  year =         {2009},
+}
+
+@inproceedings{Davis89,
+  author    = {L. Davis},
+  title     = {Mapping neural networks into classifier systems},
+  booktitle = {Proceedings of the Third International Conference on
+               Genetic Algorithms},
+  editor    = {J. D. Schaffer},
+  pages     = {375--378},
+  publisher = {Morgan Kaufmann, San Mateo},
+  address   = {Arlington 1989},
+  year      = {1989},
+}
+
+@article{davis94adaptive,
+  author  = {G. Davis and S. Mallat and Z. Zhang},
+  title   = {Adaptive time-frequency decompositions},
+  journal = {Optical Engineering},
+  volume  = {33},
+  number  = {7},
+  pages   = {2183--2191},
+  month   = jul,
+  year    = {1994},
+}
+
+@inproceedings{Dayan93,
+  author    = {P. Dayan and G. E. Hinton},
+  title     = {Feudal Reinforcement Learning},
+  editor    = NIPS5ed,
+  booktitle = NIPS5,
+  publisher = {Morgan Kaufmann},
+  address   = {San Mateo, CA},
+  year      = {1993},
+}
+
+@article{Dayan95,
+  author  = {Peter Dayan and Geoffrey E. Hinton and Radford Neal and
+             Rich Zemel},
+  title   = {The {Helmholtz} machine},
+  journal = {Neural Computation},
+  volume  = {7},
+  pages   = {889--904},
+  year    = {1995},
+}
+
+@inproceedings{debiecristianini03,
+  author    = {{de Bie}, T. and Cristianini, N.},
+  title     = {Convex methods for transduction},
+  booktitle = NIPS16,
+  editor    = NIPS16ed,
+  year      = {2003},
+}
+
+@article{debiecristianini06,
+  author  = {{de Bie}, T. and Cristianini, N.},
+  title   = {Fast {SDP} relaxations of graph cut
+             clustering, transduction, and other combinatorial problems},
+  journal = jmlr,
+  volume  = {7},
+  year    = {2006},
+}
+
+
+@techreport{deRidder+Duin-2002,
+  author      = "Dick {de Ridder} and Robert P. W. Duin",
+  title       = "Locally linear embedding for classification",
+  institution = "Pattern Recognition Group, Dept. of Imaging Science and Technology,
+                 Delft University of Technology",
+  number      = "PH-2002-01",
+  address     = "Delft, The Netherlands",
+  year        = "2002",
+}
+
+@inproceedings{deRidder+al-2003,
+    author    = {Dick {de Ridder} and Olga Kouropteva and Oleg Okun and Matti Pietik{\"a}inen and Robert P. W. Duin},
+    title     = {Supervised Locally Linear Embedding},
+    booktitle = {ICANN},
+    year      = {2003},
+    pages     = {333--341},
+    ee        = {http://springerlink.metapress.com/openurl.asp?genre=article&issn=0302-9743&volume=2714&spage=333},
+    bibsource = {DBLP, http://dblp.uni-trier.de}
+}
+
+@inproceedings{debollivier-gallinari-thiria-90,
+  author    = {M. deBollivier and P. Gallinari and S. Thiria},
+  title     = {Multi-module neural networks for classification},
+  booktitle = {Proc. of the International Neural Network Conference
+               90},
+  address   = {Paris},
+  pages     = {777--780},
+  year      = {1990},
+}
+
+@article{Decoste-2002,
+  author  = {Dennis Decoste and Bernhard Sch{\"o}lkopf},
+  title   = {Training invariant support vector machines},
+  journal = {Machine Learning},
+  volume  = {46},
+  pages   = {161--190},
+  year    = {2002},
+}
+
+@article{Deerwester90,
+  author  = {S. Deerwester and S. T. Dumais and G. W. Furnas and T.
+             K. Landauer and R. Harshman},
+  title   = {Indexing by latent semantic analysis},
+  journal = {Journal of the American Society for Information
+             Science},
+  volume  = {41},
+  number  = {6},
+  pages   = {391--407},
+  year    = {1990},
+}
+
+@article{Dehaene87,
+  author  = {S. Dehaene and J.-P. Changeux and J.-P. Nadal},
+  title   = {Neural Networks That Learn Temporal Sequences by
+             Selection},
+  journal = PNAS,
+  volume  = {84},
+  pages   = {2727--2731},
+  year    = {1987},
+}
+
+@inproceedings{Delalleau+al-2005-short,
+  author    = {Olivier Delalleau and Yoshua Bengio and Nicolas {Le Roux}},
+  title     = {Efficient Non-Parametric Function Induction in
+               Semi-Supervised Learning},
+  editor    = aistats05ed,
+  booktitle = aistats05,
+  pages     = {96--103},
+  year      = {2005},
+}
+
+@InProceedings{DeLaTorreF2006,
+  author =       "Fernando {De la Torre Frade} and Takeo Kanade",
+  booktitle =    "International Conference on Machine Learning",
+  title =        "Discriminative Cluster Analysis",
+  volume =       "148",
+  publisher =    "ACM Press",
+  address =      "New York, NY, USA",
+  pages =        "241--248",
+  month =        jun,
+  year =         "2006",
+}
+
+@article{Delgutte+Kiang84,
+  author  = {B. Delgutte and N. Y. S. Kiang},
+  title   = {Speech coding in the auditory nerve},
+  journal = jasa,
+  volume  = {75},
+  number  = {3},
+  pages   = {866--907},
+  year    = {1984},
+}
+
+@article{Delgutte80,
+  author  = {B. Delgutte},
+  title   = {Representation of speech-like sounds in the discharge
+             patterns of auditory nerve fibers},
+  journal = jasa,
+  volume  = {68},
+  number  = {3},
+  pages   = {843--857},
+  year    = {1980},
+}
+
+@misc{delve,
+  author = {C. Rasmussen and R. Neal and G. E. Hinton and D. van
+            Camp and Z. Ghahramani and R. Kustra and R.
+            Tibshirani},
+  title  = {The {DELVE} Manual},
+  note   = {{DELVE} can be found at
+            http://www.cs.toronto.edu/\%7Edelve},
+  year   = {1996},
+}
+
+@inproceedings{DeMers+Cottrell93,
+  author    = {David DeMers and Garrison W. Cottrell},
+  title     = {Non-linear dimensionality reduction},
+  editor    = NIPS5ed,
+  booktitle = NIPS5,
+  pages     = {580--587},
+  publisher = {Morgan Kaufmann},
+  address   = {San Mateo CA},
+  year      = {1993},
+}
+
+@InProceedings{Demichelis89,
+  author =       "P. DeMichelis and L. Fissore and P. Laface and G.
+                 Micca and E. Piccolo",
+  booktitle =    icassp,
+  title =        "On the Use of Neural Networks for Speaker Independent
+                 Isolated Word Recognition",
+  address =      "Glasgow (Scotland)",
+  year =         "1989",
+}
+
+@InProceedings{DeMori+Palakal85,
+  author =       "R. {De Mori} and M. Palakal",
+  booktitle =    "Proc. Ninth International Joint Conference on
+                 Artificial Intelligence",
+  title =        "On the use of taxonomy of time-frequency morphologies
+                 for automatic speech recognition",
+  address =      "Los Angeles, CA",
+  pages =        "877--879",
+  year =         "1985",
+}
+
+@Article{DeMori85,
+  author =       "R. {De Mori} and P. Laface and Y. Mong",
+  title =        "Parallel algorithms for syllable recognition in
+                 continuous speech",
+  journal =      ieeetpami,
+  volume =       "7",
+  pages =        "56--69",
+  year =         "1985",
+}
+
+@Article{DeMori87,
+  author =       "R. {De Mori} and L. Lam and M. Gilloux",
+  title =        "Learning and plan refinement in a knowledge-based
+                 system for automatic speech recognition",
+  journal =      ieeetpami,
+  volume =       "9",
+  number =       "2",
+  pages =        "289--305",
+  year =         "1987",
+}
+
+@InCollection{DeMori96,
+  author =       "R. {De Mori} and F. Brugnara",
+  editor =       "R. A. Cole and J. Mariani and H. Uszkoreit and A.
+                 Zaenen and V. Zue",
+  booktitle =    "Survey of the State of the Art in Human Language
+                 Technology",
+  title =        "{HMM} Methods in Speech Recognition",
+  publisher =    "Cambridge University Press",
+  url =          "http://www.cse.ogi.edu/CSLU/HLTsurvey/HLTsurvey.html",
+  pages =        "24--34",
+  year =         "1996",
+}
+
+@article{Dempster77,
+  author  = {A. P. Dempster and N. M. Laird and D. B. Rubin},
+  title   = {Maximum-likelihood from incomplete data via the {EM}
+             algorithm},
+  journal = {Journal of Royal Statistical Society B},
+  volume  = {39},
+  pages   = {1--38},
+  year    = {1977},
+}
+
+@InProceedings{denker-lecun-93,
+  author =       "John S. Denker and Yann {LeCun}",
+  booktitle =    "IEEE Workshop on the Physics of Computation",
+  title =        "Natural versus Universal Probability, Complexity, and
+                 Entropy",
+  publisher =    "IEEE",
+  pages =        "122--127",
+  year =         "1992",
+}
+
+@inproceedings{Denker86,
+  author    = {J. Denker},
+  title     = {Neural Network Refinements and Extensions},
+  booktitle = snowbird,
+  editor    = {J. S. Denker},
+  pages     = {121--128},
+  publisher = {American Institute of Physics, New York},
+  address   = {Snowbird 1986},
+  year      = {1986},
+}
+
+@article{Denker87,
+  author  = {J. Denker and D. Schwartz and B. Wittner and S. Solla
+             and R. Howard and L. Jackel and J. Hopfield},
+  title   = {Large Automatic Learning, Rule Extraction, and
+             Generalization},
+  journal = cs,
+  volume  = {1},
+  pages   = {877--922},
+  year    = {1987},
+}
+
+@inproceedings{Denker91,
+  author    = {J. S. Denker and Y. {LeCun}},
+  title     = {Transforming neural-net output levels to probability
+               distributions},
+  editor    = NIPS3ed,
+  booktitle = NIPS3,
+  pages     = {853--859},
+  publisher = {Morgan Kaufmann},
+  address   = {San Mateo CA},
+  year      = {1991},
+}
+
+@InProceedings{Denker94,
+  author =       "J. Denker and C. J. C. Burges",
+  booktitle =    "The Mathematics of Generalization: Proceedings of the
+                 SFI/CNLS Workshop on Formal Approaches to Supervised
+                 Learning",
+  title =        "Image Segmentation and Recognition",
+  publisher =    "Addison Wesley",
+  isbn =         "0-201-40985-2",
+  year =         "1994",
+}
+
+@article{Deprit89,
+  author  = {E. Deprit},
+  title   = {Implementing Recurrent Back-Propagation on the
+             Connection Machine},
+  journal = {Neural Networks},
+  volume  = {2},
+  number  = {4},
+  pages   = {295--314},
+  year    = {1989},
+}
+
+@ARTICLE{Derenyi94,
+   author = {{Der{\'e}nyi}, I. and {Geszti}, T. and {Gy{\"o}rgyi}, G.},
+    title = {Generalization in the programed teaching of a perceptron},
+  journal = {Physical Review {E}},
+     year = 1994,
+    month = oct,
+   volume = 50,
+    pages = {3192--3200},
+      doi = {10.1103/PhysRevE.50.3192},
+   adsurl = {http://adsabs.harvard.edu/abs/1994PhRvE..50.3192D},
+  adsnote = {Provided by the SAO/NASA Astrophysics Data System}
+}
+
+@article{Derrida87,
+  author  = {B. Derrida and E. Gardner and A. Zippelius},
+  title   = {An Exactly Soluble Asymmetric Neural Network Model},
+  journal = eul,
+  volume  = {4},
+  pages   = {167--173},
+  year    = {1987},
+}
+
+@techreport{Derthick84,
+  author      = {M. Derthick},
+  title       = {Variations on the {Boltzmann} Machine},
+  institution = {Department of Computer Science, Carnegie Mellon
+                 University},
+  number      = {CMU--CS--84--120},
+  address     = {Pittsburgh, PA},
+  year        = {1984},
+}
+
+@inproceedings{deSaV93,
+  address = {San Francisco, CA},
+  author = {de Sa, Virginia R.},
+  editor = NIPS6ed,
+  booktitle = NIPS6,
+  citeulike-article-id = {350518},
+  keywords = {multiview, semisupervised},
+  pages = {112--119},
+  posted-at = {2008-08-12 16:46:39},
+  priority = {2},
+  publisher = {Morgan Kaufmann Publishers},
+  title = {Learning Classification with Unlabeled Data},
+  year = {1994}
+}
+	%url = {http://citeseer.ist.psu.edu/desa94learning.html},
+
+@inproceedings{DeSieno88,
+  author    = {D. DeSieno},
+  title     = {Adding a Conscience to Competitive Learning},
+  booktitle = icnn,
+  volume    = {1},
+  pages     = {117--124},
+  publisher = {IEEE, New York},
+  address   = {San Diego 1988},
+  year      = {1988},
+}
+
+@inproceedings{DeSilva+Tenenbaum-2003,
+  author    = {V. {de Silva} and J. B. Tenenbaum},
+  title     = {Global Versus Local Methods in Nonlinear
+               Dimensionality Reduction},
+  editor    = NIPS15ed,
+  booktitle = NIPS15,
+  pages     = {705--712},
+  publisher = {{MIT} Press},
+  address   = {Cambridge, MA},
+  year      = {2003},
+}
+
+@book{Devaney89,
+  author    = {R. L. Devaney},
+  title     = {An Introduction to Chaotic Dynamical Systems},
+  publisher = {Addison-Wesley},
+  year      = {1989},
+}
+
+@article{Devereux84,
+  author  = {J. Devereux and P. Haeberli and O. Smithies},
+  title   = {A comprehensive set of sequence analysis programs for
+             the {VAX}},
+  journal = {Nucleic Acids Research},
+  volume  = {12},
+  pages   = {387--395},
+  year    = {1984},
+}
+
+@book{Devijver82,
+  author    = {P. A. Devijver and J. Kittler},
+  title     = {Pattern Recognition: {A} Statistical Approach},
+  publisher = {Prentice-Hall},
+  address   = {London},
+  year      = {1982},
+}
+
+@article{Devijver87,
+  author  = {J. Voisin and P. A. Devijver},
+  title   = {An application of the multiedit-condensing technique
+             to the reference selection problem in a print
+             recognition system},
+  journal = {Pattern Recognition},
+  volume  = {20},
+  number  = {5},
+  pages   = {465--474},
+  year    = {1987},
+}
+
+@Article{deVries92,
+  author =       "B. {de Vries} and J. C. Principe",
+  title =        "The gamma model -- {A} new neural net model for
+                 temporal processing",
+  journal =      nn,
+  volume =       "5",
+  pages =        "565--576",
+  year =         "1992",
+}
+
+@Book{Devroye-book96,
+  author =       "L. Devroye and L. Gy{\"o}rfi and G. Lugosi",
+  title =        "A Probabilistic Theory of Pattern Recognition",
+  publisher =    "Springer-Verlag",
+  year =         "1996",
+}
+
+@article{Devroye88,
+  author  = {Luc Devroye},
+  title   = {Automatic Pattern Recognition: {A} Study of the
+             Probability of Error},
+  journal = {IEEE Transactions on Pattern Analysis and Machine
+             Intelligence},
+  volume  = {10},
+  number  = {4},
+  pages   = {530--543},
+  month   = jul,
+  year    = {1988},
+}
+
+@Book{Diamantras-96,
+  author =       "K. I. Diamantaras and S. Y. Kung",
+  title =        "Principal Component Neural Networks: Theory and Applications",
+  publisher =    "Wiley",
+  year =         "1996",
+}
+
+@article{Diebold+Mariano95,
+  author  = {F. X. Diebold and R. S. Mariano},
+  title   = {Comparing Predictive Accuracy},
+  journal = {Journal of Business and Economic Statistics},
+  volume  = {13},
+  number  = {3},
+  pages   = {253--263},
+  year    = {1995},
+}
+
+@incollection{Diebold93,
+  author    = {F. X. Diebold and J. H. Lee and G. C. Weinbach},
+  title     = {Regime switching with time-varying transition
+               probabilities},
+  booktitle = {Nonstationary Time Series Analysis and Cointegration},
+  editor    = {C. Hargreaves},
+  publisher = {Oxford University Press},
+  address   = {Oxford},
+  year      = {1993},
+}
+
+@incollection{Diebold93b,
+  author    = {F. X. Diebold and G. Rudebusch and E. Sichel},
+  title     = {Further evidence on business-cycle duration
+               dependence},
+  booktitle = {Business Cycles, Indicators, and Forecasting},
+  editor    = {J. H. Stock and M. W. Watson},
+  publisher = {University of Chicago Press},
+  address   = {Chicago},
+  year      = {1993},
+}
+
+@TechReport{DieboldKilian,
+  author =       "F. X. Diebold and L. Kilian",
+  title =        "Measuring Predictability: Theory and Macroeconomic
+                 Applications",
+  type =         "Technical Working Paper",
+  number =       "213",
+  institution =  "National Bureau of Economic Research",
+  year =         "1997",
+}
+
+@incollection{DieboldLopez,
+  author    = {F. X. Diebold and J. A. Lopez},
+  title     = {Forecast Evaluation and Combination},
+  booktitle = {Handbook of Statistics, Vol. 14},
+  editor    = {G. S. Maddala and C. R. Rao},
+  pages     = {241--268},
+  publisher = {Elsevier Science},
+  year      = {1996},
+}
+
+@article{Diederich87,
+  author  = {S. Diederich and M. Opper},
+  title   = {Learning of Correlated Patterns in Spin-Glass Networks
+             by Local Learning Rules},
+  journal = prl,
+  volume  = {58},
+  pages   = {949--952},
+  year    = {1987},
+}
+
+@inproceedings{Diegert90,
+  author    = {C. Diegert},
+  title     = {Out-of-core Backpropagation},
+  booktitle = {Proceedings of IEEE-IJCNN90},
+  volume    = {II},
+  pages     = {97--103},
+  address   = {San Diego, CA},
+  year      = {1990},
+}
+
+@Article{dietterich,
+  author =       "T. G. Dietterich",
+  title =        "Approximate Statistical Tests for Comparing Supervised
+                 Classification Learning Algorithms",
+  journal =      "Neural Computation",
+  volume =       "10",
+  number =       "7",
+  pages =        "1895--1923",
+  year =         "1998",
+}
+
+@Article{Dietterich1998,
+  author =       "Thomas G. Dietterich",
+  title =        "Approximate Statistical Tests for Comparing Supervised
+                 Classification Learning Algorithms",
+  journal =      "Neural Computation",
+  volume =       "10",
+  number =       "7",
+  pages =        "1895--1923",
+  year =         "1998",
+  URL =          "http://citeseer.ist.psu.edu/dietterich98approximate.html",
+}
+
+@Article{dietterich97,
+  author =       "Thomas G. Dietterich and Richard H. Lathrop and Tom{\'a}s
+                 Lozano-P{\'e}rez",
+  title =        "Solving the Multiple Instance Problem with
+                 Axis-Parallel Rectangles",
+  journal =      "Artificial Intelligence",
+  volume =       "89",
+  number =       "1--2",
+  pages =        "31--71",
+  year =         "1997",
+}
+
+
+@article{Diggle+Gratton-1984,
+  author    = {P. Diggle and R. Gratton},
+  title     = {Monte Carlo Methods of Inference for Implicit Statistical Models},
+  journal   = {Journal of the Royal Statistical Society. Series B (Methodological)},
+  volume    = {46},
+  number    = {2},
+  pages     = {193--227},
+  publisher = {Blackwell Publishing for the Royal Statistical Society},
+  year      = {1984},
+}
+
+
+@incollection{Doi-2006,
+  author    = {Eizaburo Doi and Doru C. Balcan and Michael S.
+               Lewicki},
+  title     = {A Theoretical Analysis of Robust Coding over Noisy
+               Overcomplete Channels},
+  editor    = NIPS18ed,
+  booktitle = NIPS18,
+  pages     = {307--314},
+  publisher = {MIT Press},
+  address   = {Cambridge, MA},
+  year      = {2006},
+}
+
+@InProceedings{DoiE2007,
+  author =       "Eizaburo Doi and Michael S. Lewicki",
+  editor =       NIPS19ed,
+  booktitle =    NIPS19,
+  title =        "A Theory of Retinal Population Coding",
+  publisher =    "MIT Press",
+  pages =        "353--360",
+  year =         "2007",
+}
+
+@book{Doidge-2007,
+    author = {Doidge, Norman},
+    howpublished = {Paperback},
+    isbn = {0143113100},
+    month = dec,
+    publisher = {Penguin Group},
+    title = {The Brain That Changes Itself: Stories of Personal Triumph from the Frontiers of Brain Science},
+    year = {2007}
+}
+
+@incollection{DollarP2007,
+  author    = {Piotr Doll\'ar and Serge Belongie and Vincent Rabaud},
+  title     = {Learning to Traverse Image Manifolds},
+  editor    = NIPS19ed,
+  booktitle = NIPS19,
+  pages     = {361--368},
+  publisher = {MIT Press},
+  address   = {Cambridge, MA},
+  year      = {2007},
+}
+
+@inproceedings{DollarP2007b,
+       author = "P. Doll\'ar and V. Rabaud and S. Belongie",
+       title = "Non-Isometric Manifold Learning: Analysis and an Algorithm",
+       booktitle =    ICML07,
+       editor =       ICML07ed,
+       publisher =    ICML07publ,
+       month = jun,
+       year = "2007"
+}
+
+@techreport{Donoho+Carrie-03,
+  author      = {D. L. Donoho and C. Grimes},
+  title       = {Hessian Eigenmaps: new locally linear embedding
+                 techniques for high-dimensional data},
+  institution = {Dept. Statistics, Stanford University},
+  number      = {2003-08},
+  year        = {2003},
+}
+
+@article{Donoho-2006,
+  author  = "David Donoho",
+  title   = "Compressed sensing",
+  journal = "{IEEE} Transactions on Information Theory",
+  volume  = "52",
+  number  = "4",
+  pages   = "1289--1306",
+  year    = "2006",
+}
+
+@book{Dorigo98,
+  author    = {M. Dorigo and M. Colombetti},
+  title     = {Robot shaping: {An} experiment in behavior
+               engineering},
+  publisher = {MIT Press/Bradford Books},
+  year      = {1998},
+}
+
+@book{Doucet+al-2001,
+  editor    = {A. Doucet and N. {de Freitas} and N. Gordon},
+  title     = {Sequential Monte Carlo Methods in Practice},
+  publisher = {Springer-Verlag},
+  year      = {2001},
+}
+
+@techreport{Doya93bif,
+  author      = {K. Doya},
+  title       = {Bifurcations of Recurrent Neural Networks in Gradient
+                 Learning},
+  institution = {Department of Biology, University of California},
+  address     = {La Jolla, CA},
+  note        = {Submitted},
+  year        = {1993},
+}
+
+@techreport{Doya93un,
+  author      = {K. Doya},
+  title       = {Universality of Fully-Connected Recurrent Neural
+                 Networks},
+  institution = {Department of Biology, University of California},
+  address     = {La Jolla, CA},
+  note        = {Submitted},
+  year        = {1993},
+}
+
+@Book{Doyle+Snell-1984,
+  author =       "Peter G. Doyle and J. Laurie Snell",
+  title =        "Random Walks and Electric Networks",
+  publisher =    "Mathematical Association of America",
+  year =         "1984",
+}
+
+@book{Draper81,
+  author    = {N. R. Draper and H. Smith},
+  title     = {Applied Regression Analysis},
+  publisher = {John Wiley and Sons},
+  year      = {1981},
+}
+
+@inproceedings{Driancourt91,
+  author     = {X. Driancourt and L. Bottou and P. Gallinari},
+  title      = {Learning Vector Quantization, Multi-Layer Perceptron
+                and Dynamic Programming: Comparison and Cooperation},
+  booktitle  = ijcnn,
+  volume     = {2},
+  pages      = {815--819},
+  year       = {1991},
+  OPTaddress = {Seattle WA},
+}
+
+@inproceedings{Drucker93,
+  author    = {H. Drucker and R. Schapire and R. Simard},
+  title     = {Improving performance in neural networks using a
+               boosting algorithm},
+  editor    = NIPS5ed,
+  booktitle = NIPS5,
+  pages     = {42--49},
+  publisher = {Morgan Kaufmann},
+  address   = {San Mateo, CA},
+  year      = {1993},
+}
+
+@article{Drucker93b,
+  author  = {H. Drucker and R. Schapire and R. Simard},
+  title   = {Boosting performance in neural networks},
+  journal = {International Journal of Pattern Recognition and
+             Artificial Intelligence},
+  pages   = {61--76},
+  note    = {Special Issue on Applications of Neural Networks to
+             Pattern Recognition (I. Guyon Ed.)},
+  year    = {1993},
+}
+
+@article{Duane-1987,
+ author = {S. Duane and A. D. Kennedy and B. Pendleton and D. Roweth},
+ title = {Hybrid {Monte Carlo}},
+ journal = {Physics Letters {B}},
+ volume = 195,
+ pages = {216--222},
+ year = 1987,
+}
+
+@book{Duda-Hart,
+  author    = {R. O. Duda and P. E. Hart},
+  title     = {Pattern Classification and Scene Analysis},
+  publisher = {Wiley},
+  address   = {New York},
+  year      = {1973},
+}
+
+@Book{Duda-Hart-2000,
+  author =       "R. O. Duda and P. E. Hart and D. G. Stork",
+  title =        "Pattern Classification",
+  edition =      "Second",
+  publisher =    "Wiley and Sons",
+  address =      "New York",
+  year =         "2001",
+}
+
+@book{Duda73,
+  author    = {R. O. Duda and P. E. Hart},
+  title     = {Pattern Classification and Scene Analysis},
+  publisher = {Wiley},
+  address   = {New York},
+  year      = {1973},
+}
+
+@article{Dugas+al-2003,
+  author  = {C. Dugas and Y. Bengio and N. Chapados and P. Vincent
+             and G. Denoncourt and C. Fournier},
+  title   = {Statistical Learning Algorithms Applied to Automobile
+             Insurance Ratemaking},
+  journal = {CAS Forum},
+  volume  = {1},
+  number  = {1},
+  pages   = {179--214},
+  month   = {Winter},
+  year    = {2003},
+}
+
+@TechReport{Dugas00,
+  author =       "C. Dugas and O. Bardou and Y. Bengio",
+  title =        "Analyses Empiriques sur des Transactions d'options",
+  number =       "1176",
+  institution =  "D{\'e}partement d'informatique et de Recherche
+                 Op{\'e}rationnelle, Universit{\'e} de Montr{\'e}al",
+  address =      "Montr{\'e}al, Qu{\'e}bec, Canada",
+  year =         "2000",
+}
+
+@inproceedings{Dugas01,
+  author    = {C. Dugas and Y. Bengio and F. B\'elisle and C.
+               Nadeau},
+  title     = {Incorporating Second-Order Functional Knowledge for Better Option Pricing},
+  booktitle = NIPS13,
+  editor    = NIPS13ed,
+  year      = {2001},
+  publisher = {{MIT} Press},
+  pages     = {472--478},
+}
+
+%%InProceedings{Bengio2000,
+%%  author =       "Y. Bengio",
+%%  booktitle =    icjnn
+%%  title =        "Incorporating Second-Order Functional Knowledge for Better Option Pricing",
+%%  volume =       "V",
+%%  pages =        "79--84",
+%%  year =         "2000",
+%%}
+
+@inproceedings{Bengio2000,
+  title={Probabilistic neural network models for sequential data},
+  author={Bengio, Y.},
+  booktitle=ijcnn,
+  year={2000},
+  volume={5},
+  pages={79--84},
+  abstract={Artificial neural networks (ANN) can be incorporated into probabilistic models. In this paper we review some of the approaches which have been proposed to incorporate them into probabilistic models of sequential data, such as hidden Markov models (HMM). We also discuss new developments and new ideas in this area, in particular how ANN can be used to model high-dimensional discrete and continuous data to deal with the curse of dimensionality and how the ideas proposed in these models could be applied to statistical language modeling to represent longer-term context than allowed by trigram models, while keeping word-order information},
+  keywords={computational linguistics, hidden Markov models, neural nets, probability, ANN, HMM, hidden Markov models, longer-term context, probabilistic models, probabilistic neural network models, sequential data, statistical language modeling, trigram models, word-order information},
+  doi={10.1109/IJCNN.2000.861438},
+}
+
+@inproceedings{Bengio-hyper-2000,
+  author    = {Yoshua Bengio},
+  title     = {Continuous Optimization of Hyper-Parameters},
+  booktitle = ijcnn,
+  year      = {2000},
+  volume    = {V},
+  pages     = {305--310},
+}
+
+@inproceedings{Ghosn2000,
+  author    = {J. Ghosn and Y. Bengio},
+  title     = {Bias Learning, Knowledge Sharing},
+  booktitle = ijcnn,
+  year      = {2000},
+  volume    = {I},
+  pages     = {9--14},
+}
+
+@article{Durbin87,
+  author  = {R. Durbin and D. Willshaw},
+  title   = {An Analogue Approach to the Travelling Salesman
+             Problem Using an Elastic Net Method},
+  journal = nature,
+  year    = {1987},
+  volume  = {326},
+  pages   = {689--691},
+}
+
+@mastersthesis{Dzwonczyk91,
+  author = {M. Dzwonczyk},
+  title  = {Quantitative failure models of feed-forward neural
+            networks},
+  year   = {1991},
+  school = {MIT},
+}
+
+@Book{econometric-G-97,
+  author =       "W. H. Greene",
+  title =        "Econometric Analysis",
+  edition =      "Third",
+  publisher =    "Prentice Hall, Inc.",
+  year =         "1997",
+}
+
+@article{efficient-KW-82,
+  author  = {W. W. Krasker and R. R. Welsch},
+  title   = {Efficient Bounded-Influence Regression Estimation},
+  journal = {J. Am. Stat. Asso.},
+  year    = {1982},
+  volume  = {77},
+  pages   = {595--604},
+}
+
+@book{Efron+Tibs93,
+  author    = {Bradley Efron and Robert J. Tibshirani},
+  title     = {An introduction to the Bootstrap},
+  year      = {1993},
+  publisher = {Chapman and Hall},
+  address   = {New York},
+}
+
+@TechReport{eigen-TR2,
+  author =       "Yoshua Bengio and Pascal Vincent and Jean-Fran{\c{c}}ois
+                 Paiement and Olivier Delalleau and Marie Ouimet and
+                 Nicolas {Le Roux}",
+  title =        "Spectral Clustering and Kernel {PCA} are Learning
+                 Eigenfunctions",
+  number =       "1239",
+  institution =  "D{\'e}partement d'informatique et recherche
+                 op{\'e}rationnelle, Universit{\'e} de Montr{\'e}al",
+  year =         "2003",
+}
+
+@inproceedings{Eisner96,
+  author    = {J. Eisner},
+  title     = {Three new probabilistic models for dependency parsing:
+               an exploration},
+  booktitle = {COLING-96},
+  year      = {1996},
+  address   = {Copenhagen, Denmark},
+  pages     = {340--345},
+}
+
+@article{EladAharon2006,
+  author    = {Michael Elad and Michal Aharon},
+  title     = {Image Denoising Via Sparse and Redundant
+               Representations Over Learned Dictionaries},
+  journal   = {IEEE Transactions on Image Processing},
+  year      = {2006},
+  month     = dec,
+  volume    = {15},
+  number    = {12},
+  pages     = {3736--3745},
+  bibsource = {http://www.visionbib.com/bibliography/image-proc131.html#TT8737},
+}
+
+@inproceedings{ElHihi+Bengio-nips8-small,
+  author    = {S. ElHihi and Y. Bengio},
+  title     = {Hierarchical Recurrent Neural Networks for Long-Term
+               Dependencies},
+  booktitle = NIPS8,
+  editor    = NIPS8ed,
+  year      = {1996},
+  publisher = {MIT Press, Cambridge, MA},
+  pages     = {493--499},
+}
+
+@InProceedings{ellis+poliner-icassp2007,
+  author =       "D. Ellis and G. Poliner",
+  booktitle =    "{Proceedings of the 2007 International Conference on
+                 Acoustics, Speech and Signal Processing (ICASSP)}",
+  title =        "Identifying Cover Songs with Chroma Features and
+                 Dynamic Programming Beat Tracking",
+  publisher =    "IEEE Signal Processing Society",
+  pages =        "IV-1429--IV-1432",
+  year =         "2007",
+}
+
+@article{Elman88,
+  author  = {J. L. Elman and D. Zipser},
+  title   = {Learning the Hidden Structure of Speech},
+  journal = jasa,
+  year    = {1988},
+  volume  = {83},
+  pages   = {1615--1626},
+}
+
+@Article{Elman88Jasa88,
+  author =       "J. L. Elman and D. Zipser",
+  title =        "Learning the Hidden Structure of Speech",
+  journal =      "Journal of the Acoustical Society of America",
+  volume =       "83",
+  pages =        "1615--1626",
+  year =         "1988",
+}
+
+@article{Elman90,
+  author  = {J. L. Elman},
+  title   = {Finding Structure in Time},
+  journal = {Cognitive Science},
+  year    = {1990},
+  volume  = {14},
+  pages   = {179--211},
+}
+
+@Article{Elman93,
+  author =       "Jeffrey L. Elman",
+  title =        "Learning and development in neural networks: {The}
+                 importance of starting small",
+  journal =      "Cognition",
+  volume =       "48",
+  pages =        "71--99",
+  year =         "1993",
+  url =          "http://www3.isrl.uiuc.edu/~junwang4/langev/localcopy/pdf/elman93cognition.pdf"
+}
+
+@techreport{ElmanTR88,
+  author      = {J. L. Elman},
+  title       = {Finding Structure in Time},
+  institution = {Center for Research in Language, University of
+                 California at San Diego},
+  number      = {CRL TR 8801},
+  year        = {1988},
+}
+
+@techreport{EM-tech-rep,
+  author      = {Y. Bengio and P. Frasconi},
+  title       = {Learning Sequential Behavior: an {EM} Approach},
+  institution = {Universit\`a di Firenze},
+  year        = {1994},
+  note        = {(in preparation)},
+}
+
+@article{Engel-Mannor-Meir-2003,
+  author  = {Y. Engel and S. Mannor and R. Meir},
+  title   = {The kernel recursive least squares algorithm},
+  journal = {IEEE Trans. Sig. Proc.},
+  year    = {2004},
+  volume  = {52},
+  number  = {8},
+  pages   = {2275--2285},
+}
+
+@Article{erhan06qsar,
+  author =       "Dumitru Erhan and Pierre-Jean L'Heureux and Shi Yi Yue
+                 and Yoshua Bengio",
+  title =        "Collaborative Filtering on a Family of Biological
+                 Targets",
+  journal =      "Journal of Chemical Information and Modeling",
+  volume =       "46",
+  number =       "2",
+  pages =        "626--635",
+  year =         "2006",
+}
+
+@techreport{Erhan-09-visualization-tr,
+  author      = {Dumitru Erhan and Yoshua Bengio and Aaron Courville and Pascal Vincent},
+  title       = {Visualizing Higher-Layer Features of a Deep Network},
+  institution = {Universit\'{e} de Montr\'{e}al},
+  number      = {1341},
+  year        = {2009},
+}
+
+@inproceedings{Erhan2009-small,
+  author    = {Dumitru Erhan and Pierre-Antoine Manzagol and Yoshua Bengio and Samy Bengio and Pascal Vincent},
+  title     = {The Difficulty of Training Deep Architectures and the
+               Effect of Unsupervised Pre-Training},
+  booktitle = {Proceedings of AISTATS'2009},
+  year      = {2009},
+}
+
+@inproceedings{Erhan2009-short,
+  author    = {D. Erhan and P.-A. Manzagol and Y. Bengio and S. Bengio and P. Vincent},
+  title     = {The Difficulty of Training Deep Architectures and the
+               Effect of Unsupervised Pre-Training},
+  booktitle = {AI \& Stat.'2009},
+  year      = {2009},
+}
+
+@book{EverittB1981,
+  author    = {B. S. Everitt and D. J. Hand},
+  title     = {Finite Mixture Distributions},
+  series    = {Monographs on Statistics and Applied Probability},
+  year      = {1981},
+  publisher = {Chapman and Hall},
+  address   = {London},
+}
+
+@InProceedings{evgeniou04,
+  author =       "Theodoros Evgeniou and Massimiliano Pontil",
+  booktitle =    "KDD '04: Proceedings of the 2004 ACM SIGKDD
+                 international conference on Knowledge discovery and
+                 data mining",
+  title =        "Regularized multi-task learning",
+  publisher =    "ACM Press",
+  address =      "New York, NY, USA",
+  pages =        "109--117",
+  year =         "2004",
+  location =     "Seattle, WA, USA",
+}
+
+@article{evgeniou05,
+  author  = {Theodoros Evgeniou and Charles A. Micchelli and
+             Massimiliano Pontil},
+  title   = {Learning Multiple Tasks with Kernel Methods},
+  journal = jmlr,
+  year    = {2005},
+  month   = apr,
+  volume  = {6},
+  pages   = {615--637},
+}
+
+@inproceedings{Fahlman83,
+  author    = {S. E. Fahlman and G. E. Hinton and T. J. Sejnowski},
+  title     = {Massively parallel architectures for {AI}: {NETL},
+               Thistle, and {Boltzmann} machines},
+  booktitle = {Proceedings of the National Conference on Artificial
+               Intelligence AAAI-83},
+  year      = {1983},
+}
+
+@InProceedings{Fahlman89,
+  author =       "S. E. Fahlman",
+  editor =       "D. Touretzky and G. Hinton and T. Sejnowski",
+  booktitle =    cmss88,
+  title =        "Fast-Learning Variations on Back-Propagation: An
+                 Empirical Study",
+  publisher =    "Morgan Kaufmann, San Mateo",
+  address =      "Pittsburgh 1988",
+  pages =        "38--51",
+  year =         "1989",
+}
+
+@inproceedings{Fahlman90,
+  author    = {Scott E. Fahlman and Christian Lebiere},
+  title     = {The Cascade-Correlation Learning Architecture},
+  booktitle = NIPS2,
+  editor    = NIPS2ed,
+  year      = {1990},
+  publisher = {Morgan Kaufmann, San Mateo},
+  address   = {Denver, CO},
+  pages     = {524--532},
+}
+
+@inproceedings{Fahlman90-small,
+  author    = {S. E. Fahlman and C. Lebiere},
+  title     = {The Cascade-Correlation Learning Architecture},
+  booktitle = {NIPS 2},
+  year      = {1990},
+  publisher = {Morgan Kaufmann, San Mateo},
+  address   = {Denver, CO},
+  pages     = {524--532},
+}
+
+@article{Fama+French,
+  author  = {E. F. Fama and K. R. French},
+  title   = {Permanent and Temporary Components of Stock Prices},
+  journal = {Journal of Political Economy},
+  year    = {1988},
+  volume  = {96},
+  number  = {2},
+  pages   = {246--273},
+}
+
+@book{Fant60,
+  author    = {G. Fant},
+  title     = {Acoustic Theory of Speech Production},
+  year      = {1960},
+  publisher = {Mouton and Co.},
+}
+
+@book{Fant73,
+  author    = {G. Fant},
+  title     = {Speech Sounds and Features},
+  year      = {1973},
+  publisher = {MIT Press, Cambridge, MA},
+}
+
+@article{Farhat85,
+  author  = {N. H. Farhat and D. Psaltis and A. Prata and E. Paek},
+  title   = {Optical Implementation of the Hopfield Model},
+  journal = applopt,
+  year    = {1985},
+  volume  = {24},
+}
+
+@article{Farhat87,
+  author  = {N. H. Farhat},
+  title   = {Optoelectronic Analogs of Self-Programming Neural
+             Nets: Architectures and Methods for Implementing Fast
+             Stochastic Learning by Simulated Annealing},
+  journal = applopt,
+  year    = {1987},
+  volume  = {26},
+  pages   = {5093--5103},
+}
+
+@article{Farmer87,
+  author  = {D. Farmer and J. Sidorowich},
+  title   = {Predicting Chaotic Time Series},
+  journal = prl,
+  year    = {1987},
+  volume  = {59},
+  pages   = {845--848},
+}
+
+@incollection{Farmer88,
+  author    = {D. Farmer and J. Sidorowich},
+  title     = {Exploiting Chaos to Predict the Future and Reduce
+               Noise},
+  booktitle = {Evolution, Learning, and Cognition},
+  editor    = {W. C. Lee},
+  year      = {1988},
+  publisher = {World Scientific},
+  address   = {Singapore},
+  pages     = {277--330},
+}
+
+@inproceedings{Fei-Fei.2004,
+        author = {Fei-Fei, Li and Fergus, Rod and Perona, Pietro},
+        doi = {10.1109/CVPR.2004.109},
+        booktitle = {Computer Vision and Pattern Recognition Workshop, 2004 Conference on},
+        keywords = {categorization, computer-vision, generative-models},
+        pages = {178},
+        posted-at = {2007-08-10 12:20:22},
+        priority = {3},
+        title = {Learning Generative Visual Models from Few Training Examples: An Incremental Bayesian Approach Tested on 101 Object Categories},
+        url = {http://dx.doi.org/10.1109/CVPR.2004.109},
+        year = {2004}
+}
+
+@article{Feldman82,
+  author  = {J. A. Feldman and D. H. Ballard},
+  title   = {Connectionist Models and Their Properties},
+  journal = cogsci,
+  year    = {1982},
+  volume  = {6},
+}
+
+@Article{feldman96,
+  author =       "Jerome A. Feldman and George Lakoff and David Bailey
+                 and Srini Narayanan and Terry Regier and Andreas
+                 Stolcke",
+  title =        "{L0} - The First Five Years of an Automated Language
+                 Acquisition Project",
+  journal =      "Artificial Intelligence Review",
+  volume =       "10",
+  number =       "1--2",
+  pages =        "103--129",
+  year =         "1996",
+  URL =          "citeseer.ist.psu.edu/feldman96first.html",
+}
+
+@Book{Fellbaum1996,
+  author =       "Christiane Fellbaum",
+  title =        "{WordNet}: An Electronic Lexical Database and Some of
+                 its Applications",
+  publisher =    "MIT Press",
+  year =         "1996",
+}
+
+@Book{Fellbaum1998,
+  editor =       "Christiane Fellbaum",
+  title =        "{WordNet}: An Electronic Lexical Database",
+  publisher =    "MIT Press",
+  year =         "1998",
+  URL =          "citeseer.nj.nec.com/fellbaum98wordnet.html",
+}
+
+@book{Feller68,
+  author    = {W. Feller},
+  title     = {An Introduction to Probability Theory and Its
+               Applications},
+  year      = {1968},
+  volume    = {1},
+  publisher = {Wiley},
+  address   = {New York},
+}
+
+@inproceedings{Feng-Statlog,
+  author    = {C. Feng and A. Sutherland and R. King and S. Muggleton
+               and R. Henery},
+  title     = {Comparison of machine learning classifiers to
+               statistics and neural networks},
+  booktitle = {Proceedings of the Fourth International Workshop on
+               Artificial Intelligence and Statistics},
+  year      = {1993},
+  pages     = {41--52},
+}
+
+@article{Field-1994,
+    author = {David J. Field},
+    title = {What is the goal of sensory coding?},
+    journal = {Neural Computation},
+    volume = {6},
+    number = {4},
+    year = {1994},
+    issn = {0899-7667},
+    pages = {559--601},
+    doi = {10.1162/neco.1994.6.4.559},
+    publisher = {MIT Press},
+    address = {Cambridge, MA, USA},
+}
+
+@article{Fisher-1936,
+  author  = {Ronald  A. Fisher},
+  title   = {The use of multiple measurements in taxonomic problems},
+  journal = {Annals of Eugenics},
+  year    = {1936},
+  volume  = {7},
+  pages   = {179--188}
+}
+
+@book{Fischer90,
+  author    = {K. H. Fischer and J. A. Hertz},
+  title     = {Spin Glasses},
+  year      = {1990},
+  publisher = {Cambridge University Press},
+  address   = {Cambridge},
+}
+
+@techreport{Fix+Hodges-51,
+  author      = {E. Fix and J. L. Hodges},
+  title       = {Discriminatory analysis, non-parametric
+                 discrimination, consistency properties},
+  institution = {{USAF} School of Aviation Medicine, Randolph Field,
+                 Texas},
+  number      = {Report 21-49-004},
+  year        = {1951},
+}
+
+@Article{FixHodges51,
+  author =       "Evelyn Fix and Joseph L. Hodges Jr.",
+  title =        "Discriminatory Analysis: Nonparametric discrimination:
+                 Consistency properties",
+  journal =      "USAF School of Aviation Medicine",
+  volume =       "4",
+  pages =        "261--279",
+  year =         "1951",
+}
+
+@Article{FixHodges52,
+  author =       "Evelyn Fix and Joseph L. Hodges Jr.",
+  title =        "Discriminatory Analysis: Nonparametric discrimination:
+                 Small sample performance",
+  journal =      "USAF School of Aviation Medicine",
+  volume =       "11",
+  pages =        "280--322",
+  year =         "1952",
+}
+
+@mastersthesis{Flammia91,
+  author = {G. Flammia},
+  title  = {Speaker Independent Consonant Recognition in
+            Continuous Speech with Distinctive Phonetic Features},
+  year   = {1991},
+  school = {McGill University, School of Computer Science},
+}
+
+@Book{Flanagan72,
+  author =       "J. L. Flanagan",
+  title =        "Speech Analysis, Synthesis, and Perception",
+  publisher =    "Springer-Verlag",
+  address =      "Berlin",
+  edition =      "Second",
+  year =         "1972",
+}
+
+@book{Fletcher87,
+  author    = {Roger Fletcher},
+  title     = {Practical Methods of Optimization},
+  edition   = {Second},
+  year      = {1987},
+  publisher = {Wiley},
+  address   = {New York},
+}
+
+@incollection{FleuretF2006,
+  author    = {Francois Fleuret and Gilles Blanchard},
+  title     = {Pattern Recognition from One Example by Chopping},
+  booktitle = NIPS18,
+  editor    = NIPS18ed,
+  year      = {2006},
+  publisher = {MIT Press},
+  address   = {Cambridge, MA},
+  pages     = {371--378},
+}
+
+@InProceedings{Foldiak89,
+  author =       "P. F{\"o}ldi{\'a}k",
+  booktitle =    ijcnn,
+  title =        "Adaptive Network for Optimal Linear Feature
+                 Extraction",
+  volume =       "1",
+  publisher =    "IEEE, New York",
+  address =      "Washington 1989",
+  pages =        "401--405",
+  year =         "1989",
+}
+
+@Article{Foldiak91,
+  author =       "P. F{\"o}ldi{\'a}k",
+  title =        "Learning Invariance from Transformation Sequences",
+  journal =      "Neural Computation",
+  volume =       "3",
+  number =       "2",
+  pages =        "194--200",
+  year =         "1991",
+}
+
+@techreport{Fontaine,
+  author      = {T. Fontaine},
+  title       = {{GRAD}-{CM2}: {A} Data-parallel Connectionist Network
+                 Simulator},
+  institution = {University of Pennsylvania},
+  number      = {MS-CIS-92-55/LINC LAB 232},
+  year        = {1992},
+  month       = jul,
+  OPTnote     = {},
+}
+
+@article{Foster+George94,
+  author  = {D. Foster and E. George},
+  title   = {The risk inflation criterion for multiple regression},
+  journal = {Annals of Statistics},
+  year    = {1994},
+  volume  = {22},
+  pages   = {1947--1975},
+}
+
+@PhdThesis{Foster2002,
+  author =       "George Foster",
+  title =        "Text Prediction for Translators",
+  school =       "Dept. IRO, Universit{\'e} de Montr{\'e}al",
+  year =         "2002",
+}
+
+@incollection{Fox-2009,
+  author    = {Emily Fox and Erik Sudderth and Michael Jordan and Alan Willsky},
+  title     = {Nonparametric Bayesian Learning of Switching Linear Dynamical Systems},
+  booktitle = NIPS21,
+  editor    = NIPS21ed,
+  year      = {2009},
+  pages     = {457--464}
+}
+
+@Article{Fralick67,
+  author = 	 {Stanley C. Fralick},
+  title = 	 {Learning to Recognize Patterns without a Teacher},
+  journal = 	 {IEEE Transactions on Information Theory},
+  year = 	 1967,
+  volume =	 13,
+  pages =	 {57--64}
+}
+
+@inproceedings{Franzini87,
+  author    = {M. A. Franzini},
+  title     = {Speech Recognition with Back Propagation},
+  booktitle = {Proceedings of the Ninth Annual Conference of the IEEE
+               Engineering in Medicine and Biology Society},
+  year      = {1987},
+  publisher = {IEEE, New York},
+  address   = {Boston 1987},
+  pages     = {1702--1703},
+}
+
+@inproceedings{Franzini90,
+  author    = {M. A. Franzini and K. F. Lee and A. Waibel},
+  title     = {Connectionist {Viterbi} Training: a New Hybrid Method
+               for Continuous Speech Recognition},
+  booktitle = icassp,
+  year      = {1990},
+  address   = {Albuquerque, NM},
+  pages     = {425--428},
+}
+
+@InProceedings{Frasconi-icnn93,
+  author =       "P. Frasconi and M. Gori and A. Tesi",
+  booktitle =    icnn,
+  title =        "Backpropagation for Linearly Separable Patterns: a
+                 Detailed Analysis",
+  publisher =    "IEEE Press",
+  address =      "San Francisco, CA",
+  pages =        "1818--1822",
+  year =         "1993",
+}
+
+@inproceedings{Frasconi-ijcnn91,
+  author     = {P. Frasconi and M. Gori and M. Maggini and G. Soda},
+  title      = {A Unified Approach for Integrating Explicit Knowledge
+                and Learning by Example in Recurrent Networks},
+  booktitle  = ijcnn,
+  year       = {1991},
+  pages      = {811--816},
+  OPTaddress = {Seattle WA},
+}
+
+@article{Frasconi-ijmpC93,
+  author  = {P. Frasconi and M. Gori and G. Soda},
+  title   = {Daphne: Data Parallelism Neural Network Simulator},
+  journal = {Int. Journal of Modern Physics C},
+  year    = {1993},
+  volume  = {4},
+  number  = {1},
+  pages   = {17--28},
+  note    = {Special Issue: ``Science on the Connection Machine''},
+}
+
+@inproceedings{Frasconi-milano,
+  author    = {P. Frasconi and M. Gori and G. Soda},
+  title     = {Recurrent Networks for Continuous Speech Recognition},
+  booktitle = {Computational Intelligence 90},
+  year      = {1990},
+  publisher = {Elsevier},
+  address   = {Milano (Italy)},
+}
+
+@mastersthesis{Frasconi-msthesis,
+  author = {P. Frasconi},
+  title  = {Progetto e realizzazione di un simulatore per reti
+            neurali ricorrenti e implementazione di prototipi per
+            il riconoscimento vocale in tempo reale},
+  year   = {1990},
+  school = {Universit\`a di Firenze},
+  note   = {(in Italian)},
+}
+
+@article{Frasconi-nc92,
+  author  = {P. Frasconi and M. Gori and G. Soda},
+  title   = {Local Feedback Multi-Layered Networks},
+  journal = nc,
+  year    = {1992},
+  volume  = {4},
+  number  = {1},
+  pages   = {120--130},
+}
+
+@phdthesis{Frasconi-PhD,
+  author  = {Paolo Frasconi},
+  title   = {Reti Ricorrenti ed Elaborazione Adattiva di Sequenze},
+  year    = {1994},
+  school  = {Universit\`a di Firenze},
+  address = {Italy},
+  note    = {(in Italian)},
+}
+
+@incollection{Frasconi-pinn93,
+  author    = {P. Frasconi and M. Gori and A. Tesi},
+  title     = {Successes and Failures of Backpropagation: a
+               Theoretical Investigation},
+  booktitle = {Progress in Neural Networks},
+  editor    = {Omid Omidvar},
+  year      = {1993},
+  publisher = {Ablex Publishing},
+}
+
+@inproceedings{Frasconi-spie93,
+  author       = {Paolo Frasconi and Marco Gori},
+  title        = {Multilayered networks and the {C}-{G} uncertainty
+                  principle},
+  booktitle    = {Proc. Conf. Science of Artificial Neural Networks II},
+  editor       = {D. Ruck},
+  year         = {1993},
+  volume       = {SPIE-1966},
+  organization = {International Society for Optical Engineering (SPIE)},
+  address      = {Orlando, FL},
+}
+
+@techreport{Frasconi-TR92,
+  author      = {P. Frasconi and M. Gori and G. Soda},
+  title       = {Injecting Nondeterministic Finite State Automata into
+                 Recurrent Neural Networks},
+  institution = {Universit\`a di Firenze (Italy)},
+  number      = {DSI-RT15/92},
+  year        = {1992},
+  month       = aug,
+}
+
+@unpublished{Frasconi-unp94,
+  author    = {P. Frasconi and Y. Bengio},
+  title     = {An {EM} Approach to Grammatical Inference},
+  note      = {Submitted to the 12-th {\em International Conference
+               on Pattern Recognition}},
+  year      = {1994},
+  OPTannote = {},
+}
+
+@inproceedings{Frasconi-v91,
+  author    = {P. Frasconi and M. Gori and M. Maggini and G. Soda},
+  title     = {Learning Automata with Sigmoidal Networks},
+  booktitle = {Proc. of the 4th Italian Workshop on Parallel
+               Architectures and Neural Networks},
+  editor    = {E. Caianiello},
+  year      = {1991},
+  publisher = {World Scientific Pub},
+  address   = {Vietri (Italy)},
+  pages     = {69--77},
+}
+
+@inproceedings{Frasconi90,
+  author    = {P. Frasconi and M. Gori and G. Soda},
+  title     = {Recurrent Networks with Activation Feedback},
+  booktitle = {Proc. of the 3rd Italian Workshop on Parallel
+               Architectures and Neural Networks},
+  editor    = {E. Caianiello},
+  year      = {1990},
+  publisher = {World Scientific Pub},
+  address   = {Vietri (Italy)},
+  pages     = {329--335},
+}
+
+@inproceedings{Frasconi97,
+  author    = {P. Frasconi and M. Gori and A. Sperduti},
+  title     = {On the Efficient Classification of Data Structures by
+               Neural Networks},
+  booktitle = {Proc. Int. Joint Conf. on Artificial Intelligence},
+  year      = {1997},
+}
+
+@article{Frasconi-kde93,
+  author  = {P. Frasconi and M. Gori and M. Maggini and G. Soda},
+  title   = {Unified Integration of Explicit Rules and Learning by
+             Example in Recurrent Networks},
+  journal = ieeetrkde,
+  year    = {1993},
+  note    = {(in press)},
+}
+
+@Article{Frean90,
+  author =       "M. Frean",
+  title =        "The Upstart Algorithm: {A} Method for Constructing and
+                 Training Feedforward Neural Networks",
+  journal =      nc,
+  volume =       "2",
+  pages =        "198--209",
+  year =         "1990",
+}
+
+@techreport{Freund+Haussler-94,
+  author      = {Yoav Freund and David Haussler},
+  title       = {Unsupervised learning of distributions on binary
+                 vectors using two layer networks},
+  institution = {University of California, Santa Cruz},
+  number      = {UCSC-CRL-94-25},
+  year        = {1994},
+}
+
+@inproceedings{Freund+Haussler92,
+  author    = {Yoav Freund and David Haussler},
+  title     = {A fast and exact learning rule for a restricted class
+               of {Boltzmann} machines},
+  booktitle = NIPS4,
+  editor    = NIPS4ed,
+  year      = {1992},
+  publisher = {Morgan Kaufmann, San Mateo},
+  address   = {Denver, CO},
+  pages     = {912--919},
+}
+
+@article{Freund-Schapire-98,
+  author  = {Yoav Freund and Robert E. Schapire},
+  title   = {Adaptive Game Playing using Multiplicative Weights},
+  year    = {1998},
+  journal = {Games and Economic Behavior},
+}
+
+@inproceedings{Freund1995,
+  author    = {Yoav Freund and Robert E. Schapire},
+  title     = {A decision-theoretic generalization of on-line
+               learning and an application to boosting},
+  booktitle = {Proceedings of the Second European Conference on
+               Computational Learning Theory},
+  year      = {1995},
+  publisher = {Springer-Verlag},
+  pages     = {23--37},
+  ISBN      = {3-540-59119-2},
+}
+
+@techreport{freund94,
+  author      = {Y. Freund and D. Haussler},
+  title       = {Unsupervised learning of distributions of binary
+                 vectors using two layer networks},
+  institution = {UCSC},
+  number      = {CRL-94-25},
+  year        = {1994},
+}
+
+@unpublished{Freund97,
+  author = {Y. Freund and R. E. Schapire and P. Bartlett and W. S.
+            Lee},
+  title  = {Boosting the margin: {A} new explanation for the
+            effectiveness of voting methods},
+  note   = {Presented at the Machines that Learn Conference,
+            Snowbird, Utah},
+  year   = {1997},
+}
+
+@inproceedings{Frey96,
+  author    = {Brendan J. Frey and Geoffrey E. Hinton and Peter Dayan},
+  title     = {Does the wake-sleep algorithm learn good density estimators?},
+  booktitle = NIPS8,
+  editor    = NIPS8ed,
+  year      = {1996},
+  publisher = {MIT Press, Cambridge, MA},
+  pages     = {661--670},
+}
+
+@inproceedings{Frey-Hinton96,
+  author    = {B. J. Frey and G. E. Hinton},
+  title     = {Free Energy Coding},
+  booktitle = {Proceedings of the Data Compression Conference},
+  year      = {1997},
+  publisher = {IEEE Computer Society Press},
+  address   = {Los Alamitos, CA},
+  pages     = {},
+}
+
+@book{Frey98,
+  author    = {Brendan J. Frey},
+  title     = {Graphical models for machine learning and digital
+               communication},
+  year      = {1998},
+  publisher = {{MIT} Press},
+}
+
+@inproceedings{frey99estimating,
+  author    = {B. J. Frey and N. Jojic},
+  title     = {Estimating Mixture Models of Images and Inferring
+               Spatial Transformations Using the {EM} Algorithm},
+  booktitle = cvpr99,
+  year      = {1999},
+  pages     = {416--422},
+  URL       = {citeseer.ist.psu.edu/frey99estimating.html},
+}
+
+@InProceedings{FreyUAI00,
+  author =       "Brendan Frey and Nebojsa Jojic",
+  booktitle =    UAI00,
+  title =        "Learning Graphical Models of Images, Videos and Their
+                 Spatial Transformations",
+  publisher =    "Morgan Kaufmann",
+  address =      "San Francisco, CA",
+  pages =        "184--191",
+  year =         "2000",
+}
+
+@Article{Friedman+Fisher-99,
+  author =       "J. H. Friedman and N. I. Fisher",
+  title =        "Bump hunting in high-dimensional data",
+  journal =      "Statistics and Computing",
+  volume =       "9",
+  number =       "2",
+  pages =        "123--143",
+  year =         "1999",
+}
+
+@Article{Friedman+Hastie+Tibshirani:AdaBoost-theory,
+  author =       "J. Friedman and T. Hastie and R. Tibshirani",
+  title =        "Additive Logistic Regression: a Statistical View of
+                 Boosting",
+  journal =      "The Annals of Statistics",
+  volume =       "28",
+  number =       "2",
+  pages =        "337--407",
+  year =         "2000",
+}
+
+@Article{Friedman-2001,
+  author =       "J. Friedman",
+  title =        "Greedy function approximation: a gradient boosting
+                 machine",
+  journal =      "Annals of Statistics",
+  volume =       "29",
+  number =       "5",
+  pages =        "1189--1232",
+  year =         "2001",
+}
+
+@book{Friedman71,
+  author    = {A. Friedman},
+  title     = {Advanced Calculus},
+  publisher = {Holt, Rinehart and Winston},
+  address   = {New York, NY},
+  year      = {1971},
+}
+
+@article{Friedman+Tukey-1974,
+    author = {J. H. Friedman and J. W. Tukey},
+    title = {A Projection Pursuit Algorithm for Exploratory Data Analysis},
+    journal = {IEEE Transactions on Computers},
+    volume = {23},
+    number = {9},
+    year = {1974},
+    issn = {0018-9340},
+    pages = {881--890},
+    doi = {10.1109/T-C.1974.224051},
+    publisher = {IEEE Computer Society},
+    address = {Washington, DC, USA},
+}
+
+@Article{Friedman87,
+  author =       "J. H. Friedman",
+  title =        "Exploratory projection pursuit",
+  journal =      "Journal of the American Statistical Association",
+  volume =       "82",
+  number =       "397",
+  pages =        "249--266",
+  year =         "1987",
+}
+
+@article{Friedman91,
+  author  = {J. H. Friedman},
+  title   = {Multivariate adaptive regression splines},
+  journal = {The Annals of Statistics},
+  volume  = {19},
+  pages   = {1--141},
+  year    = {1991},
+}
+
+@techreport{friedman94flexible,
+  author      = {J. Friedman},
+  title       = {Flexible metric nearest neighbor classification},
+  institution = {Stanford University Statistics Department},
+  number      = {113},
+  year        = {1994},
+}
+
+@techreport{Friedman98,
+  author      = {J. Friedman and T. Hastie and R. Tibshirani},
+  title       = {Additive logistic regression: {A} statistical view of boosting},
+  institution = {Stanford University},
+  address     = {CA, USA},
+  year        = {1998},
+}
+
+@misc{friedman99greedy,
+  author = {J. Friedman},
+  title  = {Greedy Function Approximation: a Gradient Boosting Machine},
+  note   = {IMS 1999 Reitz Lecture, February 24, 1999, Dept. of
+            Statistics, Stanford University},
+  year   = {1999},
+}
+
+@InProceedings{Friess98,
+  author =       "T. Friess and N. Cristianini and C. Campbell",
+  booktitle =    "Proceedings of the Fifteenth International Conference
+                 on Machine Learning",
+  title =        "The Kernel-Adatron: a Fast and Simple Learning
+                 Procedure for Support Vector Machines",
+  pages =        "188--196",
+  year =         "1998",
+}
+
+@inproceedings{Fritzke94,
+  author    = {B. Fritzke},
+  title     = {Supervised learning with growing cell structures},
+  booktitle = NIPS6,
+  editor    = NIPS6ed,
+  publisher = {Morgan Kaufmann},
+  year      = {1994},
+}
+
+@InProceedings{fs-lmcpa-98,
+  author =       "Yoav Freund and Robert E. Schapire",
+  booktitle =    "Proc. 11th Annu. Conf. on Comput. Learning Theory",
+  title =        "Large margin classification using the perceptron
+                 algorithm",
+  publisher =    "ACM Press",
+  address =      "New York, NY",
+  pages =        "209--217",
+  year =         "1998",
+}
+
+@article{fs-ppr-81,
+  author  = {J. H. Friedman and W. Stuetzle},
+  title   = {Projection Pursuit Regression},
+  journal = {J. American Statistical Association},
+  volume  = {76},
+  number  = {376},
+  pages   = {817--823},
+  month   = dec,
+  year    = {1981},
+  comment = {Good description of projection pursuit},
+}
+
+@article{Fu86,
+  author  = {Y. Fu and P. W. Anderson},
+  title   = {Application of Statistical Mechanics to {NP}-Complete
+             Problems in Combinatorial Optimization},
+  journal = jpa,
+  volume  = {19},
+  pages   = {1605--1620},
+  year    = {1986},
+}
+
+@InProceedings{Fukumizu96,
+  author =       "K. Fukumizu",
+  editor =       NIPS8ed,
+  booktitle =    NIPS8,
+  title =        "Active Learning in Multilayer Perceptrons",
+  publisher =    "MIT Press",
+  address =      "Cambridge, MA",
+  year =         "1996",
+}
+
+@article{Fukumizu+Amari-2000,
+  author  = {Kenji Fukumizu and {Shun-ichi} Amari},
+  title   = {Local Minima and Plateaus in Hierarchical Structures of Multilayer Perceptrons},
+  journal = {Neural Networks},
+  volume  = {13},
+  number  = {3},
+  pages   = {317--327},
+  year    = {2000},
+}
+
+@article{Fukushima75,
+  author  = {K. Fukushima},
+  title   = {Cognitron: {A} Self-Organizing Multilayered Neural Network},
+  journal = biocyb,
+  volume  = {20},
+  pages   = {121--136},
+  year    = {1975},
+}
+
+@article{Fukushima80,
+  author  = {K. Fukushima},
+  title   = {Neocognitron: {A} Self-Organizing Neural Network Model
+             for a Mechanism of Pattern Recognition Unaffected by
+             Shift in Position},
+  journal = biocyb,
+  volume  = {36},
+  pages   = {193--202},
+  year    = {1980},
+}
+
+@article{Fukushima82,
+  author  = {K. Fukushima and S. Miyake},
+  title   = {Neocognitron: {A} new algorithm for pattern
+             recognition tolerant of deformations and shifts in
+             position},
+  journal = {Pattern Recognition},
+  volume  = {15},
+  pages   = {455--469},
+  year    = {1982},
+  key     = {Fukushima},
+}
+
+@article{Fukushima83,
+  author  = {K. Fukushima and S. Miyake and T. Ito},
+  title   = {Neocognitron: {A} Neural Network Model for a Mechanism
+             of Visual Pattern Recognition},
+  journal = ieeesmc,
+  volume  = {13},
+  year    = {1983},
+}
+
+@article{Funahashi89,
+  author  = {K. Funahashi},
+  title   = {On the approximate realization of continuous mappings
+             by neural networks},
+  journal = {Neural Networks},
+  volume  = {2},
+  pages   = {183--192},
+  year    = {1989},
+}
+
+@article{Funahashi93,
+  author  = {Ken-Ichi Funahashi and Yuichi Nakamura},
+  title   = {Approximation of Dynamical Systems by Continuous Time
+             Recurrent Neural Networks},
+  journal = nn,
+  volume  = {6},
+  pages   = {801--806},
+  year    = {1993},
+}
+
+@inproceedings{Fung-Crawford90,
+  author    = {R. M. Fung and S. L. Crawford},
+  title     = {A system for induction of probabilistic models},
+  booktitle = {Eighth National Conference on Artificial Intelligence,
+               Boston, Massachusetts, American Association for
+               Artificial Intelligence},
+  pages     = {762--779},
+  year      = {1990},
+}
+
+@techreport{Galland+Hinton89,
+  author      = {C. C. Galland and G. E. Hinton},
+  title       = {Deterministic learning in networks with asymmetric
+                 connectivity},
+  institution = {Department of Computer Science, University of Toronto},
+  address     = {Toronto, Ontario},
+  number      = {CRG-TR-89-6},
+  year        = {1989},
+}
+
+@InProceedings{Gallant86,
+  author =       "S. I. Gallant",
+  booktitle =    "Eighth International Conference on Pattern
+                 Recognition, Paris",
+  title =        "Optimal Linear Discriminants",
+  publisher =    "IEEE",
+  address =      "New York",
+  pages =        "849--852",
+  year =         "1986",
+}
+
+@Article{gallant90perceptron-based,
+  author =       "S. Gallant",
+  title =        "Perceptron-based learning algorithms",
+  journal =      "IEEE Transactions on Neural Networks",
+  volume =       "1",
+  number =       "2",
+  pages =        "179--191",
+  year =         "1990",
+  text =         "S. Gallant, Perceptron-based learning algorithms, IEEE
+                 Trans. Neural Networks 1, 179 (1990).",
+}
+
+@InProceedings{Gallinari87,
+  author =       "Patrick Gallinari and Yann {LeCun} and Sylvie Thiria and
+                 Fran{\c{c}}oise Fogelman-Souli{\'e}",
+  booktitle =    "Proceedings of COGNITIVA 87",
+  title =        "M{\'e}moires associatives distribu{\'e}es",
+  address =      "Paris, La Villette",
+  year =         "1987",
+}
+
+@InProceedings{Gallinari88,
+  author =       "P. Gallinari and S. Thiria and F. Fogelman-Souli{\'e}",
+  booktitle =    "Proc. International Conference on Neural Networks
+                 '88",
+  title =        "Multilayer perceptrons and data analysis",
+  publisher =    "IEEE",
+  pages =        "391--399",
+  year =         "1988",
+}
+
+@Article{Gao-Goodman-Miao-2001,
+  author =       "J. Gao and J. Goodman and J. Miao",
+  title =        "The Use of Clustering Techniques for Asian Language
+                 Modeling",
+  journal =      "Computational Linguistics and Chinese Language
+                 Processing",
+  volume =       "6",
+  number =       "1",
+  pages =        "27--60",
+  year =         "2001",
+}
+
+@techreport{Garcia-Perron95,
+  author      = {R. Garcia and P. Perron},
+  title       = {An analysis of the real interest rate under regime shift},
+  institution = {CIRANO},
+  address     = {Montreal, Quebec, Canada},
+  number      = {95s-5},
+  year        = {1995},
+}
+
+@Article{Garcia-Perron96,
+  author =       "R. Garcia and P. Perron",
+  title =        "An analysis of the real interest rate under regime
+                 shift",
+  journal =      "The Review of Economics and Statistics",
+  volume =       "78",
+  number =       "1",
+  pages =        "111--125",
+  year =         "1996",
+}
+
+@techreport{Garcia-Schaller95,
+  author      = {R. Garcia and H. Schaller},
+  title       = {Are the effects of monetary policy asymmetric},
+  institution = {CIRANO},
+  address     = {Montreal, Quebec, Canada},
+  number      = {95s-6},
+  year        = {1995},
+}
+
+@techreport{Garcia95,
+  author      = {R. Garcia},
+  title       = {Asymptotic null distribution of the likelihood ratio
+                 test in Markov switching models},
+  institution = {CIRANO},
+  address     = {Montreal, Quebec, Canada},
+  number      = {95s-7},
+  year        = {1995},
+}
+
+@TechReport{Garcia98,
+  author =       "R. Garcia and R. Gen{\c{c}}ay",
+  title =        "Pricing and Hedging Derivative Securities with Neural
+                 Networks and a Homogeneity Hint",
+  number =       "98s-35",
+  institution =  "CIRANO",
+  address =      "Montr{\'e}al, Qu{\'e}bec, Canada",
+  year =         "1998",
+}
+
+@article{Gardner87,
+  author  = {E. Gardner},
+  title   = {Maximum Storage Capacity in Neural Networks},
+  journal = eul,
+  volume  = {4},
+  pages   = {481--485},
+  year    = {1987},
+}
+
+@article{Gardner88a,
+  author  = {E. Gardner},
+  title   = {The Space of Interactions in Neural Network Models},
+  journal = jpa,
+  volume  = {21},
+  pages   = {257--270},
+  year    = {1988},
+}
+
+@article{Gardner88b,
+  author  = {E. Gardner and B. Derrida},
+  title   = {Optimal Storage Properties of Neural Network Models},
+  journal = jpa,
+  volume  = {21},
+  pages   = {271--284},
+  year    = {1988},
+}
+
+@article{Gardner89a,
+  author  = {E. Gardner and B. Derrida},
+  title   = {Three Unfinished Works on the Optimal Storage Capacity
+             of Networks},
+  journal = jpa,
+  volume  = {22},
+  pages   = {1983--1994},
+  year    = {1989},
+}
+
+@article{Gardner89b,
+  author  = {E. Gardner and H. Gutfreund and I. Yekutieli},
+  title   = {The Phase Space of Interactions in Neural Networks
+             with Definite Symmetry},
+  journal = jpa,
+  volume  = {22},
+  pages   = {1995--2008},
+  year    = {1989},
+}
+
+@book{Garey79,
+  author    = {M. R. Garey and D. S. Johnson},
+  title     = {Computers and Intractability: {A} Guide to the Theory
+               of {NP}-Completeness},
+  publisher = {Freeman},
+  address   = {New York},
+  year      = {1979},
+}
+
+@incollection{GarriguesP2008,
+  author    = {Pierre Garrigues and Bruno Olshausen},
+  title     = {Learning Horizontal Connections in a Sparse Coding
+               Model of Natural Images},
+  booktitle = NIPS20,
+  editor    = NIPS20ed,
+  publisher = {MIT Press},
+  address   = {Cambridge, MA},
+  pages     = {505--512},
+  year      = {2008},
+}
+
+@incollection{GarriguesP2008-small,
+  author    = {Pierre Garrigues and Bruno Olshausen},
+  title     = {Learning Horizontal Connections in a Sparse Coding
+               Model of Natural Images},
+  booktitle = {NIPS'20},
+  year      = {2008},
+}
+
+@article{Gartner03,
+  author  = {T. G{\"a}rtner},
+  title   = {A survey of kernels for structured data},
+  journal = {ACM SIGKDD Explorations Newsletter},
+  volume  = {5},
+  number  = {1},
+  pages   = {49--58},
+  year    = {2003},
+}
+
+@InProceedings{Gauvain:2003:icassp,
+  author =       "Jean-Luc Gauvain and L. Lamel and Holger Schwenk and
+                 G. Adda and L. Chen and F. Lef{\`e}vre",
+  booktitle =    icassp,
+  title =        "Conversational Telephone Speech Recognition",
+  volume =       "1",
+  pages =        "212--215",
+  year =         "2003",
+}
+
+@inproceedings{Gaynier93,
+  author    = {R. J. Gaynier and T. Downs},
+  title     = {A Method of Training Multi-layer Networks with
+               Heaviside Characteristics Using Internal
+               Representations},
+  booktitle = {IEEE International Conference on Neural Networks},
+  address   = {San Francisco, CA},
+  pages     = {1812--1817},
+  year      = {1993},
+}
+
+@InProceedings{GehlerP2006,
+  author =       "Peter V. Gehler and Alex D. Holub and Max Welling",
+  booktitle =    ICML06,
+  editor =       ICML06ed,
+  publisher =    ICML06publ,
+  title =        "The rate adapting poisson model for information
+                 retrieval and object recognition",
+  address =      "New York, NY, USA",
+  pages =        "337--344",
+  year =         "2006",
+  ISBN =         "1-59593-383-2",
+  doi =          "10.1145/1143844.1143887",
+  location =     "Pittsburgh, Pennsylvania",
+}
+
+@Article{Geman84,
+  author =       {Geman, Stuart and Geman, Donald},
+  title =        "Stochastic Relaxation, Gibbs Distributions, and the
+                 {Bayesian} Restoration of Images",
+  doi =          {10.1109/TPAMI.1984.4767596},
+  journal =      ieeetpami,
+  volume =       "6",
+  number =       "6",
+  keywords =     {annealing, mrf, simulated},
+  month =        nov,
+  pages =        {721--741},
+  url =          {http://dx.doi.org/10.1109/TPAMI.1984.4767596},
+  year =         "1984",
+}
+
+@article{Geman92,
+  author  = {S. Geman and E. Bienenstock and R. Doursat},
+  title   = {Neural Networks and the Bias/Variance Dilemma},
+  journal = nc,
+  volume  = {4},
+  number  = {1},
+  pages   = {1--58},
+  year    = {1992},
+}
+
+@Article{Genest-Zideck-86,
+  author =       "C. Genest and J. V. Zidek",
+  title =        "Combining probability distributions: {A} critique and
+                 an annotated bibliography",
+  journal =      "Statistical Science",
+  volume =       "1",
+  pages =        "114--148",
+  year =         "1986",
+}
+
+@article{Geng+al-2005,
+    author    = {Xin Geng and De-Chuan Zhan and Zhi-Hua Zhou},
+    title     = {Supervised nonlinear dimensionality reduction for visualization and classification},
+    journal   = {IEEE Transactions on Systems, Man, and Cybernetics, Part B},
+    volume    = {35},
+    number    = {6},
+    year      = {2005},
+    pages     = {1098--1107},
+    ee        = {http://dx.doi.org/10.1109/TSMCB.2005.850151},
+    bibsource = {DBLP, http://dblp.uni-trier.de}
+}
+
+@Article{Geszti87,
+  author =       "T. Geszti and F. P{\'a}zm{\'a}ndi",
+  title =        "Learning Within Bounds and Dream Sleep",
+  journal =      jpa,
+  volume =       "20",
+  pages =        "L1299--L1303",
+  year =         "1987",
+}
+
+@book{Geszti90,
+  author    = {T. Geszti},
+  title     = {Physical Models of Neural Networks},
+  publisher = {World Scientific},
+  address   = {Singapore},
+  year      = {1990},
+}
+
+@Article{Geweke1989,
+  author =       "J. Geweke",
+  title =        "{Bayesian} inference in econometric models using
+                 {Monte Carlo} integration",
+  journal =      "Econometrica",
+  volume =       "57",
+  number =       "6",
+  pages =        "1317--1339",
+  year =         "1989",
+}
+
+@incollection{Gha94,
+  author    = {Z. Ghahramani},
+  title     = {Solving inverse problems using an {EM} approach to
+               density estimation},
+  booktitle = {Proceedings of the 1993 Connectionist Models Summer
+               School},
+  publisher = {Erlbaum},
+  address   = {Hillsdale, NJ},
+  year      = {1994},
+}
+
+@InProceedings{ghabea00,
+  author =       "Z. Ghahramani and M. J. Beal",
+  editor =       NIPS12ed,
+  booktitle =    NIPS12,
+  title =        "Variational inference for {Bayesian} mixtures of
+                 factor analysers",
+  publisher =    "MIT Press",
+  address =      "Cambridge, MA",
+  year =         "2000",
+  URL =          "http://citeseer.nj.nec.com/article/ghahramani00variational.html",
+}
+
+@TechReport{ghahramani96em,
+  author =       "Z. Ghahramani and G. E. Hinton",
+  title =        "The {EM} Algorithm for Mixtures of Factor Analyzers",
+  number =       "CRG-TR-96-1",
+  institution =  "Dpt. of Comp. Sci., Univ. of Toronto",
+  month =        jan,
+  year =         "1996",
+  URL =          "http://citeseer.nj.nec.com/ghahramani97em.html",
+}
+
+@techreport{GhaJor93,
+  author      = {Z. Ghahramani and M. I. Jordan},
+  title       = {Function approximation via density estimation},
+  institution = {MIT},
+  address     = {Cambridge, MA},
+  type        = {Computational Cognitive Science},
+  number      = {TR 9304},
+  year        = {1993},
+}
+
+@InProceedings{Gherrity89,
+  author =       "M. Gherrity",
+  booktitle =    ijcnn,
+  title =        "A Learning Algorithm for Analog, Fully Recurrent
+                 Neural Networks",
+  publisher =    "IEEE Press",
+  address =      "Washington D.C.",
+  pages =        "643--644",
+  month =        jun,
+  year =         "1989",
+}
+
+@article{Ghosh+Hwang-1989,
+  author    = {J. Ghosh and K. Hwang},
+  title     = {Mapping Neural Networks onto Message-Passing
+               Multicomputers},
+  journal   = {Journal of Parallel and Distributed Computing},
+  volume    = {6},
+  number    = {2},
+  pages     = {291--330},
+  publisher = {Academic Press},
+  year      = {1989},
+}
+
+@Article{Ghosn2003,
+  author =       "J. Ghosn and Y. Bengio",
+  title =        "Bias Learning, Knowledge Sharing",
+  journal =      "{IEEE} Transactions on Neural Networks",
+  volume =       "14",
+  number =       "4",
+  pages =        "748--765",
+  month =        jul,
+  year =         "2003",
+}
+
+@TechReport{Ghysel93,
+  author =       "E. Ghysels",
+  title =        "A time series model with periodic stochastic regime
+                 switching",
+  number =       "C.R.D.E. Discussion paper 1093",
+  institution =  "C.R.D.E., Universit{\'e} de Montr{\'e}al",
+  address =      "Montreal, Quebec, Canada",
+  year =         "1993",
+}
+
+@book{Giarratano+Riley-2004,
+    author = {Giarratano, Joseph C. and Riley, Gary D.},
+    howpublished = {Hardcover},
+    isbn = {0534384471},
+    month = oct,
+    posted-at = {2008-05-19 22:17:30},
+    priority = {2},
+    publisher = {{Course Technology}},
+    edition = {Fourth},
+    title = {Expert Systems: Principles and Programming},
+    url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0534384471},
+    year = {2004}
+}
+
+
+@Article{Giles86,
+  author =       "Y. C. Lee and G. Doolen and H. H. Chen and G. Z. Sun
+                 and T. Maxwell and H. Y. Lee and C. L. Giles",
+  title =        "Machine Learning Using a Higher Order Correlation
+                 Network",
+  journal =      "Physica D",
+  volume =       "22",
+  number =       "1--3",
+  pages =        "276--306",
+  year =         "1986",
+}
+
+@article{giles:1987,
+    author = {C. Lee Giles and Tom Maxwell},
+    journal = {Applied Optics},
+    number = {23},
+    pages = {4972--4978},
+    publisher = {OSA},
+    title = {Learning, Invariance, and Generalization in High-Order Neural Networks},
+    volume = {26},
+    year = {1987},
+    url = {http://ao.osa.org/abstract.cfm?URI=ao-26-23-4972},
+}
+
+@inproceedings{Giles90,
+  author    = {C. L. Giles and G. Z. Sun and H. H. Chen and Y. C. Lee
+               and D. Chen},
+  title     = {Higher Order Recurrent Networks \& Grammatical
+               Inference},
+  booktitle = NIPS2,
+  editor    = NIPS2ed,
+  publisher = {Morgan Kaufmann Publishers},
+  address   = {San Mateo, CA},
+  pages     = {380--387},
+  year      = {1990},
+}
+
+@InProceedings{Giles-nnsp92,
+  author =       "C. L. Giles and C. W. Omlin",
+  editor =       "Kung and Fallside and Sorenson and Kamm",
+  booktitle =    "Neural Networks for Signal Processing II, Proceedings
+                 of the 1992 IEEE workshop",
+  title =        "Inserting Rules into Recurrent Neural Networks",
+  publisher =    "IEEE Press",
+  pages =        "13--22",
+  year =         "1992",
+}
+
+@Article{Giles94,
+  author =       "C. L. Giles and C. W. Omlin",
+  title =        "Extraction, Insertion and Refinement of Symbolic Rules
+                 in Dynamically-Driven Recurrent Neural Networks",
+  journal =      "Connection Science",
+  year =         "1994",
+}
+
+@article{Giles-nc92,
+  author  = {C. L. Giles and C. B. Miller and D. Chen and G. Z. Sun
+             and H. H. Chen and Y. C. Lee},
+  title   = {Learning and Extracting Finite State Automata with
+             Second-Order Recurrent Neural Networks},
+  journal = nc,
+  volume  = {4},
+  number  = {3},
+  pages   = {393--405},
+  year    = {1992},
+}
+
+@book{Gill81,
+  author    = {P. E. Gill and W. Murray and M. H. Wright},
+  title     = {Practical Optimization},
+  publisher = {Academic Press},
+  year      = {1981},
+}
+
+@InProceedings{Gillman+Sipser94,
+  author =       "David Gillman and Michael Sipser",
+  booktitle =    colt94,
+  title =        "Inference and minimization of hidden {Markov} chains",
+  publisher =    "ACM",
+  pages =        "147--158",
+  year =         "1994",
+}
+
+@Book{Gilmore-74,
+  author =       "R. Gilmore",
+  title =        "{Lie} groups, {Lie} algebras and some of their
+                 applications",
+  publisher =    "Wiley",
+  address =      "New York",
+  year =         "1974",
+}
+
+@InProceedings{Gingras-Bengio-Nadeau-2000,
+  author =       "F. Gingras and Y. Bengio and C. Nadeau",
+  booktitle =    "Computational Finance 2000",
+  title =        "On Out-of-Sample Statistics for Time-Series",
+  location =     "London, U.K.",
+  year =         "2000",
+}
+
+@InProceedings{chapados+bengio-2000,
+  author =       "N. Chapados and Y. Bengio",
+  booktitle =    "Computational Finance 2000",
+  title =        "{VaR}-based Asset Allocation using Neural Networks",
+  year =         "2000",
+}
+
+@InProceedings{Pigeon+Bengio-99,
+  author =       "S. Pigeon and Y. Bengio",
+  booktitle =    "Proceedings of the Data Compression Conference, DCC'1999",
+  title =        "Binary Pseudowavelets and Application to Bilevel Image Processing",
+  year =         "1999",
+}
+
+@InProceedings{Girard+Paugam-Moisy-1994,
+  author =       "D. Girard and H{\'e}l{\`e}ne Paugam-Moisy",
+  booktitle =    "Proceedings of the {IFIP} {WG10.3} Working Conference
+                 on Applications in Parallel and Distributed Computing",
+  title =        "Strategies of Weight Updating for Parallel
+                 Back-propagation",
+  publisher =    "North-Holland Publishing Co.",
+  address =      "Amsterdam, The Netherlands",
+  pages =        "335--336",
+  year =         "1994",
+  ISBN =         "0-444-81870-7",
+}
+
+@inproceedings{Girju+al-2003,
+  author    = {Roxana Girju and Adriana Badulescu and Dan Moldovan},
+  title     = {Learning semantic constraints for the automatic
+               discovery of part-whole relations},
+  booktitle = {NAACL '03: Proceedings of the 2003 Conference of the
+               North American Chapter of the Association for
+               Computational Linguistics on Human Language
+               Technology},
+  publisher = {Association for Computational Linguistics},
+  address   = {Morristown, NJ, USA},
+  location  = {Edmonton, Canada},
+  pages     = {1--8},
+  year      = {2003},
+}
+
+@Article{Girolami-2001,
+  author =       "M. Girolami",
+  title =        "Orthogonal series density estimation and the kernel
+                 eigenvalue problem",
+  journal =      "Neural Computation",
+  volume =       "14",
+  number =       "3",
+  pages =        "669--688",
+  year =         "2002",
+}
+
+@TechReport{girosi97an,
+  author =       "F. Girosi",
+  title =        "An equivalence between sparse approximation and
+                 Support Vector Machines",
+  type =         "A.I. Memo",
+  number =       "1606",
+  institution =  "MIT Artificial Intelligence Laboratory",
+  year =         "1997",
+  URL =          "http://www.ai.mit.edu/people/girosi/svm.html",
+  text =         "F. Girosi. An equivalence between sparse approximation
+                 and Support Vector Machines. A.I. Memo 1606, MIT
+                 Artificial Intelligence Laboratory, 1997. (available at
+                 the URL:
+                 http://www.ai.mit.edu/people/girosi/svm.html).",
+}
+
+@article{Glauber63,
+  author  = {R. J. Glauber},
+  title   = {Time-Dependent Statistics of the Ising Model},
+  journal = jmp,
+  volume  = {4},
+  pages   = {294--307},
+  year    = {1963},
+}
+
+@book{GLM-book-89,
+  author    = {P. McCullagh and J. Nelder},
+  title     = {Generalized Linear Models},
+  publisher = {Chapman and Hall},
+  address   = {London},
+  year      = {1989},
+}
+
+@incollection{GlobersonA2006,
+  author    = {Amir Globerson and Sam Roweis},
+  title     = {Metric Learning by Collapsing Classes},
+  booktitle = NIPS18,
+  editor    = NIPS18ed,
+  publisher = {MIT Press},
+  address   = {Cambridge, MA},
+  pages     = {451--458},
+  year      = {2006},
+}
+
+@Book{Gluck90,
+  author =       "M. A. Gluck and D. E. Rumelhart",
+  title =        "Neuroscience and connectionist theory",
+  publisher =    "Lawrence Erlbaum",
+  address =      "London",
+  year =         "1990",
+}
+
+@article{Godin89,
+  author  = {C. Godin and P. Lockwood},
+  title   = {{DTW} Schemes for Continuous Speech Recognition: {A}
+             Unified view},
+  journal = cspla,
+  volume  = {3},
+  pages   = {169--198},
+  year    = {1989},
+}
+
+@book{Gold+Morgan-1999,
+    author = {Gold, Ben and Morgan, Nelson},
+    howpublished = {Hardcover},
+    isbn = {0471351547},
+    month = jul,
+    publisher = {Wiley},
+    title = {Speech and Audio Signal Processing: Processing and Perception of Speech and Music},
+    year = {1999}
+}
+
+@Book{Goldberg89,
+  author =       "D. E. Goldberg",
+  title =        "Genetic Algorithms in Search, Optimization, and
+                 Machine Learning",
+  publisher =    "Addison-Wesley",
+  address =      "Reading, MA",
+  year =         "1989",
+}
+
+@article{Goldfeld73,
+  author  = {S. M. Goldfeld and R. M. Quandt},
+  title   = {A Markov model for switching regressions},
+  journal = {Journal of Econometrics},
+  volume  = {1},
+  pages   = {3--16},
+  year    = {1973},
+}
+
+@TechReport{Goldhor85,
+  author =       "R. S. Goldhor",
+  title =        "Representation of consonants in the peripheral
+                 auditory system: {A} modeling study of the
+                 correspondence between response properties and phonetic
+                 features",
+  number =       "505",
+  institution =  "RLE.",
+  address =      "Cambridge, MA",
+  publisher =    "MIT Press, Cambridge, MA",
+  year =         "1985",
+}
+
+@article{Golomb90,
+  author  = {D. Golomb and N. Rubin and H. Sompolinsky},
+  title   = {Willshaw Model: Associative Memory with Sparse Coding
+             and Low Firing Rates},
+  journal = prA,
+  volume  = {41},
+  pages   = {1843--1854},
+  year    = {1990},
+}
+
+@Book{Golub+VanLoan-1996,
+  author =       "Golub, Gene H. and Van Loan, Charles F.",
+  title =        "Matrix Computations",
+  howpublished = "Paperback",
+  publisher =    "{The Johns Hopkins University Press}",
+  edition =      "Third",
+  month =        oct,
+  year =         "1996",
+  ISBN =         "0-8018-5414-8",
+}
+
+@techreport{Goodman-LM-2001,
+  author      = {Joshua Goodman},
+  title       = {A Bit of Progress in Language Modeling},
+  institution = {Microsoft Research},
+  address     = {Redmond, Washington},
+  number      = {MSR-TR-2001-72},
+  year        = {2001},
+}
+
+@inproceedings{Goodman2001,
+  author    = {J. Goodman},
+  title     = {Classes for Fast Maximum Entropy Training},
+  booktitle = icassp,
+  address   = {Utah},
+  year      = {2001},
+}
+
+@InProceedings{Gori-ijcnn89,
+  author =       "M. Gori and Y. Bengio and De Mori, R.",
+  booktitle =    ijcnn,
+  title =        "{BPS}: {A} Learning Algorithm for Capturing the
+                 Dynamical Nature of Speech",
+  publisher =    "IEEE, New York",
+  address =      "Washington D.C.",
+  volume =       "2",
+  pages =        "417--423",
+  year =         "1989",
+}
+
+% Same paper as the GoriNimes entry; both keys are kept so existing citations resolve.
+@InProceedings{Gori-nimes89,
+  author =       "M. Gori",
+  booktitle =    "Proceedings of Neuro-Nimes",
+  title =        "An Extension of {BPS}",
+  address =      "Nimes (France)",
+  pages =        "83--93",
+  month =        nov,
+  year =         "1989",
+}
+
+@article{Gori-pami91,
+  author  = {M. Gori and A. Tesi},
+  title   = {On the problem of local minima in Backpropagation},
+  journal = ieeetpami,
+  volume  = {PAMI-14},
+  number  = {1},
+  pages   = {76--86},
+  year    = {1992},
+}
+
+@techreport{Gori-tr94,
+  author      = {M. Gori and M. Maggini and G. Soda},
+  title       = {Insertion of Finite State Automata into Recurrent
+                 Radial Basis Function Networks},
+  institution = {Universit\`a di Firenze (Italy)},
+  number      = {DSI-17/93},
+  note        = {(submitted)},
+  year        = {1993},
+  OPTannote   = {},
+}
+
+@inproceedings{GoriNimes,
+  author    = {M. Gori},
+  title     = {An Extension of {BPS}},
+  booktitle = {Proceedings of Neuro-Nimes},
+  address   = {Nimes (France)},
+  pages     = {83--93},
+  month     = nov,
+  year      = {1989},
+}
+
+@article{Gorman88a,
+  author  = {R. P. Gorman and T. J. Sejnowski},
+  title   = {Analysis of Hidden Units in a Layered Network Trained
+             to Classify Sonar Targets},
+  journal = nn,
+  volume  = {1},
+  pages   = {75--89},
+  year    = {1988},
+}
+
+@article{Gorman88b,
+  author  = {R. P. Gorman and T. J. Sejnowski},
+  title   = {Learned Classification of Sonar Targets Using a
+             Massively-Parallel Network},
+  journal = ieeetassp,
+  volume  = {36},
+  pages   = {1135--1140},
+  year    = {1988},
+}
+
+@unpublished{Gorse94,
+  author = {D. Gorse and J. G. Taylor and T. G. Clarkson},
+  title  = {A pulse-based reinforcement algorithm for learning
+            continuous functions},
+  note   = {Submitted to WCNN '94 San Diego},
+  year   = {1994},
+}
+
+@article{Goudreau-trnn93,
+  author  = {M. W. Goudreau and C. L. Giles and S. T. Chakradhar
+             and D. Chen},
+  title   = {First-order vs. second-order single layer recurrent
+             neural networks},
+  journal = ieeetrnn,
+  note    = {(in press)},
+  year    = {1993},
+}
+
+@article{Goudreau93tb,
+  author  = {M. W. Goudreau and C. L. Giles and S. T. Chakradhar
+             and D. Chen},
+  title   = {First-Order Vs. Second-Order Single Layer Recurrent
+             Neural Networks},
+  journal = {IEEE Transactions on Neural Networks},
+  year    = {1993},
+}
+
+@InProceedings{Gould+al:NIPS09,
+  author =       {S. Gould and T. Gao and D. Koller},
+  title =        {Region-based Segmentation and Object Detection},
+  booktitle =    {Advances in Neural Information Processing Systems (NIPS 2009)},
+  year =         {2009},
+}
+
+@article{goutte97,
+  author = {C. Goutte},
+  title = {Note on free lunches and cross-validation},
+  journal = {Neural Computation},
+  volume = {9},
+  number = {6},
+  pages = {1053--1059},
+  year = {1997},
+}
+
+@article{Gower-68,
+  author = {J. C. Gower},
+  title = {Adding a point to vector diagrams in multivariate
+    analysis},
+  journal = {Biometrika},
+  volume = {55},
+  number = {3},
+  pages = {582--585},
+  year = {1968},
+}
+
+@inproceedings{Graepel2000,
+  author = {Thore Graepel and Ralf Herbrich and John
+    Shawe-Taylor},
+  title = {Generalization error bounds for sparse linear
+    classifiers},
+  booktitle = {Thirteenth Annual Conference on Computational Learning
+    Theory, 2000},
+  publisher = {Morgan Kaufmann},
+  year = {2000},
+  note = {in press},
+}
+
+@inproceedings{Graepel99,
+  author = {T. Graepel and R. Herbrich and P. Bollmann-Sdorra and
+    K. Obermayer},
+  title = {Classification on Pairwise Proximity Data},
+  editor = NIPS12ed,
+  booktitle = NIPS12,
+  year = {1999},
+}
+
+@inproceedings{graf-90a,
+  author = {H. P. Graf and D. Henderson},
+  title = {A Reconfigurable {CMOS} Neural Network},
+  booktitle = {ISSCC Digest},
+  organization = {ISSCC},
+  year = {1990},
+}
+
+@inproceedings{Graf86,
+  author = {H. P. Graf and L. D. Jackel and R. E. Howard and B.
+    Straughn and J. S. Denker and W. Hubbard and D. M.
+    Tennant and D. Schwartz},
+  title = {{VLSI} Implementation of a Neural Network Memory with
+    Several Hundreds of Neurons},
+  editor = {J. S. Denker},
+  booktitle = snowbird,
+  pages = {182--187},
+  publisher = {American Institute of Physics, New York},
+  address = {Snowbird 1986},
+  year = {1986},
+}
+
+@inproceedings{Graf88,
+  author = {D. H. Graf and W. R. LaLonde},
+  title = {A Neural Controller for Collision-Free Movement of
+    General Robot Manipulators},
+  booktitle = icnn,
+  volume = {1},
+  pages = {77--84},
+  publisher = {IEEE, New York},
+  address = {San Diego 1988},
+  year = {1988},
+}
+
+@inproceedings{Graf92,
+  author = {H. P. Graf and C. R. Nohl and J. Ben},
+  title = {Image segmentation with networks of variable scales},
+  editor = NIPS4ed,
+  booktitle = NIPS4,
+  pages = {480--487},
+  publisher = {Morgan Kaufmann},
+  address = {San Mateo CA},
+  year = {1992},
+}
+
+@InProceedings{Grandvalet98a,
+  author =       "Y. Grandvalet",
+  editor =       "L. Niklasson and M. Boden and T. Ziemke",
+  booktitle =    "ICANN'98",
+  title =        "Least absolute shrinkage is equivalent to quadratic
+                 penalization",
+  volume =       "1",
+  publisher =    "Springer",
+  pages =        "201--206",
+  year =         "1998",
+  series =       "Perspectives in Neural Computing",
+}
+
+@inproceedings{Grandvalet98a-short,
+  author = {Y. Grandvalet},
+  title = {Least absolute shrinkage is equivalent to quadratic
+    penalization},
+  booktitle = {ICANN'98},
+  pages = {201--206},
+  year = {1998},
+}
+
+@inproceedings{GrandvaletY2005,
+  author = {Yves Grandvalet and Yoshua Bengio},
+  title = {{Semi-supervised Learning by Entropy
+    Minimization}},
+  editor = NIPS17ed,
+  booktitle = NIPS17,
+  publisher = {MIT Press},
+  address = {Cambridge, MA},
+  month = dec,
+  year = {2005},
+}
+% Deprecated: the year must be the date of publication, not the date of the conference. Use GrandvaletY2005 instead.
+@inproceedings{GrandvaletY2004,
+  author = {Yves Grandvalet and Yoshua Bengio},
+  title = {{Semi-supervised Learning by Entropy
+    Minimization}},
+  editor = NIPS17ed,
+  booktitle = NIPS17,
+  publisher = {MIT Press},
+  address = {Cambridge, MA},
+  month = dec,
+  year = {2005},
+}
+
+@incollection{GrandvaletY2006,
+  author =       {Grandvalet, Yves and Bengio, Yoshua},
+  title =        {Entropy Regularization},
+  editor =       {Chapelle, Olivier and {Sch\"{o}lkopf}, Bernhard and Zien, Alexander},
+  booktitle =    {Semi-Supervised Learning},
+  pages =        {151--168},
+  publisher =    {{MIT} Press},
+  year =         {2006},
+}
+
+@article{GrangerNewbold76,
+  author = {C. W. J. Granger and P. Newbold},
+  title = {Forecasting transformed series},
+  journal = {J. Roy. Statist. Soc. B},
+  volume = {38},
+  pages = {189--203},
+  year = {1976},
+}
+
+@InProceedings{Gray-Moore-2003,
+  author =       "Alexander Gray and Andrew Moore",
+  booktitle =    "Artificial Intelligence and Statistics",
+  title =        "Rapid Evaluation of Multiple Density Models",
+  year =         "2003",
+}
+
+@article{Gray84,
+  author = {R. M. Gray},
+  title = {Vector Quantization},
+  journal = ieeeassp,
+  pages = {4--29},
+  month = apr,
+  year = {1984},
+}
+
+@Article{Greenwood+Durand60,
+  author =       "J. A. Greenwood and D. Durand",
+  title =        "Aids for Fitting the Gamma Distribution by Maximum Likelihood",
+  journal =      "Technometrics",
+  volume =       "2",
+  pages =        "55--65",
+  year =         "1960",
+}
+
+@InProceedings{GregoryD2007,
+  author =       "Gregory Druck and Chris Pal and Andrew McCallum and
+                 Xiaojin Zhu",
+  booktitle =    "KDD '07: Proceedings of the 13th ACM SIGKDD
+                 international conference on Knowledge discovery and
+                 data mining",
+  title =        "Semi-supervised classification with hybrid
+                 generative/discriminative methods",
+  publisher =    "ACM",
+  address =      "New York, NY, USA",
+  pages =        "280--289",
+  year =         "2007",
+  OPTciteulike-article-id = "2304687",
+  OPTdoi =       "10.1145/1281192.1281225",
+  OPTisbn =      "9781595936097",
+  OPTkeywords =  "classification",
+  OPTpriority =  "2",
+}
+  %url =       "http://portal.acm.org/citation.cfm?id=1281192.1281225",
+
+@Article{Gribskov87,
+  author =       "M. Gribskov and A. D. McLachlan and D. Eisenberg",
+  title =        "Profile analysis: detection of distantly related
+                 proteins",
+  journal =      PNAS,
+  volume =       "84",
+  pages =        "4355--4358",
+  year =         "1987",
+}
+
+@TechReport{Griffin-Holub-Perona-07,
+  author =       "Gregory Griffin and Alex Holub and Pietro Perona",
+  title =        "Caltech-256 Object Category Dataset",
+  number =       "7694",
+  institution =  "California Institute of Technology",
+  year =         "2007",
+}
+
+@article{grigoriev95,
+  author = {Dima Grigoriev and Marek Karpinski and Andrew Chi-Chih
+    Yao},
+  title = {An Exponential Lower Bound on the Size of Algebraic
+    Decision Trees for {MAX}},
+  journal = {Electronic Colloquium on Computational Complexity
+    (ECCC)},
+  volume = {2},
+  number = {057},
+  year = {1995},
+}
+
+@article{Grimes-Rao-2005,
+  author = {D. B. Grimes and R. P. N. Rao},
+  title = {Bilinear Sparse Coding for Invariant Vision},
+  journal = {Neural Computation},
+  volume = {17},
+  number = {1},
+  pages = {47--73},
+  year = {2005},
+}
+
+@article{Grossberg67,
+  author = {S. Grossberg},
+  title = {Nonlinear Difference-Differential Equations in
+    Prediction and Learning Theory},
+  journal = PNAS,
+  volume = {58},
+  pages = {1329--1334},
+  year = {1967},
+}
+
+@article{Grossberg68a,
+  author = {S. Grossberg},
+  title = {Some Nonlinear Networks Capable of Learning a Spatial
+    Pattern of Arbitrary Complexity},
+  journal = PNAS,
+  volume = {59},
+  pages = {368--372},
+  year = {1968},
+}
+
+@article{Grossberg68b,
+  author = {S. Grossberg},
+  title = {Some Physiological and Biochemical Consequences of
+    Psychological Postulates},
+  journal = PNAS,
+  volume = {60},
+  pages = {758--765},
+  year = {1968},
+}
+
+@article{Grossberg69,
+  author = {S. Grossberg},
+  title = {Embedding Fields: {A} Theory of Learning with
+    Physiological Implications},
+  journal = jmpsych,
+  volume = {6},
+  pages = {209--239},
+  year = {1969},
+}
+
+@article{Grossberg72,
+  author = {S. Grossberg},
+  title = {Neural Expectation: Cerebellar and Retinal Analogs of
+    Cells Fired by Learnable or Unlearned Pattern Classes},
+  journal = kyb,
+  volume = {10},
+  pages = {49--57},
+  year = {1972},
+}
+
+@article{Grossberg76a,
+  author = {S. Grossberg},
+  title = {Adaptive Pattern Classification and Universal
+    Recoding: {I}. Parallel Development and Coding of
+    Neural Feature Detectors},
+  journal = biocyb,
+  volume = {23},
+  year = {1976},
+}
+
+@article{Grossberg76b,
+  author = {S. Grossberg},
+  title = {Adaptive Pattern Classification and Universal
+    Recoding: {II}. Feedback, Expectation, Olfaction,
+    Illusions},
+  journal = biocyb,
+  volume = {23},
+  pages = {187--202},
+  year = {1976},
+}
+
+@article{Grossberg80,
+  author = {S. Grossberg},
+  title = {How Does the Brain Build a Cognitive Code?},
+  journal = psyrev,
+  volume = {87},
+  year = {1980},
+}
+
+@book{Grossberg87a,
+  author = {S. Grossberg},
+  title = {The Adaptive Brain},
+  volume = {1--2},
+  publisher = {Elsevier},
+  address = {Amsterdam},
+  year = {1987},
+}
+
+@article{Grossberg87b,
+  author = {S. Grossberg},
+  title = {Competitive Learning: From Interactive Activation to
+    Adaptive Resonance},
+  journal = cogsci,
+  volume = {11},
+  pages = {23--63},
+  year = {1987},
+}
+
+@InProceedings{Grosse-2007,
+  author =       {Roger Grosse and Rajat Raina and Helen Kwong and Andrew Y. Ng},
+  title =        {Shift-Invariant Sparse Coding for Audio Classification},
+  booktitle =    UAI07,
+  year =         {2007},
+}
+
+@InProceedings{Grossman-nips89,
+  author =       "T. Grossman and R. Meir and E. Domany",
+  editor =       NIPS1ed,
+  booktitle =    NIPS1,
+  title =        "Learning by choice of internal representation",
+  publisher =    "Morgan Kaufmann",
+  address =      "San Mateo, CA",
+  pages =        "73--80",
+  year =         "1989",
+}
+
+@article{Grossman89,
+  author = {T. Grossman and R. Meir and E. Domany},
+  title = {Learning by Choice of Internal Representations},
+  journal = cs,
+  volume = {2},
+  pages = {555--575},
+  year = {1989},
+}
+
+@inproceedings{Grossman90,
+  author = {T. Grossman},
+  title = {The {CHIR} Algorithm for Feed Forward Networks with
+    Binary Weights},
+  editor = NIPS2ed,
+  booktitle = NIPS2,
+  pages = {516--523},
+  publisher = {Morgan Kaufmann, San Mateo},
+  address = {Denver, CO},
+  year = {1990},
+}
+
+@article{Guillery2005,
+  author = {R. W. Guillery},
+  title = {Is postnatal neocortical maturation hierarchical?},
+  journal = {Trends in Neuroscience},
+  volume = {28},
+  number = {10},
+  pages = {512--517},
+  month = oct,
+  year = {2005},
+}
+
+@incollection{Gull88,
+  author = {S. F. Gull},
+  title = {{Bayesian} inductive inference and maximum entropy},
+  editor = {G. Erickson and C. Smith},
+  booktitle = {Maximum Entropy and {Bayesian} Methods in Science and
+    Engineering},
+  volume = {1},
+  pages = {53--74},
+  publisher = {Kluwer},
+  address = {Dordrecht},
+  year = {1988},
+}
+
+@article{gullapalli:nn:1990,
+  author = {V. Gullapalli},
+  title = {A Stochastic Reinforcement Learning Algorithm for
+    Learning Real-Valued Functions},
+  journal = nn,
+  volume = {3},
+  pages = {671--692},
+  year = {1990},
+}
+
+@Article{Gunn+Kandola01,
+  author =       "S. R. Gunn and J. Kandola",
+  title =        "Structural Modelling with Sparse Kernels",
+  journal =      "Machine Learning",
+  volume =       "48",
+  pages =        "137--163",
+  year =         "2002",
+  note =         "Special issue on New Methods for Model Combination and Model Selection",
+}
+
+@InProceedings{Guo+Schuurmans-2007,
+  author =       {Guo, Y. and Schuurmans, D.},
+  title =        {Convex relaxations of latent variable training},
+  editor =       NIPS20ed,
+  booktitle =    NIPS20,
+  year =         {2007},
+}
+
+@InProceedings{guoschuurmans07b,
+  author =       {Guo, Y. and Schuurmans, D.},
+  title =        {Discriminative batch mode active learning},
+  editor =       NIPS20ed,
+  booktitle =    NIPS20,
+  year =         {2007},
+}
+
+@InProceedings{Guo+Schuurmans-2008,
+  author =       {Guo, Y. and Schuurmans, D.},
+  title =        {Efficient global optimization for exponential family {PCA} and
+                 low-rank matrix factorization},
+  booktitle =    {Proceedings of the Forty-sixth Annual Allerton Conference on
+                 Communication, Control, and Computing (Allerton)},
+  year =         {2008},
+}
+
+@article{Gutfreund88a,
+  author = {H. Gutfreund},
+  title = {Neural Networks with Hierarchically Correlated
+    Patterns},
+  journal = prA,
+  volume = {37},
+  pages = {570--577},
+  year = {1988},
+}
+
+@Article{Gutfreund88b,
+  author =       "H. Gutfreund and M. M{\'e}zard",
+  title =        "Processing of Temporal Sequences in Neural Networks",
+  journal =      prl,
+  volume =       "61",
+  pages =        "235--238",
+  year =         "1988",
+}
+
+@inproceedings{Gutzmann87,
+  author = {K. Gutzmann},
+  title = {Combinatorial Optimization Using a Continuous State
+    {Boltzmann} Machine},
+  editor = {M. Caudill and C. Butler},
+  booktitle = icnn,
+  volume = {3},
+  pages = {721--734},
+  publisher = {IEEE, New York},
+  address = {San Diego 1987},
+  year = {1987},
+}
+
+@Article{guyon-91,
+  author =       "I. Guyon and P. Albrecht and Y. {Le Cun} and J. S.
+                 Denker and W. Hubbard",
+  title =        "Design of a Neural Network Character Recognizer for a
+                 Touch Terminal",
+  journal =      "Pattern Recognition",
+  volume =       "24",
+  number =       "2",
+  pages =        "105--119",
+  year =         "1991",
+}
+
+@inproceedings{Guyon92,
+  author = {I. Guyon and V. Vapnik and B. Boser and L. Bottou and
+    S. A. Solla},
+  title = {Structural Risk Minimization for Character
+    Recognition},
+  editor = NIPS4ed,
+  booktitle = NIPS4,
+  pages = {471--479},
+  publisher = {Morgan Kaufmann},
+  address = {San Mateo CA},
+  year = {1992},
+}
+
+@incollection{Guyon92b,
+  author = {I. Guyon},
+  title = {Writer independent and writer adaptive neural network
+    for on-line character recognition},
+  editor = {S. Impedovo},
+  booktitle = {From Pixels to Features III},
+  pages = {493--506},
+  publisher = {Elsevier},
+  address = {Amsterdam},
+  year = {1992},
+}
+
+@inproceedings{Guyon93,
+  author = {I. Guyon and B. Boser and V. Vapnik},
+  title = {Automatic Capacity Tuning of Very Large {VC}-dimension
+    Classifiers},
+  editor = NIPS5ed,
+  booktitle = NIPS5,
+  pages = {147--155},
+  publisher = {Morgan Kaufmann},
+  address = {Denver, CO},
+  year = {1993},
+}
+
+@inproceedings{Guyon95,
+  author = {I. Guyon and F. Pereira},
+  title = {Design of a linguistic postprocessor using variable
+    memory length {Markov} models},
+  booktitle = ICDAR95,
+  pages = {454--457},
+  publisher = {IEEE Computer Society Press},
+  address = {Montreal, Canada},
+  month = aug,
+  year = {1995},
+}
+
+@incollection{Guyon96,
+  author = {I. Guyon and M. Schenkel and J. Denker},
+  title = {Overview and synthesis of on-line cursive handwriting
+    recognition techniques},
+  editor = {P. S. P. Wang and H. Bunke},
+  booktitle = {Handbook on Optical Character Recognition and Document
+    Image Analysis},
+  publisher = {World Scientific},
+  year = {1996},
+}
+
+@Article{Guyon+Elisseeff-2003,
+  author =       {Guyon, Isabelle   and Elisseeff, Andre},
+  title =        {An introduction to variable and feature selection},
+  journal =      jmlr,
+  volume =       {3},
+  pages =        {1157--1182},
+  year =         {2003},
+  publisher =    {MIT Press},
+  address =      {Cambridge, MA},
+  issn =         {1533-7928},
+}
+    %url = {http://portal.acm.org/citation.cfm?id=944968},
+
+@book{Guyon+al-2006,
+        editor = "Isabelle Guyon and Steve Gunn and Masoud Nikravesh and Lotfi Zadeh",
+        title =    "Feature Extraction, Foundations and Applications",
+        publisher =    "Springer",
+        year =         "2006",
+}
+
+
+@article{Gyorgyi90a,
+  author = {G. Gy{\"o}rgyi},
+  title = {Inference of a Rule by a Neural Network with Thermal
+    Noise},
+  journal = prl,
+  volume = {64},
+  pages = {2957--2960},
+  year = {1990},
+}
+
+@InCollection{Gyorgyi90b,
+  author =       "G. Gy{\"o}rgyi and N. Tishby",
+  editor =       "W. K. Theumann and R. Koeberle",
+  booktitle =    "Neural Networks and Spin Glasses",
+  title =        "Statistical Theory of Learning a Rule",
+  publisher =    "World Scientific",
+  address =      "Singapore",
+  year =         "1990",
+}
+
+@inproceedings{ha93,
+  author = {J. Y. Ha and S. C. Oh and J. H. Kim and Y. B. Kwon},
+  title = {Unconstrained handwritten word recognition with
+    interconnected hidden {Markov} models},
+  booktitle = {Third International Workshop on Frontiers in
+    Handwriting Recognition},
+  pages = {455--460},
+  publisher = {IAPR},
+  address = {Buffalo},
+  month = may,
+  year = {1993},
+}
+
+@InProceedings{haasdonk2002tdk,
+  author =       "B. Haasdonk and D. Keysers",
+  title =        "Tangent Distance Kernels for Support Vector
+                 Machines",
+  booktitle =    "Proc. of the 16th ICPR",
+  volume =       "2",
+  pages =        "864--868",
+  year =         "2002",
+}
+
+@InProceedings{hadsell-chopra-lecun-06,
+  author =       {Hadsell, Raia and Chopra, Sumit and {LeCun}, Yann},
+  title =        {Dimensionality Reduction by Learning an Invariant Mapping},
+  booktitle =    cvpr06,
+  pages =        {1735--1742},
+  publisher =    {IEEE Press},
+  year =         {2006},
+  original =     {orig/hadsell-chopra-lecun-06.pdf},
+}
+
+@InProceedings{hadsell-chopra-lecun-06-small,
+  author =       {Hadsell, Raia and Chopra, Sumit and {LeCun}, Yann},
+  title =        {Dimensionality Reduction by Learning an Invariant Mapping},
+  booktitle =    {CVPR'2006},
+  publisher =    {IEEE Press},
+  year =         {2006},
+  original =     {orig/hadsell-chopra-lecun-06.pdf},
+}
+
+@InProceedings{hadsell-iros-08,
+  author =       {Hadsell, Raia and Erkan, Ayse and Sermanet, Pierre and Scoffier, Marco and Muller, Urs and {LeCun}, Yann},
+  title =        {Deep Belief Net Learning in a Long-Range Vision System for Autonomous Off-Road Driving},
+  booktitle =    {Proc. Intelligent Robots and Systems (IROS'08)},
+  pages =        {628--633},
+  year =         {2008},
+  original =     {orig/hadsell-iros-08.pdf},
+}
+ %url = "http://www.cs.nyu.edu/~raia/docs/iros08-farod.pdf",
+
+@TechReport{Haffner+96,
+  author =       "P. Haffner and L. Bottou and J. Bromley and C. J. C.
+                 Burges and T. Cauble and Y. {Le Cun} and C. Nohl and C.
+                 Stanton and C. Stenard and P. Vincent",
+  title =        "The {HCAR50} Check Amount Reading System",
+  note =         "Forthcoming publication",
+  institution =  "Lucent Technologies, Bell Labs Innovation",
+  address =      "Holmdel, New-Jersey",
+  year =         "1996",
+}
+
+@inproceedings{Haffner89,
+  author = {P. Haffner and A. Waibel and K. Shikano},
+  title = {Fast back-propagation learning methods for large
+    phonemic neural networks},
+  booktitle = {Proceedings of Eurospeech'89},
+  year = {1989},
+}
+
+@inproceedings{Haffner91,
+  author = {P. Haffner and M. Franzini and A. Waibel},
+  title = {Integrating Time Alignment and Neural Networks for
+    High Performance Continuous Speech Recognition},
+  booktitle = icassp,
+  address = {Toronto},
+  pages = {105--108},
+  year = {1991},
+}
+
+@Book{HAJ90,
+  author =       "X. D. Huang and Y. Ariki and M. Jack",
+  title =        "Hidden {Markov} Models for Speech Recognition",
+  publisher =    "Edinburgh University Press",
+  address =      "Edinburgh",
+  year =         "1990",
+}
+
+@InProceedings{HagiwaraK2000,
+  author =       {Hagiwara, Katsuyuki and Kuno, Kazuhiro},
+  title =        {Regularization Learning and Early Stopping in Linear Networks},
+  booktitle =    ijcnn,
+  pages =        {4511},
+  publisher =    {IEEE Computer Society},
+  address =      {Washington, DC, USA},
+  year =         {2000},
+  isbn =         {0-7695-0619-4},
+}
+
+@techreport{Ham2003,
+  author = {J. Ham and D. D. Lee and S. Mika and B.
+    Sch{\"o}lkopf},
+  title = {A kernel view of the dimensionality reduction of
+    manifolds},
+  institution = {Max Planck Institute for Biological Cybernetics},
+  address = {Germany},
+  number = {TR-110},
+  year = {2003},
+}
+
+@article{Hamilton88,
+  author = {J. D. Hamilton},
+  title = {Rational-Expectations Econometric Analysis of Changes
+    in Regime},
+  journal = {Journal of Economic Dynamics and Control},
+  volume = {12},
+  pages = {385--423},
+  year = {1988},
+}
+
+@article{hamilton89,
+  author = {J. D. Hamilton},
+  title = {A new approach to the economic analysis of
+    non-stationary time series and the business cycle},
+  journal = {Econometrica},
+  volume = {57},
+  number = {2},
+  pages = {357--384},
+  month = mar,
+  year = {1989},
+}
+
+@article{Hamilton90,
+  author = {J. D. Hamilton},
+  title = {Analysis of time series subject to changes in regime},
+  journal = {Journal of Econometrics},
+  volume = {45},
+  pages = {39--70},
+  year = {1990},
+}
+
+@incollection{Hamilton93,
+  author = {J. D. Hamilton},
+  title = {State-Space Models},
+  editor = {R. Engle and D. {McFadden}},
+  booktitle = {Handbook of Econometrics},
+  publisher = {North Holland, New York},
+  year = {1993},
+}
+
+@Article{Hamilton94,
+  author =       "J. D. Hamilton and R. Susmel",
+  title =        "Autoregressive conditional heteroskedasticity and
+                 changes in regime",
+  journal =      "Journal of Econometrics",
+  volume =       "64",
+  number =       "1--2",
+  pages =        "307--333",
+  year =         "1994",
+}
+
+@Article{Hamilton96,
+  author =       "J. D. Hamilton",
+  title =        "Specification testing in {Markov}-switching time-series
+                 models",
+  journal =      "Journal of Econometrics",
+  volume =       "70",
+  pages =        "127--157",
+  year =         "1996",
+}
+
+@misc{Hammersley+Clifford-1971,
+ author = {John M. Hammersley and Peter Clifford},
+ year = 1971,
+ title = {{Markov} fields on finite graphs and lattices},
+ howpublished = {Unpublished manuscript}
+}
+
+@inproceedings{HammondSimoncelli07,
+  author = {David K. Hammond and Eero P. Simoncelli},
+  title = {A Machine Learning Framework for Adaptive Combination
+    of Signal Denoising Methods},
+  booktitle = ICIP07,
+  volume = {6},
+  pages = {29--32},
+  year = {2007},
+}
+
+@Article{hampshire90,
+  author =       "John B. Hampshire and Alexander H. Waibel",
+  title =        "A Novel Objective Function for Improved Phoneme
+                 Recognition Using Time-Delay Neural Networks",
+  journal =      "IEEE Transactions on Neural Networks",
+  volume =       "1",
+  number =       "2",
+  pages =        "216--228",
+  month =        jun,
+  year =         "1990",
+}
+
+@inproceedings{HAMPSHIRE92A,
+  author = {J. B. Hampshire and B. V. K. Vijaya Kumar},
+  title = {Shooting Craps in Search of an Optimal Strategy for
+    Training Connectionist Pattern Classifiers},
+  editor = NIPS4ed,
+  booktitle = NIPS4,
+  pages = {1125--1132},
+  publisher = {Morgan Kaufmann},
+  address = {Denver, CO},
+  year = {1992},
+}
+
+@inproceedings{Han96,
+  author = {H-H. Han and H-C. Jung and Y-R. Lee and S-C. Jeong},
+  title = {Application of Neural Network for {PWR} Steam
+    Generator Water Level Control at Low Power Operation},
+  booktitle = nipc-hmit96,
+  volume = {1},
+  pages = {49--52},
+  publisher = ans,
+  year = {1996},
+}
+
+@inproceedings{Hanson89,
+  author = {S. J. Hanson and L. Pratt},
+  title = {A Comparison of Different Biases for Minimal Network
+    Construction with Back-Propagation},
+  editor = NIPS1ed,
+  booktitle = NIPS1,
+  pages = {177--185},
+  publisher = {Morgan Kaufmann, San Mateo},
+  address = {Denver, CO},
+  year = {1989},
+}
+
+@Book{Hardle2004,
+  author =       "Wolfgang H{\"a}rdle and Marlene M{\"u}ller and Stefan Sperlich and Axel
+                 Werwatz",
+  title =        "Nonparametric and Semiparametric Models",
+  publisher =    "Springer",
+  url =          "http://www.xplore-stat.de/ebooks/ebooks.html",
+  year =         "2004",
+}
+
+@article{Hardoon+al-2004,
+    address = {Cambridge, MA, USA},
+    author = {Hardoon, David R. and Szedmak, Sandor and Shawe-Taylor, John},
+    doi = {10.1162/0899766042321814},
+    issn = {0899-7667},
+    journal = {Neural Computation},
+    month = dec,
+    number = {12},
+    pages = {2639--2664},
+    publisher = {MIT Press},
+    title = {Canonical Correlation Analysis: An Overview with Application to Learning Methods},
+    url = {http://portal.acm.org/citation.cfm?id=1119696.1119703},
+    volume = {16},
+    year = {2004}
+}
+
+@InProceedings{HardoonD2007,
+  author =       "David R. Hardoon and John Shawe-Taylor and Antti
+                 Ajanki and Kai Puolam{\"a}ki and Samuel Kaski",
+  booktitle =    "Proceedings of AISTATS 2007",
+  title =        "Information Retrieval by Inferring Implicit Queries
+                 from Eye Movements",
+  year =         "2007",
+}
+
+@InProceedings{Harmeling02,
+  author =       "S. Harmeling and A. Ziehe and M. Kawanabe and K.-R.
+                 M{\"u}ller",
+  editor =       NIPS14ed,
+  booktitle =    NIPS14,
+  title =        "Kernel Feature Spaces and Nonlinear Blind Source
+                 Separation",
+  publisher =    "MIT Press",
+  address =      "Cambridge, MA",
+  year =         "2002",
+  original =     "orig/AA34.ps",
+}
+
+@inproceedings{Harp90,
+  author = {S. A. Harp and T. Samad and A. Guha},
+  title = {Designing Application-Specific Neural Networks Using
+    the Genetic Algorithm},
+  editor = NIPS2ed,
+  booktitle = NIPS2,
+  pages = {447--454},
+  publisher = {Morgan Kaufmann, San Mateo},
+  address = {Denver, CO},
+  year = {1990},
+}
+
+@article{Hartman90,
+  author = {E. J. Hartman and J. D. Keeler and J. M. Kowalski},
+  title = {Layered Neural Networks with {G}aussian Hidden Units As
+    Universal Approximations},
+  journal = nc,
+  volume = {2},
+  pages = {210--215},
+  year = {1990},
+}
+
+@Article{Haruno01,
+  author =       "M. Haruno and D. M. Wolpert and M. Kawato",
+  title =        "{MOSAIC} model for sensorimotor learning and control",
+  journal =      "Neural Computation",
+  volume =       "13",
+  number =       "10",
+  pages =        "2201--2220",
+  year =         "2001",
+}
+
+@inproceedings{Hassibi-nips93,
+  author = {B. Hassibi and D. G. Stork},
+  title = {Second Order Derivatives for Network Pruning: Optimal
+    Brain Surgeon},
+  editor = NIPS5ed,
+  booktitle = NIPS5,
+  pages = {164--171},
+  publisher = {Morgan Kaufmann},
+  address = {San Mateo, CA},
+  year = {1993},
+}
+
+@inproceedings{Hastad86,
+  author = {Johan H{\aa}stad},
+  title = {Almost optimal lower bounds for small depth circuits},
+  booktitle = {Proceedings of the 18th annual ACM Symposium on Theory
+    of Computing},
+  pages = {6--20},
+  publisher = {ACM Press},
+  address = {Berkeley, California},
+  year = {1986},
+}
+
+@book{Hastad87,
+  author = {Johan T. H{\aa}stad},
+  title = {Computational Limitations for Small Depth Circuits},
+  publisher = {{MIT} Press},
+  year = {1987},
+}
+
+@article{Hastad91,
+  author = {Johan H{\aa}stad and Mikael Goldmann},
+  title = {On the power of small-depth threshold circuits},
+  journal = {Computational Complexity},
+  volume = {1},
+  pages = {113--129},
+  year = {1991},
+}
+
+@article{Hastie-Stuetzle-1989,
+  author = {T. Hastie and W. Stuetzle},
+  title = {Principal Curves},
+  journal = {Journal of the American Statistical Association},
+  volume = {84},
+  pages = {502--516},
+  year = {1989},
+}
+
+@book{Hastie2001,
+  author = {T. Hastie and R. Tibshirani and J. Friedman},
+  title = {The elements of statistical learning: data mining,
+    inference and prediction},
+  series = {Springer Series in Statistics},
+  publisher = {Springer Verlag},
+  year = {2001},
+  annote = {ISBN: 0387952845},
+}
+
+@article{Hastie2004,
+  author = {Trevor Hastie and Saharon Rosset and Robert Tibshirani
+    and Ji Zhu},
+  title = {The entire regularization path for the support vector
+    machine},
+  journal = jmlr,
+  volume = {5},
+  pages = {1391--1415},
+  year = {2004},
+}
+
+@inproceedings{hastie96discriminant,
+  author = {T. Hastie and R. Tibshirani},
+  title = {Discriminant Adaptive Nearest Neighbor Classification
+    and Regression},
+  editor = NIPS8ed,
+  booktitle = NIPS8,
+  volume = {8},
+  pages = {409--415},
+  publisher = {{MIT} Press},
+  year = {1996},
+  URL = {citeseer.nj.nec.com/hastie94discriminant.html},
+}
+
+@article{Hathaway85,
+  author = {R. J. Hathaway},
+  title = {A constrained formulation of Maximum-Likelihood
+    estimation for normal mixture distributions},
+  journal = {The Annals of Statistics},
+  volume = {13},
+  number = {2},
+  year = {1985},
+}
+
+@article{hausser:2003,
+    author = {Michael H{\"a}usser and Bartlett Mel},
+    title = {Dendrites: Bug or Feature?},
+    journal = {Current Opinion in Neurobiology},
+    volume = {13},
+    year = {2003},
+    pages = {372--383},
+}
+
+@inproceedings{Haussler89,
+  author    = {Haussler, D.},
+  title     = {Generalizing the {PAC} model: sample size bounds from metric dimension-based uniform convergence results},
+  booktitle = {Proc. of the 30th Annual Symposium on the Foundations of Computer Science},
+  publisher = {IEEE},
+  year      = {1989},
+}
+
+@inproceedings{haussler95,
+  author    = {Haussler, D. and Kivinen, J. and Warmuth, M. K.},
+  title     = {Sequential prediction of individual sequences under general loss functions},
+  booktitle = {Computational Learning Theory, 2nd European Conference, EuroCOLT'95},
+  pages     = {69--83},
+  publisher = {Springer},
+  year      = {1995},
+}
+
+@book{hay01nnn,
+    author = {Haykin, Simon},
+    title = {Neural Networks: A Comprehensive Foundation},
+    edition = {Second},
+    publisher = {Prentice Hall},
+    month = jul,
+    year = {1998},
+    isbn = {0132733501},
+    howpublished = {Hardcover},
+    keywords = {network, neural},
+    posted-at = {2009-07-04 21:37:33},
+    priority = {2},
+    url = {http://www.amazon.com/exec/obidos/redirect?tag=citeulike07-20&path=ASIN/0132733501}
+}
+
+
+@techreport{He+Niyogi-2002,
+  author      = {He, X. and Niyogi, P.},
+  title       = {Locality Preserving Projections ({LPP})},
+  institution = {University of Chicago, Computer Science},
+  number      = {TR-2002-09},
+  year        = {2002},
+}
+
+@incollection{He+Niyogi-2004,
+  author    = {He, Xiaofei and Niyogi, Partha},
+  title     = {Locality Preserving Projections},
+  booktitle = NIPS16,
+  editor    = NIPS16ed,
+  publisher = {MIT Press},
+  address   = {Cambridge, MA},
+  year      = {2004},
+}
+
+@book{Hebb49,
+  author    = {Hebb, D. O.},
+  title     = {The Organization of Behavior},
+  publisher = {Wiley},
+  address   = {New York},
+  year      = {1949},
+}
+
+@inproceedings{Hecht-Nielsen87a,
+  author    = {Hecht-Nielsen, R.},
+  title     = {Combinatorial Hypercompression},
+  booktitle = icnn,
+  editor    = {Caudill, M. and Butler, C.},
+  volume    = {2},
+  pages     = {455--461},
+  publisher = {IEEE, New York},
+  address   = {San Diego 1987},
+  year      = {1987},
+}
+
+@article{Hecht-Nielsen87b,
+  author  = {Hecht-Nielsen, R.},
+  title   = {Counterpropagation Networks},
+  journal = applopt,
+  volume  = {26},
+  pages   = {4979--4984},
+  year    = {1987},
+}
+
+@article{Hecht-Nielsen88,
+  author  = {Hecht-Nielsen, R.},
+  title   = {Applications of Counterpropagation Networks},
+  journal = nn,
+  volume  = {1},
+  pages   = {131--139},
+  year    = {1988},
+}
+
+@inproceedings{Hecht-Nielsen89,
+  author    = {Hecht-Nielsen, R.},
+  title     = {Theory of the Backpropagation Neural Network},
+  booktitle = ijcnn,
+  volume    = {1},
+  pages     = {593--605},
+  publisher = {IEEE, New York},
+  address   = {Washington 1989},
+  year      = {1989},
+}
+
+@article{Hecht-Nielsen-1995,
+  author  = {Hecht-Nielsen, R.},
+  title   = {Replicator neural networks for universal optimal source coding},
+  journal = {Science},
+  volume  = {269},
+  pages   = {1860--1863},
+  year    = {1995},
+}
+
+@techreport{Heckerman96,
+  author      = {Heckerman, D.},
+  title       = {A tutorial on learning with {Bayesian} networks},
+  institution = {Microsoft Research},
+  number      = {TR-95-06},
+  month       = jan,
+  year        = {1996},
+  url         = {ftp://ftp.research.microsoft.com/pub/Tech-Reports/Winter94-95/TR-95-06.PS},
+}
+
+@article{HeckermanD2000,
+  author  = {Heckerman, David and Chickering, David Maxwell and Meek, Christopher and Rounthwaite, Robert and Kadie, Carl},
+  title   = {Dependency networks for inference, collaborative filtering, and data visualization},
+  journal = jmlr,
+  volume  = {1},
+  pages   = {49--75},
+  year    = {2000},
+}
+
+@article{heeger:1992a,
+  author  = {Heeger, David J.},
+  title   = {Normalization of Cell Responses in Cat Striate Cortex},
+  journal = {Visual Neuroscience},
+  volume  = {9},
+  number  = {2},
+  pages   = {181--198},
+  year    = {1992},
+}
+
+@inproceedings{Hegde88,
+  author    = {Hegde, S. U. and Sweet, J. L. and Levy, W. B.},
+  title     = {Determination of Parameters in a Hopfield/Tank Computational Network},
+  booktitle = icnn,
+  volume    = {2},
+  pages     = {291--298},
+  publisher = {IEEE, New York},
+  address   = {San Diego 1988},
+  year      = {1988},
+}
+
+@article{HedgeJ2000,
+	author = {Hegd{\'e}, Jay and {Van Essen}, David C.},
+	title = {Selectivity for complex shapes in primate visual area V2},
+	journal = {Journal of Neuroscience},
+	volume = {20},
+	number = {5},
+	pages = {RC61},
+	month = mar,
+	year = {2000},
+	address = {Department of Anatomy and Neurobiology, Washington University School of Medicine, St. Louis, Missouri 63110, USA.},
+	issn = {1529-2401},
+	keywords = {contour, v2},
+	citeulike-article-id = {465720},
+	posted-at = {2006-01-15 12:57:15},
+	priority = {0}
+}
+	%url = {http://view.ncbi.nlm.nih.gov/pubmed/10684908},
+
+@inproceedings{Heitz+al:NIPS08a,
+  author    = {Heitz, G. and Gould, S. and Saxena, A. and Koller, D.},
+  title     = {Cascaded Classification Models: {C}ombining Models for Holistic Scene Understanding},
+  booktitle = {Advances in Neural Information Processing Systems (NIPS 2008)},
+  year      = {2008},
+}
+
+@inproceedings{HeldM1998,
+  author    = {Held, Marcus and Buhmann, Joachim M.},
+  title     = {Unsupervised on-line learning of decision trees for hierarchical data analysis},
+  booktitle = NIPS10,
+  editor    = NIPS10ed,
+  pages     = {514--520},
+  publisher = {MIT Press},
+  address   = {Cambridge, MA, USA},
+  year      = {1998},
+  isbn      = {0-262-10076-2},
+  location  = {Denver, Colorado, United States},
+}
+
+@inproceedings{herlocker99,
+  author    = {Herlocker, Jonathan L. and Konstan, Joseph A. and Borchers, Al and Riedl, John},
+  title     = {An algorithmic framework for performing collaborative filtering},
+  booktitle = {SIGIR '99: Proceedings of the 22nd annual international ACM SIGIR conference on Research and development in information retrieval},
+  pages     = {230--237},
+  publisher = {ACM Press},
+  address   = {New York, NY, USA},
+  year      = {1999},
+  location  = {Berkeley, California, United States},
+}
+
+@inproceedings{Hermansky-genova91,
+  author    = {Hermansky, Hynek and Morgan, Nelson and Bayya, Aruna and Kohn, Phil},
+  title     = {Compensation for the Effect of the Communication Channel in Auditory-like Analysis of Speech ({RASTA}-{PLP})},
+  booktitle = {Proc. of Eurospeech 91},
+  address   = {Genova (Italy)},
+  pages     = {1367--1371},
+  year      = {1991},
+}
+
+@techreport{Hermansky-icsi91,
+  author      = {Hermansky, Hynek and Morgan, Nelson and Bayya, Aruna and Kohn, Phil},
+  title       = {{RASTA}-{PLP} Speech Analysis},
+  institution = {International Computer Science Institute},
+  address     = {Berkeley, CA},
+  number      = {TR-91-069},
+  month       = dec,
+  year        = {1991},
+  OPTnote     = {Most speech parameter estimation techniques are easily influenced
+                 by the frequency response of the communication channel. We have
+                 developed a technique that is more robust to such steady-state
+                 spectral factors in speech. The approach is conceptually simple
+                 and computationally efficient. The new method is described, and
+                 experimental results are reported, showing a significant
+                 advantage for the proposed method.},
+}
+
+@article{Hermansky-jasa90,
+  author  = {Hermansky, Hynek},
+  title   = {Perceptual Linear Predictive ({PLP}) Analysis of Speech},
+  journal = jasa,
+  volume  = {87},
+  number  = {4},
+  pages   = {1738--1752},
+  year    = {1990},
+}
+
+@book{Hernandez-Lerma+Lasserre-2003,
+  author    = {Hern{\'a}ndez-Lerma, On{\'e}simo and Lasserre, Jean Bernard},
+  title     = {Markov Chains and Invariant Probabilities},
+  publisher = {Birkh{\"a}user Verlag},
+  year      = {2003},
+}
+
+@inproceedings{Hertz86,
+  author    = {Hertz, J. A. and Grinstein, G. and Solla, S.},
+  title     = {Memory Networks with Asymmetric Bonds},
+  booktitle = snowbird,
+  editor    = {Denker, J. S.},
+  pages     = {212--218},
+  publisher = {American Institute of Physics, New York},
+  address   = {Snowbird 1986},
+  year      = {1986},
+}
+
+@inproceedings{Hertz87,
+  author    = {Hertz, J. A. and Grinstein, G. and Solla, S.},
+  title     = {Irreversible Spin Glasses and Neural Networks},
+  booktitle = {Heidelberg Colloquium on Glassy Dynamics},
+  editor    = {van Hemmen, J. L. and Morgenstern, I.},
+  pages     = {538--546},
+  publisher = {Springer-Verlag, Berlin},
+  address   = {Heidelberg 1986},
+  year      = {1987},
+}
+
+@article{Hertz89a,
+  author  = {Hertz, J. A.},
+  title   = {A Gauge Theory in Computational Vision: {A} Model for Outline Extraction},
+  journal = pscrip,
+  volume  = {39},
+  pages   = {161--167},
+  year    = {1989},
+}
+
+@article{Hertz89b,
+  author  = {Hertz, J. A. and Krogh, A. and Thorbergsson, G. I.},
+  title   = {Phase Transitions in Simple Learning},
+  journal = jpa,
+  volume  = {22},
+  pages   = {2133--2150},
+  year    = {1989},
+}
+
+@techreport{Hertz90,
+  author      = {Hertz, J. A.},
+  title       = {Statistical Dynamics of Learning},
+  institution = {Nordita},
+  address     = {Copenhagen, Denmark},
+  type        = {Preprint},
+  number      = {90/34 S},
+  year        = {1990},
+}
+
+@article{Herz89,
+  author  = {Herz, A. and Sulzer, B. and K{\"u}hn, R. and van Hemmen, J. L.},
+  title   = {Hebbian Learning Reconsidered: Representation of Static and Dynamic Objects in Associative Neural Nets},
+  journal = biocyb,
+  volume  = {60},
+  pages   = {457--467},
+  year    = {1989},
+}
+
+@article{Heskes-98,
+  author  = {Heskes, T.},
+  title   = {Bias/variance decompositions for likelihood-based estimators},
+  journal = {Neural Computation},
+  volume  = {10},
+  pages   = {1425--1433},
+  year    = {1998},
+}
+
+@article{heskes00,
+  author  = {Heskes, Tom},
+  title   = {On Natural Learning and Pruning in Multilayered Perceptrons},
+  journal = {Neural Computation},
+  volume  = {12},
+  number  = {4},
+  pages   = {881--901},
+  year    = {2000},
+}
+
+@inproceedings{heskes98,
+  author    = {Heskes, Tom},
+  title     = {Solving a huge number of similar tasks: a combination of multi-task learning and a hierarchical {Bayesian} approach},
+  booktitle = {International Conference On Machine Learning},
+  year      = {1998},
+}
+
+@article{Hestenes+Stiefel-1952,
+  author  = {Hestenes, Magnus R. and Stiefel, Eduard},
+  title   = {Methods of Conjugate Gradients for Solving Linear Systems},
+  journal = {Journal of Research of the National Bureau of Standards},
+  volume  = {49},
+  number  = {6},
+  pages   = {409--436},
+  year    = {1952},
+}
+
+@article{Hettich-93,
+  author  = {Hettich, R. and Kortanek, K. O.},
+  title   = {Semi-infinite programming: theory, methods, and applications},
+  journal = {{SIAM} Review},
+  volume  = {35},
+  number  = {3},
+  pages   = {380--429},
+  year    = {1993},
+}
+
+@inproceedings{Hines96,
+  author    = {Hines, J. W.},
+  title     = {A Logarithmic Neural Network Architecture for a {PRA} Approximation},
+  booktitle = nipc-hmit96,
+  volume    = {1},
+  pages     = {235--241},
+  publisher = ans,
+  year      = {1996},
+}
+
+@article{HinOsiWel2006,
+  author  = {Hinton, Geoffrey E. and Osindero, Simon and Welling, Max and Teh, {Yee Whye}},
+  title   = {Unsupervised Discovery of Non-Linear Structure using Contrastive Backpropagation},
+  journal = {Cognitive Science},
+  volume  = {30},
+  number  = {4},
+  pages   = {725--731},
+  year    = {2006},
+}
+
+@article{Hinton+Ghahramani-97,
+  author  = {Hinton, G. E. and Ghahramani, Z.},
+  title   = {Generative models for discovering sparse distributed representations},
+  journal = {Philosophical Transactions of the Royal Society of London. Series B: Biological Sciences},
+  volume  = {352},
+  number  = {1358},
+  pages   = {1177--1190},
+  year    = {1997},
+}
+
+@incollection{Hinton-bo86,
+  author    = {Hinton, G. E. and Sejnowski, T. J.},
+  title     = {Learning and relearning in {Boltzmann} machines},
+  booktitle = {Parallel Distributed Processing: Explorations in the Microstructure of Cognition. Volume 1: Foundations},
+  editor    = {Rumelhart, D. E. and McClelland, J. L.},
+  pages     = {282--317},
+  publisher = {MIT Press},
+  address   = {Cambridge, MA},
+  year      = {1986},
+}
+
+@inproceedings{Hinton-ICA-2001,
+  author    = {Hinton, G. E. and Welling, M. and Teh, Y. W. and Osindero, S.},
+  title     = {A New View of {ICA}},
+  booktitle = {Proceedings of 3rd International Conference on Independent Component Analysis and Blind Signal Separation (ICA'01)},
+  address   = {San Diego, CA},
+  pages     = {746--751},
+  year      = {2001},
+}
+
+@inproceedings{Hinton-nips95,
+  author    = {Hinton, G. E. and Revow, M. and Dayan, P.},
+  title     = {Recognizing handwritten digits using mixtures of linear models},
+  booktitle = NIPS7,
+  editor    = NIPS7ed,
+  pages     = {1015--1022},
+  publisher = {MIT Press},
+  address   = {Cambridge, MA},
+  year      = {1995},
+}
+
+@techreport{Hinton-PoE-2000,
+  author      = {Hinton, Geoffrey E.},
+  title       = {Training Products of Experts by Minimizing Contrastive Divergence},
+  institution = {Gatsby Unit, University College London},
+  number      = {GCNU TR 2000-004},
+  year        = {2000},
+}
+
+@article{Hinton-Science2006,
+  author  = {Hinton, Geoffrey E. and Salakhutdinov, Ruslan},
+  title   = {Reducing the dimensionality of data with neural networks},
+  journal = {Science},
+  volume  = {313},
+  number  = {5786},
+  pages   = {504--507},
+  month   = jul,
+  year    = {2006},
+}
+
+% DEPRECATED: the entry below duplicates Hinton-Science2006 (the preceding entry); cite Hinton-Science2006 instead.
+@article{Hinton+Salakhutdinov-2006,
+  author  = {Hinton, Geoffrey E. and Salakhutdinov, Ruslan},
+  title   = {Reducing the Dimensionality of Data with Neural Networks},
+  journal = {Science},
+  volume  = {313},
+  number  = {5786},
+  pages   = {504--507},
+  month   = jul,
+  year    = {2006},
+}
+
+
+@article{Hinton06,
+  author  = {Hinton, Geoffrey E. and Osindero, Simon and Teh, {Yee Whye}},
+  title   = {A fast learning algorithm for deep belief nets},
+  journal = {Neural Computation},
+  volume  = {18},
+  number  = {7},
+  pages   = {1527--1554},
+  year    = {2006},
+}
+
+@article{Hinton06-small,
+  author  = {Hinton, G. E. and Osindero, S. and Teh, Y.-W.},
+  title   = {A fast learning algorithm for deep belief nets},
+  journal = {Neural Computation},
+  volume  = {18},
+  pages   = {1527--1554},
+  year    = {2006},
+}
+
+@inproceedings{hinton1994amd,
+  author    = {Hinton, Geoffrey E. and Zemel, R. S.},
+  title     = {Autoencoders, minimum description length, and Helmholtz free energy},
+  editor    = NIPS6ed,
+  booktitle = NIPS6,
+  pages     = {3--10},
+  publisher = {Morgan Kaufmann Publishers, Inc.},
+  year      = {1994},
+}
+
+@article{Hinton2002,
+  author  = {Hinton, Geoffrey E.},
+  title   = {Training products of experts by minimizing contrastive divergence},
+  journal = {Neural Computation},
+  volume  = {14},
+  pages   = {1771--1800},
+  year    = {2002},
+}
+
+@inproceedings{Hinton83,
+  author    = {Hinton, G. E. and Sejnowski, T. J.},
+  title     = {Optimal Perceptual Inference},
+  booktitle = cvpr83,
+  pages     = {448--453},
+  publisher = {IEEE, New York},
+  address   = {Washington 1983},
+  year      = {1983},
+}
+
+@techreport{Hinton84,
+  author      = {Hinton, G. E. and Sejnowski, T. J. and Ackley, D. H.},
+  title       = {{Boltzmann} machines: Constraint satisfaction networks that learn},
+  institution = {Carnegie-Mellon University, Dept. of Computer Science},
+  number      = {TR-CMU-CS-84-119},
+  year        = {1984},
+}
+
+@incollection{Hinton86a,
+  author    = {Hinton, G. E. and Sejnowski, T. J.},
+  title     = {Learning and Relearning in {Boltzmann} Machines},
+  booktitle = pdp,
+  editor    = {Rumelhart, D. E. and McClelland, J. L.},
+  volume    = {1},
+  chapter   = {7},
+  pages     = {282--317},
+  publisher = {MIT Press},
+  address   = {Cambridge},
+  year      = {1986},
+}
+
+@inproceedings{Hinton86b,
+  author    = {Hinton, Geoffrey E.},
+  title     = {Learning Distributed Representations of Concepts},
+  booktitle = {Proceedings of the Eighth Annual Conference of the Cognitive Science Society},
+  pages     = {1--12},
+  publisher = {Lawrence Erlbaum, Hillsdale},
+  address   = {Amherst 1986},
+  year      = {1986},
+}
+
+@inproceedings{Hinton86b-small,
+  author    = {Hinton, Geoffrey E.},
+  title     = {Learning Distributed Representations of Concepts},
+  booktitle = {Proc. 8th Annual Conf. Cog. Sc. Society},
+  pages     = {1--12},
+  year      = {1986},
+}
+
+@inproceedings{Hinton87,
+  author    = {Hinton, Geoffrey E.},
+  title     = {Learning translation invariant recognition in massively parallel networks},
+  booktitle = {Proceedings of {PARLE} Conference on Parallel Architectures and Languages Europe},
+  editor    = {{de Bakker}, J. W. and Nijman, A. J. and Treleaven, P. C.},
+  pages     = {1--13},
+  publisher = {Springer-Verlag},
+  address   = {Berlin},
+  year      = {1987},
+}
+
+@article{Hinton89,
+  author  = {Hinton, Geoffrey E.},
+  title   = {Deterministic {Boltzmann} Learning Performs Steepest Descent in Weight Space},
+  journal = nc,
+  volume  = {1},
+  pages   = {143--150},
+  year    = {1989},
+}
+
+@article{Hinton89b,
+  author  = {Hinton, Geoffrey E.},
+  title   = {Connectionist learning procedures},
+  journal = {Artificial Intelligence},
+  volume  = {40},
+  pages   = {185--234},
+  year    = {1989},
+}
+
+@article{Hinton90,
+  author  = {Hinton, G. E. and Nowlan, S. J.},
+  title   = {The bootstrap Widrow-Hoff rule as a cluster-formation algorithm},
+  journal = nc,
+  volume  = {2},
+  pages   = {355--362},
+  year    = {1990},
+}
+
+@inproceedings{Hinton92,
+  author    = {Hinton, G. E. and Williams, C. K. I. and Revow, M. D.},
+  title     = {Adaptive elastic models for hand-printed character recognition},
+  booktitle = NIPS4,
+  editor    = NIPS4ed,
+  pages     = {512--519},
+  publisher = {Morgan Kaufmann},
+  address   = {San Mateo CA},
+  year      = {1992},
+}
+
+@misc{Hinton93,
+  author       = {Hinton, Geoffrey E.},
+  title        = {Using the minimum description length principle to discover factorial codes},
+  howpublished = {Lecture given at the 1993 Connectionist Models Summer School},
+  year         = {1993},
+}
+
+@article{Hinton95,
+  author  = {Hinton, Geoffrey E. and Dayan, Peter and Frey, Brendan J. and Neal, Radford M.},
+  title   = {The wake-sleep algorithm for unsupervised neural networks},
+  journal = {Science},
+  volume  = {268},
+  pages   = {1158--1161},
+  year    = {1995},
+}
+
+@article{hinton97modelling,
+  author  = {Hinton, G. E. and Dayan, P. and Revow, M.},
+  title   = {Modelling the manifolds of images of handwritten digits},
+  journal = {IEEE Transactions on Neural Networks},
+  volume  = {8},
+  pages   = {65--74},
+  year    = {1997},
+}
+
+@inproceedings{Hinton99,
+  author    = {Hinton, Geoffrey E.},
+  title     = {Products of Experts},
+  booktitle = {Proceedings of the Ninth International Conference on Artificial Neural Networks (ICANN)},
+  volume    = {1},
+  pages     = {1--6},
+  publisher = {IEE},
+  address   = {Edinburgh, Scotland},
+  year      = {1999},
+}
+
+@inproceedings{HintonG2005,
+  author    = {Hinton, Geoffrey E. and Osindero, Simon and Bao, Kejie},
+  title     = {Learning Causally Linked Markov Random Fields},
+  booktitle = aistats05,
+  editor    = aistats05ed,
+  pages     = {128--135},
+  publisher = {Society for Artificial Intelligence and Statistics},
+  year      = {2005},
+}
+
+@inproceedings{HintonG2005-small,
+  author    = {Hinton, Geoffrey E. and Osindero, Simon and Bao, Kejie},
+  title     = {Learning Causally Linked Markov Random Fields},
+  booktitle = {Proceedings of AISTATS 2005},
+  year      = {2005},
+}
+
+@techreport{HintonG2006,
+  author      = {Hinton, Geoffrey E.},
+  title       = {To recognize shapes, first learn to generate images},
+  institution = {University of Toronto},
+  number      = {UTML TR 2006-003},
+  year        = {2006},
+}
+
+@incollection{HintonG2007,
+  author    = {Hinton, Geoffrey E.},
+  title     = {To recognize shapes, first learn to generate images},
+  booktitle = {Computational Neuroscience: Theoretical Insights into Brain Function},
+  editor    = {Cisek, Paul and Drew, Trevor and Kalaska, John},
+  publisher = {Elsevier},
+  year      = {2007},
+}
+
+@techreport{Hinton-Boltzmann,
+  author      = {Hinton, G. E. and Sejnowski, T. J. and Ackley, D. H.},
+  title       = {{Boltzmann} machines: Constraint satisfaction networks that learn},
+  institution = {Carnegie-Mellon University, Dept. of Computer Science},
+  number      = {TR-CMU-CS-84-119},
+  year        = {1984},
+  OPTnote     = {},
+}
+
+@inproceedings{Hirayama-nips92,
+  author    = {Hirayama, M. and Vatikiotis-Bateson, E. and Kawato, M. and Jordan, M. I.},
+  title     = {Forward Dynamics Modeling of Speech Motor Control Using Physiological Data},
+  booktitle = NIPS4,
+  editor    = NIPS4ed,
+  pages     = {191--198},
+  publisher = {Morgan Kaufmann},
+  address   = {San Mateo, CA},
+  year      = {1992},
+}
+
+@article{Hjort96,
+  author  = {Hjort, N. L. and Jones, M. C.},
+  title   = {Locally parametric nonparametric density estimation},
+  journal = {Annals of Statistics},
+  volume  = {24},
+  number  = {4},
+  pages   = {1619--1647},
+  year    = {1996},
+}
+
+@inproceedings{Ho95,
+  author    = {Ho, Tin Kam},
+  title     = {Random Decision Forests},
+  booktitle = ICDAR95,
+  address   = {Montreal, Canada},
+  pages     = {278--282},
+  year      = {1995},
+}
+
+@mastersthesis{Hochreiter91,
+  author = {Hochreiter, S.},
+  title  = {Untersuchungen zu dynamischen neuronalen {Netzen}},
+  type   = {Diploma thesis},
+  school = {Institut f{\"u}r Informatik, Lehrstuhl Prof. Brauer, Technische Universit{\"a}t M{\"u}nchen},
+  year   = {1991},
+  url    = {http://www7.informatik.tu-muenchen.de/~hochreit},
+}
+
+@article{Hoerl+Kennard70,
+  author  = {Hoerl, A. and Kennard, R.},
+  title   = {Ridge regression: biased estimation for non-orthogonal problems},
+  journal = {Technometrics},
+  volume  = {12},
+  pages   = {55--67},
+  year    = {1970},
+}
+
+@inproceedings{Hoff-2008,
+  author    = {Hoff, Peter D.},
+  title     = {Modeling homophily and stochastic equivalence in symmetric relational data},
+  booktitle = NIPS20,
+  editor    = NIPS20ed,
+  pages     = {657--664},
+  publisher = {MIT Press},
+  address   = {Cambridge, MA},
+  year      = {2008},
+}
+
+@inproceedings{Holger-icpr96,
+  author    = {Schwenk, H. and Milgram, M.},
+  title     = {Constraint Tangent Distance For On-Line Character Recognition},
+  booktitle = icpr,
+  pages     = {520--524},
+  year      = {1996},
+}
+
+@inproceedings{Holger-nips96,
+  author    = {Schwenk, H. and Milgram, M.},
+  title     = {Transformation invariant autoassociation with application to handwritten character recognition},
+  booktitle = NIPS7,
+  editor    = NIPS7ed,
+  pages     = {991--998},
+  publisher = {MIT Press},
+  year      = {1995},
+}
+
+@book{Holland75,
+  author    = {Holland, J. H.},
+  title     = {Adaptation in Natural and Artificial Systems},
+  publisher = {University of Michigan Press},
+  year      = {1975},
+  key       = {Holland},
+}
+
+@article{Holley+Karplus89,
+  author  = {Holley, L. H. and Karplus, M.},
+  title   = {Protein secondary structure prediction with a neural network},
+  journal = PNAS,
+  volume  = {86},
+  pages   = {152--156},
+  year    = {1989},
+}
+
+@incollection{HolTre93,
+  author    = {Hollatz, J. and Tresp, V.},
+  title     = {A rule-based network architecture},
+  booktitle = {Artificial Neural Networks II},
+  editor    = {Aleksander, I. and Taylor, J.},
+  publisher = {Elsevier},
+  address   = {Amsterdam},
+  year      = {1992},
+}
+
+@techreport{HolTreAhm92,
+  author      = {Hollatz, J. and Tresp, V. and Ahmad, S.},
+  title       = {Network structuring and training using rule-based knowledge},
+  institution = {Siemens AG},
+  address     = {M{\"u}nchen, Germany},
+  type        = {Technical Report},
+  year        = {1992},
+}
+
+@inproceedings{HolubA2005,
+  author    = {Holub, Alex and Perona, Pietro},
+  title     = {A Discriminative Framework for Modelling Object Classes},
+  booktitle = cvpr05,
+  pages     = {664--671},
+  publisher = {IEEE Computer Society},
+  address   = {Washington, DC, USA},
+  year      = {2005},
+  isbn      = {0-7695-2372-2},
+  doi       = {10.1109/CVPR.2005.25},
+}
+
+@incollection{HonglakL2009,
+  author    = {Lee, Honglak and Grosse, Roger and Ranganath, Rajesh and Ng, Andrew Y.},
+  title     = {Convolutional deep belief networks for scalable unsupervised learning of hierarchical representations},
+  booktitle = ICML09,
+  editor    = ICML09ed,
+  publisher = ICML09publ,
+  address   = {Montreal (Qc), Canada},
+  pages     = {609--616},
+  year      = {2009},
+}
+
+@incollection{HonglakL2008,
+  author    = {Lee, Honglak and Ekanadham, Chaitanya and Ng, Andrew},
+  title     = {Sparse deep belief net model for visual area {V2}},
+  booktitle = NIPS20,
+  editor    = NIPS20ed,
+  pages     = {873--880},
+  publisher = {MIT Press},
+  address   = {Cambridge, MA},
+  year      = {2008},
+}
+
+@incollection{HonglakLNIPS2009,
+  author    = {Lee, Honglak and Pham, Peter and Largman, Yan and Ng, Andrew},
+  title     = {Unsupervised feature learning for audio classification using convolutional deep belief networks},
+  editor    = NIPS22ed,
+  booktitle = NIPS22,
+  pages     = {1096--1104},
+  year      = {2009},
+}
+
+@book{Hopcroft79,
+  author    = {Hopcroft, J. E. and Ullman, J. D.},
+  title     = {Introduction to Automata Theory, Languages, and Computation},
+  publisher = {Addison-Wesley Publishing Company, Inc.},
+  address   = {Reading, MA},
+  year      = {1979},
+}
+
+@article{Hopfield82,
+  author  = {Hopfield, John J.},
+  title   = {Neural Networks and Physical Systems with Emergent Collective Computational Abilities},
+  journal = PNAS,
+  volume  = {79},
+  number  = {8},
+  pages   = {2554--2558},
+  year    = {1982},
+}
+
+@article{Hopfield83,
+  author  = {Hopfield, J. J. and Feinstein, D. I. and Palmer, R. G.},
+  title   = {``Unlearning'' Has a Stabilizing Effect in Collective Memories},
+  journal = nature,
+  volume  = {304},
+  pages   = {158--159},
+  year    = {1983},
+}
+
+@article{Hopfield84,
+  author  = {Hopfield, J. J.},
+  title   = {Neurons with Graded Responses Have Collective Computational Properties Like Those of Two-State Neurons},
+  journal = PNAS,
+  volume  = {81},
+  number  = {10},
+  pages   = {3088--3092},
+  year    = {1984},
+}
+
+@article{Hopfield85,
+  author  = {Hopfield, J. J. and Tank, D. W.},
+  title   = {``Neural'' Computation of Decisions in Optimization Problems},
+  journal = biocyb,
+  volume  = {52},
+  pages   = {141--152},
+  year    = {1985},
+}
+
+@article{Hopfield86,
+  author  = {Hopfield, J. J. and Tank, D. W.},
+  title   = {Computing with Neural Circuits: {A} Model},
+  journal = science,
+  volume  = {233},
+  pages   = {625--633},
+  year    = {1986},
+}
+
+@article{Hopfield87,
+  author  = {Hopfield, J. J.},
+  title   = {Learning Algorithms and Probability Distributions in Feed-Forward and Feed-Back Networks},
+  journal = PNAS,
+  volume  = {84},
+  pages   = {8429--8433},
+  year    = {1987},
+}
+
+@incollection{Hopfield89,
+  author    = {Hopfield, J. J. and Tank, D. W.},
+  title     = {Neural Architecture and Biophysics for Sequence Recognition},
+  booktitle = {Neural Models of Plasticity},
+  editor    = {Byrne, J. H. and Berry, W. O.},
+  pages     = {363--377},
+  publisher = {Academic Press},
+  address   = {San Diego},
+  year      = {1989},
+}
+
+@article{Hornik89,
+  author  = {Hornik, Kurt and Stinchcombe, Maxwell and White, Halbert},
+  title   = {Multilayer Feedforward Networks Are Universal Approximators},
+  journal = nn,
+  volume  = {2},
+  pages   = {359--366},
+  year    = {1989},
+}
+
+@article{Hotelling1933,
+  author  = {Hotelling, H.},
+  title   = {Analysis of a Complex of Statistical Variables into Principal Components},
+  journal = {Journal of Educational Psychology},
+  volume  = {24},
+  pages   = {417--441, 498--520},
+  year    = {1933},
+}
+
+@article{Hotelling-1936,
+  author  = {Hotelling, H.},
+  title   = {Relations between two sets of variates},
+  journal = {Biometrika},
+  volume  = {28},
+  pages   = {321--377},
+  year    = {1936},
+}
+
+@techreport{Houde91,
+  author      = {Houde, J. F.},
+  title       = {Recursive estimation of articulatory control},
+  institution = {MIT},
+  address     = {Cambridge, MA},
+  type        = {Computational Cognitive Science},
+  number      = {TR},
+  year        = {1991},
+}
+
+@inproceedings{Howlett+Lawrence-1995a,
+  author    = {R. J. Howlett and D. H. Lawrence},
+  title     = {The Class-Distributed Neural Network},
+  booktitle = {World Transputer Congress~'95},
+  address   = {Harrogate, UK},
+  year      = {1995},
+}
+
+@inproceedings{Howlett+Lawrence-1995b,
+  author    = {R. J. Howlett and D. H. Lawrence},
+  title     = {A Multi-Computer Neural Network Applied to Machine-Vision},
+  booktitle = {Proceedings of the IEEE International Conference on Neural Networks},
+  address   = {Perth, Australia},
+  year      = {1995},
+  volume    = {2},
+  pages     = {1150--1153},
+}
+
+@inproceedings{Hsu88,
+  author    = {K. Hsu and D. Brady and D. Psaltis},
+  title     = {Experimental Demonstration of Optical Neural Computers},
+  booktitle = nips87,
+  editor    = nips87ed,
+  publisher = {American Institute of Physics, New York},
+  address   = {Denver, CO},
+  year      = {1988},
+  pages     = {377--386},
+}
+
+@article{huang04dynamic,
+  author  = {X. Huang and F. Peng and A. An and D. Schuurmans},
+  title   = {Dynamic web log session identification with statistical language models},
+  journal = {Journal of the American Society for Information Science and Technology},
+  year    = {2004},
+  volume  = {55},
+  number  = {14},
+  pages   = {1290--1303},
+}
+
+@book{Huang87,
+  author    = {K. Huang},
+  title     = {Statistical Mechanics},
+  publisher = {Wiley},
+  address   = {New York},
+  year      = {1987},
+}
+
+@inproceedings{Huang88,
+  author    = {W. Y. Huang and R. P. Lippmann},
+  title     = {Neural Net and Traditional Classifiers},
+  booktitle = nips87,
+  editor    = nips87ed,
+  publisher = {American Institute of Physics, New York},
+  address   = {Denver, CO},
+  year      = {1988},
+  pages     = {387--396},
+}
+
+@TechReport{Huang89,
+  author =       "X. D. Huang and H. W. Hon and K. F. Lee",
+  title =        "Multiple Codebook Semi-Continuous Hidden {Markov}
+                 Models for Speaker-Independent Continuous Speech
+                 Recognition",
+  number =       "CMU-CS-89-136",
+  institution =  "School of Computer Science Carnegie-Mellon
+                 University",
+  address =      "Pittsburgh, Pennsylvania",
+  month =        apr,
+  year =         "1989",
+}
+
+@inproceedings{Huang90,
+  author    = {Xuedong Huang and Kai-Fu Lee and Hsiao-Wuen Hon},
+  title     = {On Semi-Continuous Hidden {Markov} Modeling},
+  booktitle = icassp,
+  year      = {1990},
+  pages     = {689--692},
+}
+
+@article{Hubel+Wiesel-1959,
+  author  = {David H. Hubel and Torsten N. Wiesel},
+  title   = {Receptive Fields of Single Neurons in the Cat's Striate Cortex},
+  journal = {Journal of Physiology},
+  year    = {1959},
+  volume  = {148},
+  pages   = {574--591},
+  biburl  = {http://www.bibsonomy.org/bibtex/202c5cf1ee910eadba5efa77b3cd043f6/idsia},
+}
+
+@article{Hubel62,
+  author  = {D. H. Hubel and T. N. Wiesel},
+  title   = {Receptive Fields, Binocular Interaction, and Functional Architecture in the Cat's Visual Cortex},
+  journal = jphysiol,
+  year    = {1962},
+  volume  = {160},
+  pages   = {106--154},
+}
+
+@article{Hubel+Wiesel-1968,
+ author = {D. H. Hubel and T. N. Wiesel},
+ title = {Receptive fields and functional architecture of monkey striate cortex},
+ journal = jphysiol,
+ volume = 195,
+ pages = {215--243},
+ year = 1968,
+}
+
+@article{Huber-1985,
+  author  = {Huber, Peter  J. },
+  title   = {Projection Pursuit},
+  journal = {The Annals of Statistics},
+  year    = {1985},
+  volume  = {13},
+  number  = {2},
+  pages   = {435--475},
+  url     = {http://www.jstor.org/stable/2241175},
+  comment = {Projection Pursuit},
+}
+
+@inproceedings{Hueter88,
+  author    = {G. J. Hueter},
+  title     = {Solution of the Travelling Salesman Problem with an Adaptive Ring},
+  booktitle = icnn,
+  publisher = {IEEE, New York},
+  address   = {San Diego 1988},
+  year      = {1988},
+  volume    = {1},
+  pages     = {85--92},
+}
+
+@inproceedings{Hush88,
+  author    = {D. R. Hush and J. M. Salas},
+  title     = {Improving the Learning Rate of Back-Propagation with the Gradient Reuse Algorithm},
+  booktitle = icnn,
+  publisher = {IEEE, New York},
+  address   = {San Diego 1988},
+  year      = {1988},
+  volume    = {1},
+  pages     = {441--447},
+}
+
+@Article{Hush92,
+  author =       "D. R. Hush and B. Horne and J. M. Salas",
+  title =        "Error Surfaces for Multilayer Perceptrons",
+  journal =      ieeesmc,
+  volume =       "22",
+  number =       "5",
+  pages =        "1152--1161",
+  month =        sep,
+  year =         "1992",
+}
+
+@incollection{Hutchins+Hazlehurst-02,
+  author    = {Edwin Hutchins and Brian Hazlehurst},
+  title     = {Auto-organization and Emergence of Shared Language Structure},
+  booktitle = {Simulating the Evolution of Language},
+  editor    = {A. Cangelosi and D. Parisi},
+  publisher = {London: Springer-Verlag},
+  year      = {2002},
+  pages     = {279--305},
+}
+
+@incollection{Hutchins+Hazlehurst-95,
+  author    = {Edwin Hutchins and Brian Hazlehurst},
+  title     = {How to invent a lexicon: the development of shared symbols in interaction},
+  booktitle = {Artificial Societies: the computer simulation of social life},
+  editor    = {N. Gilbert and R. Conte},
+  publisher = {London: UCL Press},
+  year      = {1995},
+  pages     = {157--189},
+}
+
+@article{Hutchinson94,
+  author  = {J. M. Hutchinson and A. W. Lo and T. Poggio},
+  title   = {{A Nonparametric Approach to Pricing and Hedging Derivative Securities Via Learning Networks}},
+  journal = {Journal of Finance},
+  year    = {1994},
+  volume  = {49},
+  number  = {3},
+  pages   = {851--889},
+}
+
+@book{Hutter2005,
+  author    = {Marcus Hutter},
+  title     = {Universal Artificial Intelligence: Sequential Decisions based on Algorithmic Probability},
+  publisher = {Springer, Berlin},
+  year      = {2005},
+}
+
+@article{Hwang+al-1992,
+  author    = {Frank K. Hwang and Dana Richards and Pawel Winter},
+  title     = {The {Steiner} Tree Problem},
+  journal   = {Annals of Discrete Mathematics},
+  publisher = {Elsevier},
+  address   = {Amsterdam},
+  year      = {1992},
+  volume    = {53},
+}
+
+@article{Hyvarinen-1999,
+    author = {Hyv{\"a}rinen, A.},
+    journal = {Neural Computing Surveys},
+    keywords = {ica, separation, waspaa07bib},
+    pages = {94--128},
+    title = {Survey on Independent Component Analysis},
+    url = {http://citeseer.ist.psu.edu/223687.html},
+    volume = {2},
+    year = {1999}
+}
+
+@book{Hyvarinen-2001,
+    author = {Hyv{\"{a}}rinen, Aapo and Karhunen, Juha and Oja, Erkki},
+    howpublished = {Hardcover},
+    isbn = {047140540X},
+    month = may,
+    posted-at = {2008-07-02 02:13:00},
+    priority = {2},
+    publisher = {Wiley-Interscience},
+    title = {Independent Component Analysis},
+    url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/047140540X},
+    year = {2001}
+}
+
+@article{Hyvarinen+al-01,
+  author  = {Aapo Hyv{\"{a}}rinen and Patrik O. Hoyer and Mika Inki},
+  title   = {Topographic Independent Component Analysis},
+  journal = {Neural Computation},
+  year    = {2001},
+  volume  = {13},
+  number  = {7},
+  pages   = {1527--1558},
+}
+
+@article{HyvarinenA2001,
+  author    = {Aapo Hyv{\"{a}}rinen and Patrik O. Hoyer and Mika O. Inki},
+  title     = {Topographic Independent Component Analysis},
+  journal   = {Neural Computation},
+  publisher = {MIT Press},
+  address   = {Cambridge, MA, USA},
+  year      = {2001},
+  volume    = {13},
+  number    = {7},
+  pages     = {1527--1558},
+  ISSN      = {0899-7667},
+}
+
+% Compact variant of HyvarinenA2001 (same article, fewer fields).
+@Article{HyvarinenA2001-small,
+  author =       "Aapo Hyv{\"{a}}rinen and Patrik O. Hoyer and Mika O. Inki",
+  title =        "Topographic Independent Component Analysis",
+  journal =      "Neural Computation",
+  volume =       "13",
+  number =       "7",
+  pages =        "1527--1558",
+  year =         "2001",
+}
+
+@article{Hyvarinen-2005,
+  author  = {Aapo Hyv{\"{a}}rinen },
+  title   = {Estimation of non-normalized statistical models using score matching},
+  journal = jmlr,
+  year    = {2005},
+  volume  = {6},
+  pages   = {695--709},
+}
+
+@article{Hyvarinen-2007,
+  author  = {Aapo Hyv{\"{a}}rinen },
+  title   = {Some extensions of score matching},
+  journal = {Computational Statistics and Data Analysis},
+  year    = {2007},
+  volume  = {51},
+  pages   = {2499--2512},
+}
+
+@article{Hyvarinen-2007b,
+  author  = {Aapo Hyv{\"{a}}rinen },
+  title   = {Connections between score matching, contrastive divergence, and pseudolikelihood for continuous-valued variables},
+  journal = {{IEEE} Transactions on Neural Networks},
+  year    = {2007},
+  volume  = {18},
+  pages   = {1529--1531},
+}
+
+@article{HyvarinenA2008,
+ author = {Hyv{\"a}rinen, Aapo},
+ title = {Optimal approximation of signal priors},
+ journal = {Neural Computation},
+ volume = {20},
+ number = {12},
+ year = {2008},
+ pages = {3087--3110},
+ publisher = {MIT Press},
+ address = {Cambridge, MA, USA},
+}
+
+@article{kording2004,
+  author  = {Konrad P. K{\"o}rding and Christoph Kayser and Wolfgang Einh{\"a}user and Peter K{\"o}nig},
+  title   = {How Are Complex Cell Properties Adapted to the Statistics of Natural Stimuli?},
+  journal = {Journal of Neurophysiology},
+  year    = 2004,
+  volume  = 91,
+  pages   = {206--212},
+  url     = {http://jn.physiology.org/cgi/reprint/91/1/206.pdf},
+}
+
+@inproceedings{Koster-Hyvarinen-2007,
+  author    = {Urs K{\"{o}}ster and Aapo Hyv{\"{a}}rinen},
+  title     = {A two-layer {ICA}-like model estimated by {S}core {M}atching},
+  booktitle = {Int. Conf. Artificial Neural Networks (ICANN'2007)},
+  year      = {2007},
+  pages     = {798--807},
+}
+
+@article{Iba-2001,
+  author  = {Yukito Iba},
+  title   = {Extended Ensemble Monte Carlo},
+  journal = {International Journal of Modern Physics},
+  year    = {2001},
+  volume  = {C12},
+  pages   = {623--656},
+}
+
+@InProceedings{icml2009_093,
+  author =    {Hossein Mobahi and Ronan Collobert and Jason Weston},
+  title =     {Deep Learning from Temporal Coherence in Video},
+  booktitle = {Proceedings of the 26th International Conference on Machine Learning},
+  pages =     {737--744},
+  year =      2009,
+  editor =    {L{\'e}on Bottou and Michael Littman},
+  address =   {Montreal},
+  month =     jun,
+  publisher = {Omnipress}
+}
+
+@inproceedings{icann:Holger+Yoshua:1997,
+  author    = {Holger Schwenk and Yoshua Bengio},
+  title     = {{AdaBoosting} Neural Networks: Application to on-line Character Recognition},
+  booktitle = {International Conference on Artificial Neural Networks},
+  publisher = {Springer Verlag},
+  year      = {1997},
+  pages     = {967--972},
+}
+
+@Article{Ide1998,
+  author =       "Nancy Ide and Jean V{\'e}ronis",
+  title =        "Introduction to the Special Issue on Word Sense
+                 Disambiguation: The State of the Art",
+  journal =      "Computational Linguistics",
+  volume =       "24",
+  number =       "1",
+  pages =        "1--40",
+  year =         "1998",
+}
+
+@article{IEEE-KDE:Frasconi95,
+  author   = {P. Frasconi and M. Gori and M. Maggini and G. Soda},
+  title    = {Unified Integration of Explicit Rules and Learning by Example in Recurrent Networks},
+  journal  = {IEEE Transactions on Knowledge and Data Engineering},
+  year     = {1995},
+  volume   = {7},
+  number   = {2},
+  pages    = {340--346},
+  OPTmonth = {},
+}
+
+@Article{igel05,
+  author =       "C. Igel and M. Toussaint and W. Weishui",
+  title =        "Rprop using the natural gradient compared to
+                 Levenberg-Marquardt optimization",
+  journal =      "Trends and Applications in Constructive Approximation.
+                 International Series of Numerical Mathematics.",
+  volume =       "151",
+  publisher =    "Birkh{\"a}user Verlag",
+  pages =        "259--272",
+  year =         "2005",
+}
+
+@article{intrator,
+  author  = {Nathan Intrator and Shimon Edelman},
+  title   = {How to make a low-dimensional representation suitable for diverse tasks},
+  journal = {Connection Science, Special issue on Transfer in Neural Networks},
+  year    = {1996},
+  volume  = {8},
+  pages   = {205--224},
+}
+
+@article{intrator96,
+  author  = {Nathan Intrator and Shimon Edelman},
+  title   = {How to make a low-dimensional representation suitable for diverse tasks},
+  journal = {Connection Science, Special issue on Transfer in Neural Networks},
+  year    = {1996},
+  volume  = {8},
+  pages   = {205--224},
+}
+
+% The citation key keeps its historical spelling so existing \cite commands still resolve.
+@Article{Inzenman-91,
+  author =       "A. J. Izenman",
+  title =        "Recent developments in nonparametric density
+                 estimation",
+  journal =      "Journal of the American Statistical Association",
+  volume =       "86",
+  number =       "413",
+  pages =        "205--224",
+  year =         "1991",
+}
+
+@techreport{IOHMM-TR,
+  author      = {Y. Bengio and P. Frasconi},
+  title       = {An {EM} Approach to Learning Sequential Behavior},
+  institution = {University of Florence},
+  number      = {RT-DSI-11/94},
+  year        = {1994},
+}
+
+@inproceedings{Irie88,
+  author    = {B. Irie and S. Miyake},
+  title     = {Capabilities of three layer perceptrons},
+  booktitle = {IEEE Second International Conference on Neural Networks, San Diego},
+  year      = {1988},
+}
+
+@article{Irino+Kawahara90,
+  author  = {T. Irino and H. Kawahara},
+  title   = {A Method for Designing Neural Networks Using Nonlinear Multivariate Analysis: Application to Speaker-Independent Vowel Recognition},
+  journal = {Neural Computation},
+  year    = {1990},
+  volume  = {2},
+  number  = {3},
+  pages   = {386--397},
+  type    = {Letter},
+}
+
+@article{ItoM2004,
+	author = {Ito, Minami and Komatsu, Hidehiko},
+	citeulike-article-id = {451606},
+	doi = {10.1523/JNEUROSCI.4364-03.2004},
+	journal = {Journal of Neuroscience},
+	keywords = {cnv, v2},
+	month = mar,
+	number = {13},
+	pages = {3313--3324},
+	posted-at = {2007-03-30 11:19:11},
+	priority = {0},
+	title = {Representation of Angles Embedded within Contour Stimuli in Area V2 of Macaque Monkeys},
+	volume = {24},
+	year = {2004}
+}
+	%url = {http://dx.doi.org/10.1523/JNEUROSCI.4364},
+
+@Article{Jaakkola+Jordan99,
+  author =       "T. Jaakkola and M. I. Jordan",
+  title =        "Variational methods and the {QMR}-{DT} database",
+  journal =      "Journal of Artificial Intelligence Research",
+  volume =       "10",
+  pages =        "291--322",
+  year =         "1999",
+}
+
+% Deprecated key: the year in the key (98) is wrong, since the entry is dated 1999. Prefer Jaakkola99 below.
+@inproceedings{Jaakkola98,
+  author    = {Tommi S. Jaakkola and David Haussler},
+  title     = {Exploiting generative models in discriminative classifiers},
+  booktitle = NIPS11,
+  editor    = NIPS11ed,
+  publisher = {MIT Press, Cambridge, MA},
+  year      = {1999},
+  pages     = {487--493},
+}
+
+@inproceedings{Jaakkola99,
+  author    = {Tommi S. Jaakkola and David Haussler},
+  title     = {Exploiting generative models in discriminative classifiers},
+  booktitle = NIPS11,
+  editor    = NIPS11ed,
+  publisher = {MIT Press, Cambridge, MA},
+  year      = {1999},
+  pages     = {487--493},
+}
+
+@Misc{jaakkola98exploiting,
+  author =       "T. Jaakkola and D. Haussler",
+  title =        "Exploiting generative models in discriminative
+                 classifiers",
+  year =         "1998",
+  note =         "Preprint, Dept. of Computer Science, Univ. of California.
+                  A shorter version is in Advances in Neural
+                  Information Processing Systems 11",
+  howpublished = "Available from http://www.cse.ucsc.edu/\textasciitilde{}haussler/pubs.html",
+}
+
+% Same article as Jacobs91a, kept under this alternate key.
+@Article{Jacobs-nc91,
+  author =       "R. A. Jacobs and M. I. Jordan and S. J. Nowlan and G.
+                 E. Hinton",
+  title =        "Adaptive mixtures of local experts",
+  journal =      "Neural Computation",
+  volume =       "3",
+  pages =        "79--87",
+  year =         "1991",
+}
+
+@InCollection{Jacobs-nips91,
+  author =       "R. A. Jacobs and M. I. Jordan",
+  editor =       NIPS3ed,
+  booktitle =    NIPS3,
+  title =        "A competitive modular connectionist architecture",
+  publisher =    "Morgan Kaufmann Publishers",
+  address =      "San Mateo, CA",
+  year =         "1991",
+}
+
+@techreport{Jacobs-tr90,
+  author      = {R. A. Jacobs and M. I. Jordan and A. G. Barto},
+  title       = {Task Decomposition Through Competition in a Modular Connectionist Architecture: The {What} and {Where} Vision Tasks},
+  institution = {MIT},
+  address     = {Cambridge MA},
+  number      = {COINS 90-27},
+  year        = {1990},
+}
+
+@article{Jacobs88,
+  author  = {R. A. Jacobs},
+  title   = {Increased Rates of Convergence Through Learning Rate Adaptation},
+  journal = nn,
+  year    = {1988},
+  volume  = {1},
+  pages   = {295--307},
+}
+
+@article{Jacobs91a,
+  author  = {Robert A. Jacobs and Michael I. Jordan and Steven J. Nowlan and Geoffrey E. Hinton},
+  title   = {Adaptive Mixtures of Local Experts},
+  journal = nc,
+  year    = {1991},
+  volume  = {3},
+  pages   = {79--87},
+}
+
+@article{Jacobs91b,
+  author  = {R. A. Jacobs and M. I. Jordan and A. G. Barto},
+  title   = {Task Decomposition Through Competition in a Modular Connectionist Architecture: The What and Where Vision Task},
+  journal = {Cognitive Science},
+  year    = {1991},
+  volume  = {15},
+  pages   = {219--250},
+}
+
+@article{Jacobs94,
+  author  = {R. A. Jacobs and S. M. Kosslyn},
+  title   = {Encoding Shape and Spatial Relations: The Role of Receptive Fields in Coordinating Complementary Representations},
+  journal = {Cognitive Science},
+  year    = {1994},
+}
+
+@article{Jaeger-2007,
+  author  = {Herbert Jaeger},
+  title   = {Echo state network},
+  journal = {Scholarpedia},
+  year    = {2007},
+  volume  = {2},
+  number  = {9},
+  pages   = {2330},
+}
+
+@article{Japkowicz2000,
+  author  = {Nathalie Japkowicz and Stephen J. Hanson and Mark A. Gluck},
+  title   = {Nonlinear Autoassociation is not Equivalent to {PCA}},
+  journal = {Neural Computation},
+  year    = {2000},
+  volume  = {12},
+  number  = {3},
+  pages   = {531--545},
+}
+
+@article{Japkowicz2002,
+  author  = {N. Japkowicz and S. Stephen},
+  title   = {The Class Imbalance Problem: {A} Systematic Study},
+  journal = {Intelligent Data Analysis},
+  year    = {2002},
+  volume  = {6},
+  number  = {5},
+}
+
+@inproceedings{Jarrett-ICCV2009,
+  author    = {Jarrett, Kevin and Kavukcuoglu, Koray and Ranzato, {Marc'Aurelio} and {LeCun}, Yann},
+  title     = {What is the Best Multi-Stage Architecture for Object Recognition?},
+  booktitle = {Proc. International Conference on Computer Vision (ICCV'09)},
+  publisher = {IEEE},
+  year      = {2009},
+  original  = {orig/jarrett-iccv-09.pdf},
+}
+
+@techreport{Jauvin+Bengio-TR2003,
+  author      = {Christian Jauvin and Yoshua Bengio},
+  title       = {A Sense-Smoothed Bigram Language Model},
+  institution = {Dept. IRO, Universit\'e de Montr\'eal},
+  number      = {1233},
+  year        = {2003},
+}
+
+@book{Jaynes03,
+  author    = {E. T. Jaynes},
+  title     = {{Probability} {Theory}: {The} {Logic} of {Science}},
+  publisher = {Cambridge University Press},
+  year      = {2003},
+}
+
+@InCollection{Jaynes83,
+  author =       "E. T. Jaynes",
+  editor =       "R. D. Rosenkrantz",
+  booktitle =    "Papers on Probability, Statistics and Statistical
+                 Physics",
+  title =        "{Bayesian} intervals versus confidence intervals",
+  publisher =    "Kluwer",
+  year =         "1983",
+}
+
+@article{JCB:Baldi95t,
+  author  = {Y. Chauvin and P. Baldi},
+  title   = {Hidden Markov models of the {G}-Protein-Coupled receptor family},
+  journal = {Journal of Computational Biology},
+  year    = {1995},
+}
+
+@inproceedings{jebara03,
+  author    = {Tony Jebara and Risi Kondor},
+  title     = {{Bhattacharyya and Expected Likelihood Kernels}},
+  booktitle = colt03,
+  year      = {2003},
+}
+
+@InProceedings{Jebara03Convex,
+  author =       "T. Jebara",
+  booktitle =    "Proceedings of AISTATS 2003",
+  title =        "Convex Invariance Learning",
+  year =         "2003",
+}
+
+@inproceedings{jebara04,
+  author    = {Tony Jebara},
+  title     = {{Multi-task feature and kernel selection for SVMs}},
+  booktitle = ICML04,
+  editor    = ICML04ed,
+  publisher = ICML04publ,
+  address   = {New York, NY, USA},
+  year      = {2004},
+  location  = {Banff, Alberta, Canada},
+}
+
+@book{JebaraT2003,
+  author       = {Tony Jebara},
+  title        = {Machine Learning: Discriminative and Generative (The Kluwer International Series in Engineering and Computer Science)},
+  publisher    = {Springer},
+  month        = dec,
+  year         = {2003},
+  howpublished = {Hardcover},
+  keywords     = {book, generative-discriminative, svm},
+  comment      = {- maximum entropy discriminative as unification of discriminative and generative approaches},
+  priority     = {2},
+  citeulike-article-id = {134203},
+}
+  %ISBN =         "1402076479",
+  %URL =          "http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20&path=ASIN/1402076479",
+
+@incollection{Jelinek+Mercer80,
+  author    = {F. Jelinek and R. L. Mercer},
+  title     = {Interpolated estimation of Markov source parameters from sparse data},
+  booktitle = {Pattern Recognition in Practice},
+  editor    = {E. S. Gelsema and L. N. Kanal},
+  publisher = {North-Holland, Amsterdam},
+  year      = {1980},
+}
+
+@inproceedings{Jelinek-Chelba-99,
+  author    = {Frederick Jelinek and Ciprian Chelba},
+  title     = {Putting language into language modeling},
+  booktitle = {European Conference on Speech Communication and Technology},
+  address   = {Budapest},
+  year      = {1999},
+  volume    = {1},
+  pages     = {KN1--KN5},
+}
+
+@article{Jelinek76,
+  author  = {F. Jelinek},
+  title   = {Continuous speech recognition by statistical methods},
+  journal = {Proceedings of the IEEE},
+  year    = {1976},
+  volume  = {64},
+  pages   = {532--556},
+}
+
+@incollection{Jelinek80,
+  author    = {F. Jelinek and R. L. Mercer},
+  title     = {Interpolated Estimation of {Markov} Source Parameters from Sparse Data},
+  booktitle = {Pattern Recognition in Practice},
+  editor    = {E. S. Gelsema and L. N. Kanal},
+  publisher = {North-Holland},
+  address   = {Amsterdam},
+  year      = {1980},
+  copy      = yes,
+}
+
+@Book{Jelinek98,
+  author =       "F. Jelinek",
+  title =        "Statistical Methods for Speech Recognition",
+  publisher =    "MIT Press",
+  address =      "Cambridge, Massachusetts",
+  year =         "1998",
+}
+
+@inproceedings{JensenRiis2000,
+  author    = {K. J. Jensen and S. Riis},
+  title     = {Self-organizing letter code-book for text-to-phoneme neural network model},
+  booktitle = {International Conference on Spoken Language Processing},
+  year      = {2000},
+  volume    = {3},
+  pages     = {318--321},
+}
+
+@inproceedings{Jeong96,
+  author    = {E. Jeong and K. Furuta and S. Kondo},
+  title     = {Identification of Transient in Nuclear Power Plant using Adaptive Template Matching with Neural Network},
+  booktitle = nipc-hmit96,
+  publisher = ans,
+  year      = {1996},
+  volume    = {1},
+  pages     = {243--250},
+}
+
+@incollection{joachims99largescaleSVM,
+  author    = {T. Joachims},
+  title     = {Making large-Scale {SVM} Learning Practical},
+  booktitle = {Advances in Kernel Methods --- Support Vector Learning},
+  editor    = {B. {Sch\"olkopf} and C. J. C. Burges and A. J. Smola},
+  publisher = {MIT Press},
+  address   = {Cambridge, MA},
+  year      = {1999},
+}
+
+@inproceedings{joachims99transductive,
+  author    = {Thorsten Joachims},
+  title     = {Transductive Inference for Text Classification using Support Vector Machines},
+  booktitle = ICML99,
+  editor    = ICML99ed,
+  publisher = ICML99publ,
+  address   = {Bled, SL},
+  year      = {1999},
+  pages     = {200--209},
+}
+  %URL =          "citeseer.ist.psu.edu/joachims99transductive.html",
+
+@techreport{Johansson90,
+  author      = {E. M. Johansson and F. U. Dowla and D. M. Goodman},
+  title       = {Backpropagation learning for multi-layer feed-forward neural networks using the conjugate gradient method},
+  institution = {Lawrence Livermore National Laboratory},
+  number      = {UCRL-JC-104850},
+  month       = sep,
+  year        = {1990},
+}
+
+@inproceedings{John+al-1994,
+  author    = {John, George  H.  and Kohavi, Ron  and Pfleger, Karl},
+  title     = {Irrelevant Features and the Subset Selection Problem},
+  booktitle = {Proceedings of the Eleventh International Conference on Machine Learning},
+  publisher = {Morgan Kaufmann},
+  year      = {1994},
+  pages     = {121--129},
+  url       = {http://citeseer.ist.psu.edu/john94irrelevant.html},
+}
+
+@article{Johnson89,
+  author  = {D. S. Johnson and C. R. Aragon and L. A. McGeoch and C. Schevon},
+  title   = {Optimization by Simulated Annealing: An Experimental Evaluation; Part {I}, Graph Partitioning},
+  journal = opres,
+  year    = {1989},
+  volume  = {37},
+  pages   = {865--891},
+}
+
+@inproceedings{Joines92QQ23,
+  author    = {J. A. Joines and M. W. White},
+  title     = {Improved Generalization Using Robust Cost Functions},
+  booktitle = {IJCNN},
+  address   = {Baltimore, Maryland},
+  month     = jun,
+  year      = {1992},
+  pages     = {911--918},
+  ref       = {QQ23},
+}
+
+@book{Jolliffe86,
+  author    = {Ian T. Jolliffe},
+  title     = {Principal Component Analysis},
+  publisher = {Springer-Verlag},
+  address   = {New York},
+  year      = {1986},
+}
+
+@book{Jolliffe-2002,
+    author = {Ian T. Jolliffe},
+    citeulike-article-id = {1154147},
+    howpublished = {Hardcover},
+    isbn = {0387954422},
+    month = oct,
+    posted-at = {2007-03-11 15:04:57},
+    priority = {2},
+    publisher = {Springer},
+    title = {Principal Component Analysis},
+    url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0387954422},
+    year = {2002}
+}
+
+@article{Jordan+Jacobs94,
+  author  = {M. I. Jordan and R. A. Jacobs},
+  title   = {Hierarchical mixtures of experts and the {E}{M} algorithm},
+  journal = nc,
+  year    = {1994},
+  volume  = {6},
+  pages   = {181--214},
+}
+
+@techreport{Jordan+Xu93,
+  author      = {Michael I. Jordan and L. Xu},
+  title       = {Convergence results for the {EM} approach to mixtures of experts architecture},
+  institution = {MIT Computational Cognitive Science},
+  number      = {9303},
+  month       = sep,
+  year        = {1993},
+}
+
+@article{Jordan-cs92,
+  author  = {M. I. Jordan and D. E. Rumelhart},
+  title   = {Forward models: Supervised learning with a distal teacher},
+  journal = {Cognitive Science},
+  year    = {1992},
+  volume  = {16},
+  pages   = {307--354},
+}
+
+@InProceedings{Jordan-HMDT97,
+  author =       "M. Jordan and Z. Ghahramani and L. Saul",
+  editor =       NIPS9ed,
+  booktitle =    NIPS9,
+  title =        "Hidden Markov decision trees",
+  publisher =    "MIT Press",
+  address =      "Cambridge, MA",
+  year =         "1997",
+}
+
+@inproceedings{Jordan-nips92,
+  author    = {M. I. Jordan and R. A. Jacobs},
+  title     = {Hierarchies of adaptive experts},
+  booktitle = NIPS4,
+  editor    = NIPS4ed,
+  publisher = {Morgan Kaufmann},
+  address   = {San Mateo, CA},
+  year      = {1992},
+  pages     = {985--992},
+}
+
+@techreport{Jordan-tr86,
+  author      = {M. I. Jordan},
+  title       = {Serial Order: a Parallel Distributed Processing Approach},
+  institution = {ICS (Institute for Cognitive Science, University of California)},
+  number      = {8604},
+  year        = {1986},
+}
+
+@inproceedings{Jordan86,
+  author    = {M. I. Jordan},
+  title     = {Attractor Dynamics and Parallelism in a Connectionist Sequential Machine},
+  booktitle = {Proceedings of the Eighth Annual Conference of the Cognitive Science Society},
+  publisher = {Lawrence Erlbaum, Hillsdale},
+  address   = {Amherst 1986},
+  year      = {1986},
+  pages     = {531--546},
+}
+
+@techreport{Jordan88,
+  author      = {M. I. Jordan},
+  title       = {Supervised Learning and Systems with Excess Degrees of Freedom},
+  institution = {MIT},
+  address     = {Cambridge MA},
+  number      = {COINS Technical Report 88-27},
+  year        = {1988},
+}
+
+@incollection{Jordan89,
+  author    = {M. I. Jordan},
+  title     = {Serial Order: {A} Parallel, Distributed Processing Approach},
+  booktitle = {Advances in Connectionist Theory: Speech},
+  editor    = {J. L. Elman and D. E. Rumelhart},
+  publisher = {Lawrence Erlbaum},
+  address   = {Hillsdale},
+  year      = {1989},
+}
+
+@inproceedings{Jordan89b,
+  author    = {M. I. Jordan},
+  title     = {Supervised learning and systems with excess degrees of freedom},
+  booktitle = {Proceedings of the 1988 Connectionist Models Summer School},
+  editor    = {G. Hinton and D. S. Touretzky},
+  publisher = {Morgan Kaufmann},
+  address   = {San Mateo, CA},
+  year      = {1989},
+}
+
+@incollection{Jordan90,
+  author    = {M. I. Jordan},
+  title     = {Motor learning and the degrees of freedom problem},
+  booktitle = {Attention and Performance XIII},
+  editor    = {M. Jeannerod},
+  publisher = {Hillsdale, NJ: Erlbaum},
+  year      = {1990},
+}
+
+@Book{Jordan98,
+  editor =       "M. I. Jordan",
+  title =        "Learning in Graphical Models",
+  publisher =    "Kluwer",
+  address =      "Dordrecht, Netherlands",
+  year =         "1998",
+}
+
+@Article{Jour:Freund:AdaBoostDetailed,
+  author =       "Yoav Freund and Robert E. Schapire",
+  title =        "A decision-theoretic generalization of on-line
+                 learning and an application to Boosting",
+  journal =      "Journal of Computer and System Sciences",
+  volume =       "55",
+  number =       "1",
+  pages =        "119--139",
+  year =         "1997",
+}
+
+@article{Jour:Freund:boost,
+  author       = {Yoav Freund},
+  title        = {Boosting a weak learning algorithm by majority},
+  journal      = {Information and Computation},
+  volume       = {121},
+  number       = {2},
+  pages        = {256--285},
+  year         = {1995},
+}
+
+@Article{Jour-Freund-AdaBoostDetailed,
+  author =       "Yoav Freund and Robert E. Schapire",
+  title =        "A decision-theoretic generalization of on-line
+                 learning and an application to Boosting",
+  journal =      "Journal of Computer and System Sciences",
+  volume =       "55",
+  number =       "1",
+  pages =        "119--139",
+  year =         "1997",
+}
+
+@PhdThesis{Jouvet88,
+  author =       "D. Jouvet",
+  title =        "Reconnaissance de Mots Connect{\'e}s Ind{\'e}pendamment du
+                 Locuteur par des M{\'e}thodes Statistiques",
+  number =       "NST-88E006",
+  school =       "{\'E}cole Nationale Sup{\'e}rieure des T{\'e}l{\'e}communications",
+  year =         "1988",
+}
+
+@inproceedings{JuanA2001,
+ author = {Alfons Juan and Enrique Vidal},
+ title = {On the use of {Bernoulli} Mixture Models for Text Classification},
+ booktitle = {PRIS '01: Proceedings of the 1st International Workshop on Pattern Recognition in Information Systems},
+ year = {2001},
+ pages = {118--126},
+ publisher = {ICEIS Press},
+}
+
+@inproceedings{JuanA2004,
+ author = {Alfons Juan and Enrique Vidal},
+ title = {{Bernoulli} Mixture Models for Binary Images},
+ booktitle = {Proceedings of the 17th International Conference on Pattern Recognition (ICPR'04)},
+ volume = {3},
+ year = {2004},
+ pages = {367--370},
+ publisher = {IEEE Computer Society},
+ address = {Washington, DC, USA},
+}
+
+@article{Juang92,
+  author       = {B. H. Juang and S. Katagiri},
+  title        = {Discriminative learning for minimum error classification},
+  journal      = {IEEE Transactions on Signal Processing},
+  volume       = {40},
+  number       = {12},
+  pages        = {3043--3054},
+  year         = {1992},
+}
+
+@article{Judd88,
+  author       = {S. Judd},
+  title        = {On the complexity of loading shallow neural networks},
+  journal      = {Journal of Complexity},
+  volume       = {4},
+  pages        = {177--192},
+  year         = {1988},
+}
+
+@Book{JuddBook,
+  author =       "J. S. Judd",
+  title =        "Neural Network Design and the Complexity of Learning",
+  publisher =    "MIT Press",
+  year =         "1989",
+}
+
+@book{Jurafsky+Martin-2008,
+    author = {Jurafsky, Daniel and Martin, James H.},
+    month = jan,
+    publisher = {Prentice Hall},
+    edition = 2,
+    title = {Speech and Language Processing: An Introduction to Natural Language Processing, Computational Linguistics and Speech Recognition},
+    year = {2008}
+}
+
+@Article{Jutten+Herault-91,
+  author =       "Christian Jutten and Jeanny H{\'e}rault",
+  title =        "Blind separation of sources, part {I}: an adaptive
+                 algorithm based on neuromimetic architecture",
+  journal =      "Signal Processing",
+  volume =       "24",
+  pages =        "1--10",
+  year =         "1991",
+}
+
+@InProceedings{Kahng89,
+  author =       "A. B. Kahng",
+  booktitle =    ijcnn,
+  title =        "Travelling Salesman Heuristics and Embedding Dimension
+                 in the {Hopfield} Model",
+  volume =       "1",
+  publisher =    "IEEE",
+  address =      "New York",
+  pages =        "513--520",
+  year =         "1989",
+  note =         "Conference held in Washington, DC",
+}
+
+@InProceedings{kai03,
+  author =       "Yu, Kai and Schwaighofer, Anton and Tresp, Volker and
+                 Ma, Wei-Ying and Zhang, HongJiang",
+  booktitle =    UAI03,
+  title =        "Collaborative Ensemble Learning: Combining
+                 Collaborative and Content-Based Information Filtering
+                 via Hierarchical {Bayes}",
+  publisher =    "Morgan Kaufmann Publishers",
+  address =      "San Francisco, CA",
+  pages =        "616--623",
+  year =         "2003",
+}
+
+@Article{Kalman61,
+  author =       "R. E. Kalman and R. S. Bucy",
+  title =        "New results in linear filtering and prediction theory",
+  journal =      "Journal of Basic Engineering (ASME)",
+  volume =       "83D",
+  pages =        "95--108",
+  year =         "1961",
+}
+
+@article{Kambhatla+Leen-1997,
+  author       = {Kambhatla, N.  and Leen, T. K. },
+  title        = {Dimension Reduction by Local Principal Component Analysis},
+  journal      = {Neural Computation},
+  volume       = {9},
+  pages        = {1493--1516},
+  year         = {1997},
+}
+
+@article{Kammen88,
+  author       = {D. M. Kammen and A. L. Yuille},
+  title        = {Spontaneous Symmetry-Breaking Energy Functions and the
+                  Emergence of Orientation Selective Cortical Cells},
+  journal      = biocyb,
+  volume       = {59},
+  pages        = {23--31},
+  year         = {1988},
+}
+
+@inproceedings{Kammerer89,
+  author       = {B. K. Kammerer and W. A. Kupper},
+  title        = {Design of Hierarchical Perceptron Structures and their
+                  Application to the Task of Isolated Word Recognition},
+  booktitle    = ijcnn,
+  address      = {Washington D.C.},
+  year         = {1989},
+}
+
+@book{Kandel85,
+  author       = {E. R. Kandel and J. H. Schwartz},
+  title        = {Principles of Neural Science},
+  edition      = {2},
+  publisher    = {Elsevier},
+  address      = {New York},
+  year         = {1985},
+}
+
+@article{Kanter87,
+  author       = {I. Kanter and H. Sompolinsky},
+  title        = {Associative Recall of Memory Without Errors},
+  journal      = prA,
+  volume       = {35},
+  pages        = {380--392},
+  year         = {1987},
+}
+
+@inproceedings{KarklinY2003,
+  author    = {Yan Karklin and
+               Michael S. Lewicki},
+  title     = {A Model for Learning Variance Components of Natural Images},
+  year      = {2003},
+  pages     = {1367--1374},
+  editor =       NIPS15ed,
+  booktitle =    NIPS15,
+  publisher =    "{MIT} Press",
+}
+
+@Article{Karmin90,
+  author =       "E. D. Karnin",
+  title =        "A simple procedure for pruning back-propagation
+                 trained neural networks",
+  journal =      ieeetrnn,
+  volume =       "1",
+  number =       "2",
+  pages =        "239--242",
+  year =         "1990",
+}
+
+@article{Karplus97,
+  author       = {K. Karplus and K. Sjolander and C. Barrett and M. Cline and
+                  D. Haussler and R. Hughey and L. Holm and C. Sander},
+  title        = {Predicting protein structure using hidden Markov models},
+  journal      = {Proteins: Structure, Function and Genetics},
+  volume       = {S 1},
+  number       = {1},
+  pages        = {134--139},
+  year         = {1997},
+}
+
+@phdthesis{KasselR1995,
+  author       = {Robert Kassel},
+  title        = {A Comparison of Approaches to On-line Handwritten Character Recognition},
+  school       = {MIT Spoken Language Systems Group},
+  year         = {1995},
+}
+
+@article{Katz87,
+  author       = {Slava M. Katz},
+  title        = {Estimation of Probabilities from Sparse Data for the
+                  Language Model Component of a Speech Recognizer},
+  journal      = {IEEE Transactions on Acoustics, Speech, and Signal Processing},
+  volume       = {ASSP-35},
+  number       = {3},
+  pages        = {400--401},
+  month        = mar,
+  year         = {1987},
+}
+
+@incollection{Kaul,
+  author       = {G. Kaul},
+  title        = {Predictable Components in Stock Returns},
+  editor       = {G. S. Maddala and C. R. Rao},
+  booktitle    = {Handbook of Statistics, Vol. 14},
+  pages        = {269--296},
+  publisher    = {Elsevier Science},
+  year         = {1996},
+}
+
+@inproceedings{kbnn-craven.mlc93,
+  author       = {Mark W. Craven and Jude W. Shavlik},
+  title        = {Learning Symbolic Rules Using Artificial Neural Networks},
+  booktitle    = {Proceedings of the Tenth International Conference on
+                  Machine Learning},
+  pages        = {73--80},
+  publisher    = {Morgan Kaufmann},
+  address      = {Amherst, MA},
+  year         = {1993},
+}
+
+@inproceedings{kbnn-maclin.aaai92,
+  author       = {R. Maclin and J. Shavlik},
+  title        = {Using Knowledge-Based Neural Networks to Improve
+                  Algorithms: Refining the Chou-Fasman Algorithm for
+                  Protein Folding},
+  booktitle    = {Proceedings of the Tenth National Conference on
+                  Artificial Intelligence},
+  pages        = {165--170},
+  address      = {San Jose, CA},
+  year         = {1992},
+}
+
+@TechReport{kbnn-maclin.mlrgwp91,
+  author =       "R. Maclin and J. W. Shavlik",
+  title =        "Refining Algorithms with Knowledge-Based Neural
+                 Networks: Improving the Chou-Fasman Algorithm for
+                 Protein Folding",
+  type =         "Machine Learning Research Group Working Paper",
+  number =       "91-2",
+  institution =  "Department of Computer Sciences, University of
+                 Wisconsin",
+  year =         "1991",
+  note =         "also in Computational Learning Theory and Natural
+                 Learning Systems, volume 1, S. Hanson, G. Drastal, and
+                 R. Rivest, (eds.), MIT Press",
+}
+
+@inproceedings{kbnn-noordewier.nips3,
+  author       = {Michiel O. Noordewier and Geoffrey G. Towell and Jude W. Shavlik},
+  title        = {Training Knowledge-Based Neural Networks to Recognize
+                  Genes in {DNA} Sequences},
+  editor       = NIPS3ed,
+  booktitle    = NIPS3,
+  pages        = {530--536},
+  publisher    = {Morgan Kaufmann},
+  address      = {Denver, CO},
+  year         = {1991},
+}
+
+@inproceedings{kbnn-opitz.ijcai93,
+  author       = {D. W. Opitz and J. W. Shavlik},
+  title        = {Heuristically Expanding Knowledge-Based Neural Networks},
+  booktitle    = {Proceedings of the Thirteenth International Joint
+                  Conference on Artificial Intelligence},
+  address      = {Chambery, France},
+  month        = sep,
+  year         = {1993},
+}
+
+@TechReport{kbnn-opitz.mlrgwp92,
+  author =       "D. W. Opitz and J. W. Shavlik",
+  title =        "Using Heuristic Search to Expand Knowledge-Based
+                 Neural Networks",
+  type =         "Machine Learning Research Group Working Paper",
+  number =       "92-1",
+  institution =  "Department of Computer Sciences, University of
+                 Wisconsin",
+  year =         "1992",
+  note =         "(also in Computational Learning Theory and Natural
+                 Learning Systems, volume 3, T. Petsche, S. Judd, and S.
+                 Hanson, (eds.), MIT Press)",
+}
+
+@techreport{kbnn-shavlik.tr92,
+  author       = {J. W. Shavlik},
+  title        = {A Framework for Combining Symbolic and Neural Learning},
+  number       = {UW TR 1123},
+  institution  = {Department of Computer Sciences, University of Wisconsin},
+  year         = {1992},
+  note         = {(a shorter version will appear in Machine Learning)},
+}
+
+@inproceedings{kbnn-towell.aaai90,
+  author       = {G. G. Towell and J. W. Shavlik and M. O. Noordewier},
+  title        = {Refinement of Approximate Domain Theories by
+                  Knowledge-Based Neural Networks},
+  booktitle    = {Proceedings of the Eighth National Conference on
+                  Artificial Intelligence},
+  pages        = {861--866},
+  address      = {Boston, MA},
+  year         = {1990},
+}
+
+@inproceedings{kbnn-towell.aaai92,
+  author       = {G. Towell and J. Shavlik},
+  title        = {Using Symbolic Learning to Improve Knowledge-Based
+                  Neural Networks},
+  booktitle    = {Proceedings of the Tenth National Conference on
+                  Artificial Intelligence},
+  pages        = {177--182},
+  address      = {San Jose, CA},
+  year         = {1992},
+}
+
+@Article{kbnn-towell.aij94,
+  author =       "Geoffrey G. Towell and Jude W. Shavlik",
+  title =        "Knowledge-Based Artificial Neural Networks",
+  journal =      "Artificial Intelligence",
+  volume =       "70",
+  number =       "1--2",
+  pages =        "119--165",
+  year =         "1994",
+}
+
+@incollection{kbnn-towell.ml493,
+  author       = {Geoffrey G. Towell and Jude W. Shavlik},
+  title        = {Refining Symbolic Knowledge Using Neural Networks},
+  editor       = {R. S. Michalski and G. Tecuci},
+  booktitle    = {Machine Learning: An Integrated Approach},
+  volume       = {4},
+  publisher    = {Morgan Kaufmann},
+  address      = {San Mateo, CA},
+  year         = {1993},
+}
+
+@inproceedings{kbnn-towell.mlc91,
+  author       = {Geoffrey G. Towell and Mark W. Craven and Jude W. Shavlik},
+  title        = {Constructive Induction in Knowledge-Based Neural Networks},
+  booktitle    = {Proceedings of the Eighth International Machine
+                  Learning Workshop},
+  pages        = {213--217},
+  publisher    = {Morgan Kaufmann},
+  address      = {Evanston, IL},
+  year         = {1991},
+}
+
+@article{kbnn-towell.mlj93,
+  author       = {Geoffrey G. Towell and Jude W. Shavlik},
+  title        = {The Extraction of Refined Rules from Knowledge-Based
+                  Neural Networks},
+  journal      = {Machine Learning},
+  volume       = {13},
+  number       = {1},
+  pages        = {71--101},
+  year         = {1993},
+}
+
+@inproceedings{kbnn-towell.nips4,
+  author       = {Geoffrey G. Towell and Jude W. Shavlik},
+  title        = {Interpretation of Artificial Neural Networks: Mapping
+                  knowledge-based Neural Networks into Rules},
+  editor       = NIPS4ed,
+  booktitle    = NIPS4,
+  publisher    = {Morgan Kaufmann},
+  address      = {Denver, CO},
+  year         = {1992},
+}
+
+@phdthesis{kbnn-towell.thesis,
+  author       = {Geoffrey G. Towell},
+  title        = {Symbolic Knowledge and Neural Networks: Insertion,
+                  Refinement and Extraction},
+  school       = {University of Wisconsin -- Madison},
+  year         = {1991},
+  note         = {(Also appears as UW Technical Report 1072 [out of print].)},
+}
+
+@InProceedings{Kearns+Ron97,
+  author =       "Michael Kearns and Dana Ron",
+  booktitle =    "Tenth Annual Conference on Computational Learning
+                 Theory",
+  title =        "Algorithmic Stability and Sanity-Check Bounds for
+                 Leave-One-Out Cross-Validation",
+  publisher =    "Morgan Kaufmann",
+  pages =        "152--162",
+  year =         "1997",
+}
+
+@InCollection{keeler-rumelhart-91,
+  author =       "J. Keeler and D. E. Rumelhart and W.-K. Leow",
+  editor =       NIPS3ed,
+  booktitle =    NIPS3,
+  title =        "Integrated Segmentation and Recognition of
+                 Hand-Printed Numerals",
+  publisher =    "Morgan Kaufmann Publishers",
+  address =      "San Mateo, CA",
+  pages =        "557--563",
+  year =         "1991",
+}
+
+@article{Keerthi+Lin-2003,
+  author       = {S. Sathiya Keerthi and Chih-Jen Lin},
+  title        = {Asymptotic Behaviors of Support Vector Machines with
+                  {Gaussian} Kernel},
+  journal      = {Neural Computation},
+  volume       = {15},
+  number       = {7},
+  pages        = {1667--1689},
+  year         = {2003},
+}
+
+@incollection{Kegl-2003,
+  author       = {Bal\'{a}zs K\'{e}gl},
+  title        = {Intrinsic Dimension Estimation Using Packing Numbers},
+  editor       = NIPS15ed,
+  booktitle    = NIPS15,
+  pages        = {681--688},
+  publisher    = {MIT Press},
+  address      = {Cambridge, MA},
+  year         = {2003},
+}
+
+@article{Kegl-Krzyzak-2002,
+  author       = {B. Kegl and A. Krzyzak},
+  title        = {Piecewise linear skeletonization using principal curves},
+  journal      = {{IEEE} Transactions on Pattern Analysis and Machine
+                  Intelligence},
+  volume       = {24},
+  number       = {1},
+  pages        = {59--74},
+  year         = {2002},
+}
+
+@InProceedings{Kegl2003,
+  author =       "B. K{\'e}gl",
+  editor =       NIPS15ed,
+  booktitle =    NIPS15,
+  title =        "Intrinsic dimension estimation using packing numbers",
+  publisher =    "The {MIT} Press",
+  address =      "Cambridge, MA",
+  pages =        "681--688",
+  year =         "2003",
+}
+
+@incollection{kegl2005,
+  author       = {Bal\'{a}zs K\'{e}gl and Ligen Wang},
+  title        = {Boosting on Manifolds: Adaptive Regularization of Base
+                  Classifiers},
+  editor       = NIPS17ed,
+  booktitle    = NIPS17,
+  publisher    = {MIT Press},
+  address      = {Cambridge, MA},
+  year         = {2005},
+}
+
+@techreport{Kehagias89,
+  author       = {A. Kehagias},
+  title        = {Stochastic Recurrent Networks: Prediction and
+                  Classification of Time Series},
+  institution  = {Brown University. Division of Applied Mathematics},
+  address      = {Providence, RI 02912},
+  year         = {1991},
+}
+
+@InProceedings{KellerM2005,
+  author =       "M. Keller and S. Bengio",
+  booktitle =    "Proceedings of the 15th International Conference on
+                 Artificial Neural Networks: Biological Inspirations
+                 (ICANN)",
+  title =        "A neural network for text representation",
+  series =       "Lecture Notes in Computer Science",
+  volume =       "3697",
+  pages =        "667--672",
+  publisher =    "Springer-Verlag",
+  year =         "2005",
+}
+
+@inproceedings{Keller2007,
+ author = {Katherine A. Heller and Zoubin Ghahramani},
+ booktitle =    aistats07,
+ year = 2007,
+ title = {A Nonparametric {Bayesian} Approach to Modeling Overlapping Clusters},
+ publisher =    "Omnipress",
+ date =         "March 21-24, 2007",
+ address =      "San Juan, Puerto Rico",
+ pages =        "187--194",
+}
+
+@inproceedings{Keller2008,
+  author       = {Katherine A. Heller and Sinead Williamson and Zoubin Ghahramani},
+  title        = {Statistical models for partial membership},
+  editor       = ICML08ed,
+  booktitle    = ICML08,
+  pages        = {392--399},
+  publisher    = ICML08publ,
+  location     = {Helsinki, Finland},
+  year         = 2008,
+}
+
+@Book{Kelly1975,
+  author =       "Edward Kelly and Philip Stone",
+  title =        "Computer Recognition of {English} Word Senses",
+  series =       "North-Holland Linguistics Series",
+  publisher =    "North-Holland",
+  year =         "1975",
+}
+
+@InProceedings{Kemp+al-2004,
+  author =       "C. Kemp and T. L. Griffiths and S. Stromsten and J. B.
+                 Tenenbaum",
+  editor =       NIPS16ed,
+  booktitle =    NIPS16,
+  title =        "Semi-supervised learning with trees",
+  publisher =    "{MIT} Press",
+  address =      "Cambridge, MA",
+  year =         "2004",
+}
+
+@inproceedings{Kerr2007,
+  author       = {Wesley Kerr and Shane Hoversten and Daniel Hewlett and Paul R. Cohen and Yu-Han Chang},
+  title        = {Learning in Wubble World},
+  booktitle    = {Proceedings of the IEEE Int. Conference on Development and Learning},
+  year         = 2007,
+}
+
+@article{Kerszberg90,
+  author       = {M. Kerszberg and A. Zippelius},
+  title        = {Synchronization in Neural Assemblies},
+  journal      = pscrip,
+  volume       = {T33},
+  pages        = {54--64},
+  year         = {1990},
+}
+
+@inproceedings{Keysers2000,
+  author       = {D. Keysers and J. Dahmen and H. Ney},
+  title        = {A probabilistic view on tangent distance},
+  booktitle    = {22nd Symposium of the German Association for Pattern
+                  Recognition},
+  address      = {Kiel, Germany},
+  year         = {2000},
+}
+
+@book{Khalil92,
+  author       = {Hassan K. Khalil},
+  title        = {Nonlinear Systems},
+  publisher    = {Macmillan Publishing Company},
+  address      = {New York},
+  year         = {1992},
+}
+
+@Book{Kiang65,
+  author =       "N. Y. S. Kiang and T. Watanabe and E. C. Thomas and L.
+                 F. Clark",
+  title =        "Discharge patterns of single fibers in the cat's
+                 auditory nerve fiber",
+  publisher =    "MIT Press",
+  address =      "Cambridge, MA",
+  year =         "1965",
+}
+
+@article{Kiefer80,
+  author       = {N. M. Kiefer},
+  title        = {A note on switching regressions and logistic discrimination},
+  journal      = {Econometrica},
+  volume       = {48},
+  pages        = {1065--1069},
+  year         = {1980},
+}
+
+@InProceedings{Kilgarriff2000,
+  author =       "Adam Kilgarriff and Joseph Rosenzweig",
+  title =        "English {SENSEVAL}: Report and Results",
+  booktitle =    "Proceedings of the 2nd International Conference on
+                 Language Resources and Evaluation (LREC)",
+  address =      "Athens, Greece",
+  year =         "2000",
+  URL =          "citeseer.nj.nec.com/335615.html",
+  text =         "A. Kilgarriff and J. Rosenzweig. English SENSEVAL:
+                 Report and Results. In Proceedings of the 2nd
+                 International Conference on Language Resources and
+                 Evaluation, LREC, Athens, Greece.",
+}
+
+@inproceedings{Kilgarriff2002,
+  author       = {Adam Kilgarriff},
+  title        = {English lexical sample task description},
+  booktitle    = {Proceedings of Senseval-2},
+  organization = {ACL workshop},
+  year         = {2002},
+}
+
+@Article{Kim94,
+  author =       "C. J. Kim",
+  title =        "Dynamic linear models with {Markov}-switching",
+  journal =      "Journal of Econometrics",
+  volume =       "60",
+  pages =        "1--22",
+  year =         "1994",
+}
+
+@Article{Kimeldorf-Wahba-71,
+  author =       "G. Kimeldorf and G. Wahba",
+  title =        "Some results on {Tchebycheffian} spline functions",
+  journal =      "Journal of Mathematical Analysis and Applications",
+  volume =       "33",
+  pages =        "82--95",
+  year =         "1971",
+}
+
+@incollection{Kinzel90,
+  author       = {W. Kinzel and M. Opper},
+  title        = {Dynamics of Learning},
+  editor       = {E. Domany and J. L. van Hemmen and K. Schulten},
+  booktitle    = {Physics of Neural Networks},
+  volume       = {1},
+  publisher    = {Springer-Verlag},
+  address      = {Berlin},
+  year         = {1990},
+}
+
+@inproceedings{Kira+Rendell-1992,
+    author    = {Kenji Kira and Larry A. Rendell},
+    title     = {The Feature Selection Problem: Traditional Methods and a New Algorithm},
+    booktitle = {Proceedings of the Tenth National Conference on Artificial Intelligence},
+    year      = {1992},
+    pages     = {129--134},
+    bibsource = {DBLP, http://dblp.uni-trier.de}
+}
+
+@inproceedings{Kira+Rendell-1992b,
+    address = {San Francisco, CA, USA},
+    author = {Kenji Kira and Larry A. Rendell},
+    booktitle = {Proceedings of the Ninth International Conference on Machine Learning},
+    isbn = {1-55860-247-X},
+    pages = {249--256},
+    posted-at = {2007-02-07 04:40:40},
+    publisher = {Morgan Kaufmann},
+    title = {A practical approach to feature selection},
+    url = {http://portal.acm.org/citation.cfm?id=142034},
+    year = {1992}
+}
+
+@Book{Kirk70,
+    author =       "D. E. Kirk",
+    title =        "Optimal Control Theory: An Introduction",
+    publisher =    "Prentice Hall",
+    address =      "Englewood Cliffs, NJ",
+    year =         "1970",
+}
+
+@Book{Kirk70a,
+  author =       "D. E. Kirk",
+  title =        "Optimal Control Theory: An Introduction",
+  publisher =    "Prentice Hall",
+  address =      "Englewood Cliffs, NJ",
+  year =         "1970",
+}
+
+@Article{Kirkpatrick83,
+  author =       "S. Kirkpatrick and Gelatt, Jr., C. D. and M. P.
+                 Vecchi",
+  title =        "Optimization by Simulated Annealing",
+  journal =      science,
+  volume =       "220",
+  pages =        "671--680",
+  year =         "1983",
+}
+
+@article{Kirkpatrick85,
+  author       = {S. Kirkpatrick and G. Toulouse},
+  title        = {Configuration Space Analysis of Travelling Salesman Problems},
+  journal      = jpp,
+  volume       = {46},
+  pages        = {1277--1292},
+  year         = {1985},
+}
+
+@Book{kitagawa+gersch96,
+  author =       "G. Kitagawa and W. Gersch",
+  title =        "Smoothness Priors Analysis of Time Series",
+  series =       "Lecture Notes in Statistics",
+  volume =       "116",
+  publisher =    "Springer-Verlag",
+  year =         "1996",
+  serieseditors = "P. Bickel and P. Diggle and S. Fienberg and K.
+                 Krickeberg and I. Olkin and W. Wermuth and S. Zeger",
+}
+
+@article{kitagawa87,
+  author       = {G. Kitagawa},
+  title        = {Non-{Gaussian} State-Space Modeling on Nonstationary
+                  Time Series},
+  journal      = {Journal of the American Statistical Association},
+  volume       = {82},
+  number       = {400},
+  pages        = {1032--1063},
+  year         = {1987},
+}
+
+@Article{kitagawa96,
+  author =       "G. Kitagawa",
+  title =        "{Monte} {Carlo} Filter and Smoother for Non-{Gaussian}
+                 Nonlinear State Space Models",
+  journal =      "Journal of Computational and Graphical Statistics",
+  volume =       "5",
+  number =       "1",
+  pages =        "1--25",
+  year =         "1996",
+}
+
+@InProceedings{Kivinen02,
+  author =       "J. Kivinen and A. Smola and R. C. Williamson",
+  title =        "Online Learning with kernels",
+  booktitle =    "Advances in Neural Information Processing Systems 14",
+  publisher =    "MIT Press",
+  address =      "Cambridge, MA",
+  pages =        "785--793",
+  year =         "2002",
+  URL =          "citeseer.csail.mit.edu/kivinen02online.html",
+  text =         "J. Kivinen, A. Smola, and R. C. Williamson, (2002)
+                 Online Learning with kernels. Advances in Neural
+                 Information Processing Systems 14, Cambridge, MA: MIT
+                 Press (pp. 785-793).",
+}
+
+@inproceedings{Klatt82,
+  author       = {D. Klatt},
+  title        = {Prediction of perceived phonetic distance from
+                  critical-band spectra: a first step},
+  booktitle    = icassp,
+  pages        = {1278--1281},
+  year         = {1982},
+}
+
+@inproceedings{Kleinberg-2003,
+  author       = {J. Kleinberg},
+  title        = {An impossibility theorem for clustering},
+  editor       = NIPS15ed,
+  booktitle    = NIPS15,
+  publisher    = {MIT Press},
+  address      = {Cambridge, MA},
+  year         = {2003},
+}
+
+@article{Kleinfeld86,
+  author       = {D. Kleinfeld},
+  title        = {Sequential State Generation by Model Neural Networks},
+  journal      = PNAS,
+  volume       = {83},
+  pages        = {9469--9473},
+  year         = {1986},
+}
+
+@incollection{Kleinfeld89,
+  author       = {D. Kleinfeld and H. Sompolinsky},
+  title        = {Associative Network Models for Central Pattern Generators},
+  editor       = {C. Koch and I. Segev},
+  booktitle    = {Methods in Neuronal Modeling: From Synapses to Networks},
+  pages        = {195--246},
+  publisher    = {MIT Press},
+  address      = {Cambridge},
+  year         = {1989},
+}
+
+@book{Klopf82,
+  author       = {A. H. Klopf},
+  title        = {The Hedonistic Neuron: {A} Theory of Memory, Learning,
+                  and Intelligence},
+  publisher    = {Hemisphere},
+  address      = {Washington},
+  year         = {1982},
+}
+
+@inproceedings{Kneser95,
+  author       = {Reinhard Kneser and Hermann Ney},
+  title        = {Improved Backing-Off for {M}-Gram Language Modeling},
+  booktitle    = icassp,
+  pages        = {181--184},
+  year         = {1995},
+}
+
+@article{Koch86,
+  author       = {C. Koch and J. Marroquin and A. Yuille},
+  title        = {Analog ``Neuronal'' Networks in Early Vision},
+  journal      = PNAS,
+  volume       = {83},
+  pages        = {4263--4267},
+  year         = {1986},
+}
+
+@inproceedings{Koch88,
+  author       = {C. Koch and J. Luo and C. Mead and J. Hutchinson},
+  title        = {Computing Motion Using Resistive Networks},
+  editor       = nips87ed,
+  booktitle    = nips87,
+  pages        = {422--431},
+  publisher    = {American Institute of Physics, New York},
+  address      = {Denver, CO},
+  year         = {1988},
+}
+
+@InProceedings{Kohavi95,
+  author =       "Ron Kohavi",
+  booktitle =    "Proceedings of the Fourteenth International Joint
+                 Conference on Artificial Intelligence",
+  title =        "A Study of Cross-Validation and Bootstrap for Accuracy
+                 Estimation and Model Selection",
+  publisher =    "Morgan Kaufmann",
+  pages =        "1137--1143",
+  year =         "1995",
+}
+
+@article{Kohavi+John-1997,
+  author       = {Kohavi, Ron   and John, George  H.},
+  title        = {Wrappers for feature subset selection},
+  journal      = {Artificial Intelligence},
+  volume       = {97},
+  number       = {1-2},
+  pages        = {273--324},
+  publisher    = {Elsevier Science Publishers Ltd.},
+  address      = {Essex, UK},
+  year         = {1997},
+  issn         = {0004-3702},
+  doi          = {10.1016/S0004-3702(97)00043-X},
+  url          = {http://portal.acm.org/citation.cfm?id=270627},
+}
+
+@article{Kohonen-ieee90,
+  author       = {T. Kohonen},
+  title        = {The Self-Organizing Map},
+  journal      = ieeeproc,
+  volume       = {78},
+  number       = {9},
+  pages        = {1464--1480},
+  year         = {1990},
+  OPTnote      = {Special Issue on Neural Networks},
+}
+
+@article{Kohonen74,
+  author       = {T. Kohonen},
+  title        = {An Adaptive Associative Memory Principle},
+  journal      = ieeetc,
+  volume       = {C-23},
+  pages        = {444--445},
+  year         = {1974},
+}
+
+@Article{Kohonen82,
+  author =       "T. Kohonen",
+  title =        "Self-Organized Formation of Topologically Correct
+                 Feature Maps",
+  journal =      biocyb,
+  volume =       "43",
+  pages =        "59--69",
+  year =         "1982",
+}
+
+@InProceedings{Kohonen84,
+  author =       "T. Kohonen and K. M{\"a}kisara and T. Saram{\"a}ki",
+  booktitle =    "Proceedings of the Seventh International Conference on
+                 Pattern Recognition, Montreal",
+  title =        "Phonotopic Maps --- Insightful Representation of
+                 Phonological Features for Speech Recognition",
+  publisher =    "IEEE",
+  address =      "New York",
+  pages =        "182--185",
+  year =         "1984",
+}
+
+@techreport{Kohonen86lvq,
+  author       = {Teuvo Kohonen},
+  title        = {Learning Vector Quantization for Pattern Recognition},
+  type         = {Report},
+  number       = {TKK-F-A601},
+  institution  = {Helsinki University of Technology},
+  address      = {Espoo, Finland},
+  year         = {1986},
+}
+
+@InProceedings{Kohonen88,
+  author =       "T. Kohonen and G. Barna and R. Chrisley",
+  booktitle =    icnn,
+  title =        "Statistical Pattern Recognition with Neural Networks:
+                 Benchmarking Studies",
+  volume =       "1",
+  publisher =    "IEEE",
+  address =      "New York",
+  pages =        "61--68",
+  year =         "1988",
+  note =         "Conference held in San Diego",
+}
+
+@book{Kohonen89,
+  author       = {T. Kohonen},
+  title        = {Self-Organization and Associative Memory},
+  edition      = {3},
+  publisher    = {Springer-Verlag},
+  address      = {Berlin},
+  year         = {1989},
+}
+
+@book{Kohonen-2001,
+  author       = {T. Kohonen},
+  title        = {Self-Organizing Maps},
+  edition      = {3},
+  publisher    = {Springer},
+  year         = {2001},
+}
+
+@Article{Kolchinskii2000,
+  author =       "V. Koltchinskii and E. Gin{\'e}",
+  title =        "Random matrix approximation of spectra of integral
+                 operators",
+  journal =      "Bernoulli",
+  volume =       "6",
+  number =       "1",
+  pages =        "113--167",
+  year =         "2000",
+}
+
+@techreport{Kolen+Pollack90,
+  author       = {J. F. Kolen and J. B. Pollack},
+  key          = {kolen},
+  title        = {Back propagation is sensitive to initial conditions},
+  type         = {Technical Report},
+  number       = {TR 90-{JK}-{BPSIC}},
+  institution  = {The Ohio State University},
+  year         = {1990},
+}
+
+@InProceedings{Kolen-nips94,
+  author =       "John F. Kolen",
+  editor =       NIPS6ed,
+  booktitle =    NIPS6,
+  title =        "Fool's Gold: Extracting Finite State Machines From
+                 Recurrent Network Dynamics",
+  publisher =    "Morgan Kaufmann",
+  year =         "1994",
+}
+
+@Article{Kolmogorov33,
+  author =       "A. N. Kolmogorov",
+  title =        "Sulla determinazione empirica di una legge di
+                 distribuzione",
+  journal =      "Giornale dell'Istituto Italiano degli Attuari",
+  volume =       "4",
+  year =         "1933",
+  note =         "translated in English in {\em Breakthroughs in
+                 Statistics}, by Kotz and Johnson (editors),
+                 Springer-Verlag, 1992",
+}
+
+@Article{Kolmogorov57,
+  author =       "A. N. Kolmogorov",
+  title =        "On the representation of continuous functions of many
+                 variables by superposition of continuous functions of
+                 one variable and addition",
+  journal =      "Doklady Akademii Nauk SSSR",
+  volume =       "114",
+  note =         "Translated in: American Mathematical Society
+                 Translations 28 (1963) 55--59",
+  pages =        "953--956",
+  year =         "1957",
+}
+
+@Article{Kolmogorov65,
+  author =       "A. N. Kolmogorov",
+  title =        "Three approaches to the quantitative definition of
+                 information",
+  journal =      "Problems of Information Transmission",
+  volume =       "1",
+  number =       "1",
+  pages =        "1--7",
+  year =         "1965",
+}
+
+@InProceedings{Koltchinskii-1998,
+  author =       "V. Koltchinskii",
+  editor =       "E. Eberlein and M. Hahn and M. Talagrand",
+  booktitle =    "Progress in Probability",
+  title =        "Asymptotics of Spectral Projections of Some Random
+                 Matrices Approximating Integral Operators",
+  volume =       "43",
+  publisher =    "Birkh{\"a}user",
+  address =      "Basel",
+  pages =        "191--227",
+  year =         "1998",
+}
+
+@InProceedings{Kong95,
+  author =       "Eun Bae Kong and Thomas G. Dietterich",
+  booktitle =    "International Conference on Machine Learning",
+  title =        "Error-Correcting Output Coding Corrects Bias and
+                 Variance",
+  pages =        "313--321",
+  year =         "1995",
+}
+
+@InProceedings{Konig96,
+  author =       "Y. Konig and H. Bourlard and N. Morgan",
+  editor =       NIPS8ed,
+  booktitle =    NIPS8,
+  title =        "{REMAP}: Recursive Estimation and Maximization of {A}
+                 Posteriori Probabilities -- Application to
+                 transition-based connectionist speech recognition",
+  publisher =    "MIT Press, Cambridge, MA",
+  year =         "1996",
+}
+
+@inproceedings{Koray-08,
+ title = "Learning Invariant Features through Topographic Filter Maps",
+ author = "Kavukcuoglu, Koray and Ranzato, {Marc'Aurelio} and Fergus, Rob and {LeCun}, Yann",
+ booktitle = cvpr09,
+ publisher = "IEEE",
+ year = "2009"
+}
+
+@techreport {koray-psd-08,
+ original = "orig/koray-psd-08.pdf",
+ title = "Fast Inference in Sparse Coding Algorithms with Applications to Object Recognition",
+ author = "Kavukcuoglu, Koray and Ranzato, {Marc'Aurelio} and {LeCun}, Yann",
+ institution = "Computational and Biological Learning Lab, Courant Institute, NYU",
+ number = "CBLL-TR-2008-12-01",
+ year = "2008"
+}
+
+@article{Kouh-Poggio-2008,
+  author = {Minjoon M. Kouh and Tomaso T. Poggio},
+  title = {A Canonical Neural Circuit for Cortical Nonlinear Operations},
+  journal = {Neural Computation},
+  volume = 20,
+  number={6},
+  pages = {1427--1451},
+  year = 2008,
+}
+
+@TechReport{Kouropteva+al-2002,
+    author =       {O. Kouropteva and O. Okun and A. Hadid and M. Soriano and S. Marcos and M. Pietik{\"a}inen},
+    title =        {Beyond locally linear embedding algorithm},
+    number =       {MVG-01-2002},
+    institution =  {Department of Electrical and Information Engineering, University of Oulu},
+    address =      {Oulu, Finland},
+    year =         2002,
+}
+
+@inproceedings{Kononenko-1994,
+    author = {Kononenko, Igor},
+    booktitle = ECML94,
+    pages = {171--182},
+    editor = {F. Bergadano and L. {De Raedt}},
+    title = {Estimating Attributes: Analysis and Extensions of {RELIEF}},
+    url = {http://citeseer.ist.psu.edu/kononenko94estimating.html},
+    year = {1994}
+}
+
+@InProceedings{Kozma96,
+  author =       "R. Kozma and M. Kitamura and S. Sato",
+  booktitle =    nipc-hmit96,
+  title =        "Monitoring of {NPP} State using Structural Adaptation
+                 in a Neural Signal Processing System",
+  volume =       "1",
+  publisher =    ans,
+  pages =        "273--278",
+  year =         "1996",
+}
+
+@Article{Kramer1991,
+  author =       "Mark A. Kramer",
+  title =        "Nonlinear Principal Component Analysis Using
+                 Autoassociative Neural Networks",
+  journal =      "AIChE Journal",
+  volume =       "37",
+  pages =        "233--243",
+  year =         "1991",
+}
+
+@InProceedings{Kramer89,
+  author =       "A. H. Kramer and A. Sangiovanni-Vincentelli",
+  editor =       NIPS1ed,
+  booktitle =    NIPS1,
+  title =        "Efficient Parallel Learning Algorithms for Neural
+                 Networks",
+  publisher =    "Morgan Kaufmann, San Mateo",
+  address =      "Denver, CO",
+  pages =        "40--48",
+  year =         "1989",
+}
+
+@Article{Krauth89,
+  author =       "W. Krauth and M. M{\'e}zard",
+  title =        "The Cavity Method and the Travelling-Salesman
+                 Problem",
+  journal =      eul,
+  volume =       "8",
+  pages =        "213--218",
+  year =         "1989",
+}
+
+@Book{Kreyszig90,
+  author =       "E. Kreyszig",
+  title =        "Introductory Functional Analysis with Applications",
+  publisher =    "John Wiley \& Sons, Inc.",
+  address =      "New York, NY",
+  year =         "1990",
+}
+
+@Book{Krishnaiah82,
+  editor =       "P. R. Krishnaiah and L. N. Kanal",
+  title =        "Classification, Pattern Recognition, and Reduction of
+                 Dimensionality",
+  volume =       "2",
+  publisher =    "North Holland",
+  address =      "Amsterdam",
+  year =         "1982",
+  series =       "Handbook of Statistics",
+}
+
+@techreport{KrizhevskyHinton2009,
+    author={Alex Krizhevsky and Geoffrey Hinton},
+    title = {Learning Multiple Layers of Features from Tiny Images},
+    year = 2009,
+    chapter=3,
+    institution={University of Toronto}
+}
+
+@InProceedings{Krogh-nips8,
+  author =       "A. Krogh and S. K. Riis",
+  editor =       NIPS8ed,
+  booktitle =    NIPS8,
+  title =        "Prediction of beta sheets in proteins",
+  publisher =    "MIT Press, Cambridge, MA",
+  pages =        "917--923",
+  year =         "1996",
+}
+
+@Article{Krogh88,
+  author =       "A. Krogh and J. A. Hertz",
+  title =        "Mean Field Analysis of Hierarchical Associative
+                 Networks with Magnetization",
+  journal =      jpa,
+  volume =       "21",
+  pages =        "2211--2224",
+  year =         "1988",
+}
+
+@InProceedings{Krogh90a,
+  author =       "A. Krogh and G. I. Thorbergsson and J. A. Hertz",
+  editor =       NIPS2ed,
+  booktitle =    NIPS2,
+  title =        "A Cost Function for Internal Representations",
+  publisher =    "Morgan Kaufmann, San Mateo",
+  address =      "Denver, CO",
+  pages =        "733--740",
+  year =         "1990",
+}
+
+@InProceedings{Krogh90b,
+  author =       "A. Krogh and J. A. Hertz",
+  editor =       "R. Eckmiller and G. Hartmann and G. Hauske",
+  booktitle =    "Parallel Processing in Neural Systems and Computers",
+  title =        "Hebbian Learning of Principal Components",
+  publisher =    "Elsevier, Amsterdam",
+  address =      "D{\"u}sseldorf 1990",
+  pages =        "183--186",
+  year =         "1990",
+}
+
+@Article{Krogh94,
+  author =       "A. Krogh and M. Brown and I. S. Mian and K. Sj{\"o}lander
+                 and D. Haussler",
+  title =        "Hidden Markov models in computational biology:
+                 Applications to protein modeling",
+  journal =      "Journal of Molecular Biology",
+  volume =       "235",
+  pages =        "1501--1531",
+  year =         "1994",
+}
+
+@InProceedings{Krogh95,
+  author =       "A. Krogh and J. Vedelsby",
+  editor =       NIPS7ed,
+  booktitle =    NIPS7,
+  title =        "Neural network ensembles, cross validation and active
+                 learning",
+  publisher =    "Cambridge MA: MIT Press",
+  pages =        "231--238",
+  year =         "1995",
+}
+
+@Book{Krolzig97,
+  author =       "H.-M. Krolzig",
+  title =        "Markov-Switching Vector Autoregressions",
+  publisher =    "Springer",
+  year =         "1997",
+}
+
+@article{Krueger+Dayan-2009,
+ author = {Kai A. Krueger and Peter Dayan},
+ title = {Flexible shaping: how learning in small steps helps},
+ journal = {Cognition},
+ volume = 110,
+ year = 2009,
+ pages = {380--394},
+}
+
+@Article{Ku92,
+  author =       "C. C. Ku and K. Y. Lee and R. M. Edwards",
+  title =        "Improved Nuclear Reactor Temperature Control Using
+                 Diagonal Recurrent Neural Networks",
+  journal =      "IEEE Transactions on Nuclear Science",
+  volume =       "39",
+  pages =        "2292--2308",
+  year =         "1992",
+}
+
+@InProceedings{Kubala94,
+  author =       "F. Kubala and A. Anastasakos and J. Makhoul and L.
+                 Nguyen and R. Schwartz and G. Zavaliagkos",
+  booktitle =    icassp,
+  title =        "Comparative experiments on large vocabulary speech
+                 recognition",
+  address =      "Adelaide, Australia",
+  pages =        "561--564",
+  year =         "1994",
+}
+
+@InProceedings{Kuhn+Herzberg90,
+  author =       "G. Kuhn and N. Herzberg",
+  booktitle =    "Proc. 24th Conference on Information Sciences and
+                 Systems",
+  title =        "Variations on training of recurrent networks",
+  organization = "Princeton University",
+  address =      "NJ",
+  year =         "1990",
+}
+
+@Unpublished{Kuhn87,
+  author =       "G. Kuhn",
+  title =        "A first look at phonetic discrimination using
+                 connectionist models with recurrent links",
+  year =         "1987",
+  note =         "CCRP -- IDA SCIMP working paper No.4/87, Institute for
+                 Defense Analysis, Princeton, NJ",
+}
+
+@Article{Kuhn-et-al-90,
+  author =       "G. Kuhn and R. L. Watrous and B. Ladendorf",
+  title =        "Connected recognition with a recurrent network",
+  journal =      spcomm,
+  volume =       "9",
+  pages =        "41--49",
+  year =         "1990",
+  OPTnote =      "",
+}
+
+@Book{Kullback59,
+  author =       "S. Kullback",
+  title =        "Information Theory and Statistics",
+  publisher =    "Wiley",
+  address =      "New York",
+  year =         "1959",
+}
+
+@Book{Kumar+al-1994,
+  author =       "V. Kumar and A. Grama and A. Gupta and G. Karypis",
+  title =        "Introduction to Parallel Computing: Design and
+                 Analysis of Algorithms",
+  publisher =    "Benjamin Cummings",
+  address =      "Redwood City, CA",
+  year =         "1994",
+}
+
+@Article{Kumar+al-1994b,
+  author =       "Vipin Kumar and Shashi Shekhar and Minesh B. Amin",
+  title =        "A Scalable Parallel Formulation of the Backpropagation
+                 Algorithm for Hypercubes and Related Architectures",
+  journal =      "IEEE Transactions on Parallel and Distributed
+                 Systems",
+  volume =       "5",
+  number =       "10",
+  pages =        "1073--1090",
+  year =         "1994",
+}
+
+@InProceedings{Kundu88,
+  author =       "A. Kundu and L. R. Bahl",
+  booktitle =    icassp,
+  title =        "Recognition of handwritten script: a hidden {Markov}
+                 model based approach",
+  address =      "New York, NY",
+  pages =        "928--931",
+  year =         "1988",
+}
+
+@Article{Kuperstein88,
+  author =       "M. Kuperstein",
+  title =        "Neural model of adaptive hand-eye coordination for
+                 single postures",
+  journal =      "Science",
+  volume =       "239",
+  pages =        "1308--1311",
+  year =         "1988",
+}
+
+@Article{Kurkova95,
+  author =       "V. K{\r{u}}rkov{\'a}",
+  title =        "Approximation of functions by perceptron networks with
+                 bounded number of hidden units",
+  journal =      "Neural Networks",
+  volume =       "8",
+  pages =        "745--750",
+  year =         "1995",
+}
+
+@Book{Kushner78,
+  author =       "H. J. Kushner and D. S. Clark",
+  title =        "Stochastic Approximation Methods for Constrained and
+                 Unconstrained Systems",
+  publisher =    "Springer-Verlag",
+  address =      "New York",
+  year =         "1978",
+}
+
+@InProceedings{Kwok-Tsang-2003,
+  author =       "J. T. Kwok and I. W. Tsang",
+  booktitle =    ICML03,
+  editor =       ICML03ed,
+  publisher =    ICML03publ,
+  title =        "Learning with idealized kernels",
+  pages =        "400--407",
+  year =         "2003",
+}
+
+@InProceedings{Laaksonen97,
+  author =       "Jorma Laaksonen",
+  booktitle =    "Proceedings of the International Conference on
+                 Artificial Neural Networks ICANN'97",
+  title =        "Local Subspace Classifier",
+  pages =        "637--642",
+  year =         "1997",
+  URL =          "http://www.cis.hut.fi/jorma/papers/abstracts.html#icann97",
+}
+
+@InProceedings{Lafferty-icml2001,
+  author =       "John Lafferty and Andrew McCallum and Fernando C. N. Pereira",
+  booktitle =    ICML01,
+  editor =       ICML01ed,
+  publisher =    ICML01publ,
+  title =        "Conditional Random Fields: Probabilistic Models for
+                 Segmenting and Labeling Sequence Data",
+  year =         "2001",
+}
+
+@article{Lai+Fyfe-2000,
+    author = {P. L. Lai and C. Fyfe},
+    title = {Kernel and Nonlinear Canonical Correlation Analysis},
+    journal = {International Journal of Neural Systems},
+    year = {2000},
+    pages = {365--377},
+    volume = 10,
+    number = 5,
+}
+
+@InProceedings{Laj92,
+  author =       "E. Laj and A. Paoloni",
+  editor =       "M. Gori",
+  booktitle =    "Proc. of the Second Workshop on Neural Networks for
+                 Speech Processing",
+  title =        "{AIDA}: The Italian Corpora",
+  publisher =    "LINT",
+  address =      "Firenze (Italy)",
+  pages =        "179--183",
+  year =         "1992",
+}
+
+@InProceedings{Lanckriet-2002,
+  author =       "G. Lanckriet and N. Cristianini and P. Bartlett and L.
+                 {El Ghaoui} and M. Jordan",
+  booktitle =    ICML02,
+  editor =       ICML02ed,
+  publisher =    ICML02publ,
+  title =        "Learning the kernel matrix with semi-definite
+                 programming",
+  pages =        "323--330",
+  year =         "2002",
+}
+
+@Article{Lanckriet2004,
+  author =       "Gert R. G. Lanckriet and Nello Cristianini and Peter
+                 Bartlett and Laurent {El Ghaoui} and Michael I. Jordan",
+  title =        "Learning the Kernel Matrix with Semidefinite
+                 Programming",
+  journal =      jmlr,
+  volume =       "5",
+  pages =        "27--72",
+  year =         "2004",
+}
+
+@TechReport{Lang+Hinton88,
+  author =       "K. J. Lang and G. E. Hinton",
+  title =        "The development of the Time-Delay Neural Network
+                 architecture for speech recognition",
+  number =       "CMU-CS-88-152",
+  institution =  "Carnegie-Mellon University",
+  year =         "1988",
+}
+
+@Unpublished{Langdell-00-nips,
+  author =       "S. Langdell and Y. Bengio",
+  title =        "Approximate {SVM} Solutions: a Datamining Tool",
+  note =         "Submitted to NIPS'2000",
+  year =         "2000",
+}
+
+@InProceedings{Langford+Zadrozny-2005,
+  author =       "John Langford and Bianca Zadrozny",
+  editor =       aistats05ed,
+  booktitle =    aistats05,
+  title =        "Estimating Class Membership Probabilities using
+                 Classifier Learners",
+  publisher =    "Society for Artificial Intelligence and Statistics",
+  pages =        "198--205",
+  year =         "2005",
+}
+
+@Article{Lapedes86a,
+  author =       "A. Lapedes and R. Farber",
+  title =        "A Self-Optimizing, Nonsymmetrical Neural Net for
+                 Content Addressable Memory and Pattern Recognition",
+  journal =      physicaD,
+  volume =       "22",
+  pages =        "247--259",
+  year =         "1986",
+}
+
+@InProceedings{Lapedes86b,
+  author =       "A. Lapedes and R. Farber",
+  editor =       "J. S. Denker",
+  booktitle =    snowbird,
+  title =        "Programming a Massively Parallel, Computation
+                 Universal System: Static Behavior",
+  publisher =    "American Institute of Physics, New York",
+  address =      "Snowbird 1986",
+  pages =        "283--298",
+  year =         "1986",
+}
+
+@TechReport{Lapedes87,
+  author =       "A. Lapedes and R. Farber",
+  title =        "Nonlinear Signal Processing Using Neural Networks:
+                 Prediction and System Modelling",
+  number =       "LA--UR--87--2662",
+  institution =  "Los Alamos National Laboratory",
+  address =      "Los Alamos, NM",
+  year =         "1987",
+}
+
+@InProceedings{Lapedes88,
+  author =       "A. Lapedes and R. Farber",
+  editor =       nips87ed,
+  booktitle =    nips87,
+  title =        "How Neural Nets Work",
+  publisher =    "American Institute of Physics, New York",
+  address =      "Denver, CO",
+  pages =        "442--456",
+  year =         "1988",
+}
+
+@Article{Lari90,
+  author =       "K. Lari and S. J. Young",
+  title =        "The estimation of stochastic context-free grammars
+                 using the Inside-Outside algorithm",
+  journal =      cspla,
+  volume =       "4",
+  pages =        "35--56",
+  year =         "1990",
+}
+
+@inproceedings{Tieleman08,
+    author = {Tijmen Tieleman},
+    title = {Training restricted Boltzmann machines using approximations to the likelihood gradient},
+    booktitle = ICML08,
+    editor =    ICML08ed,
+    publisher = ICML08publ,
+    location = {Helsinki, Finland},
+    year = {2008},
+    pages = {1064--1071}
+}
+
+@InProceedings{TielemanT2009,
+ author =    {Tijmen Tieleman and Geoffrey Hinton},
+ title =     {Using Fast Weights to Improve Persistent Contrastive Divergence},
+ booktitle = ICML09,
+ editor =    ICML09ed,
+ publisher = ICML09publ,
+ year =      "2009",
+ isbn =      {978-1-60558-516-1},
+ pages =     {1033--1040},
+ location =  icml09loc,
+ doi =       {10.1145/1553374.1553506},
+}
+
+@article{Larochelle-jmlr-toappear-2008,
+ author = {Hugo Larochelle and Yoshua Bengio and Jerome Louradour and Pascal Lamblin},
+ title = {Exploring Strategies for Training Deep Neural Networks},
+ journal = jmlr,
+ year = 2009,
+ volume = 10,
+ pages = {1--40},
+}
+
+@InProceedings{LarochelleH2007-small,
+  author =       "H. Larochelle and D. Erhan and A. Courville and
+                 J. Bergstra and Y. Bengio",
+  booktitle =    "ICML 2007",
+  title =        "An Empirical Evaluation of Deep Architectures on
+                 Problems with Many Factors of Variation",
+  year =         "2007",
+}
+
+@InProceedings{LarochelleH2007-short,
+  author =       "H. Larochelle and D. Erhan and A. Courville and
+                 J. Bergstra and Y. Bengio",
+  booktitle =    "Int. Conf. Mach. Learn.",
+  title =        "An Empirical Evaluation of Deep Architectures on
+                 Problems with Many Factors of Variation",
+  year =         "2007",
+  pages =        "473--480",
+}
+
+% Deprecated: the following entry is a duplicate of LarochelleH2007.
+@InProceedings{larochelle-icml-2007,
+  author =       "Hugo Larochelle and Dumitru Erhan and Aaron Courville
+                 and James Bergstra and Yoshua Bengio",
+  booktitle =    ICML07,
+  editor =       ICML07ed,
+  publisher =    ICML07publ,
+  title =        "An Empirical Evaluation of Deep Architectures on
+                 Problems with Many Factors of Variation",
+  pages =        "473--480",
+  location =     "Corvallis, OR",
+  year =         "2007",
+}
+  %url =          "http://www.machinelearning.org/proceedings/icml2007/papers/331.pdf",
+
+% Deprecated: the following entry is a duplicate of LarochelleH2007.
+@InProceedings{larochelle:icml07,
+  author =       "Hugo Larochelle and Dumitru Erhan and Aaron Courville and
+                 James Bergstra and Yoshua Bengio",
+  booktitle =    ICML07,
+  editor =       ICML07ed,
+  publisher =    ICML07publ,
+  title =        "An empirical evaluation of deep architectures on
+                 problems with many factors of variation",
+  pages =        "473--480",
+  year =         "2007",
+  location =     "Corvallis, OR",
+  url =          "http://www.machinelearning.org/proceedings/icml2007/papers/331.pdf",
+}
+
+@inproceedings{Larochelle+Bengio-2008-small,
+    author = "Hugo Larochelle and Yoshua Bengio",
+    title = {Classification using Discriminative Restricted {Boltzmann} Machines},
+    booktitle = {Proceedings of ICML 2008},
+    year = {2008},
+    pages = {536--543}
+}
+
+@InCollection{Larsen98,
+  author =       "Jan Larsen and Claus Svarer and Lars Nonboe Andersen
+                 and Lars Kai Hansen",
+  editor =       "G. B. Orr and K.-R. M{\"u}ller",
+  booktitle =    "Neural Networks: Tricks of the Trade",
+  title =        "Adaptive Regularization in Neural Networks Modeling",
+  publisher =    "Springer",
+  pages =        "113--132",
+  year =         "1998",
+}
+
+ 
+@InProceedings{LasserreJ2006,
+  author =       "Julia A. Lasserre and Christopher M. Bishop and
+                 Thomas P. Minka",
+  booktitle =    cvpr06,
+  title =        "Principled Hybrids of Generative and Discriminative
+                 Models",
+  publisher =    "IEEE Computer Society",
+  address =      "Washington, DC, USA",
+  pages =        "87--94",
+  year =         "2006",
+  ISBN =         "0-7695-2597-0",
+  doi =          "10.1109/CVPR.2006.227",
+}
+
+
+@TechReport{Laub2003,
+  author =       "J. Laub and K.-R. M{\"u}ller",
+  title =        "Feature discovery: unraveling hidden structure in
+                 non-metric pairwise data",
+  institution =  "Fraunhofer FIRST.IDA",
+  address =      "Germany",
+  year =         "2003",
+}
+
+@Article{Lauritzen95,
+  author =       "Steffen L. Lauritzen",
+  title =        "The {EM} algorithm for graphical association models
+                 with missing data",
+  journal =      "Computational Statistics and Data Analysis",
+  volume =       "19",
+  pages =        "191--201",
+  year =         "1995",
+}
+
+@Book{Lauritzen96,
+  author =       "Steffen L. Lauritzen",
+  title =        "Graphical Models",
+  publisher =    "Clarendon Press",
+  address =      "Oxford",
+  year =         "1996",
+  ISBN =         "0-19-852219-3",
+}
+
+@Book{Lawler76,
+  author =       "E. L. Lawler",
+  title =        "Combinatorial Optimization: Networks and Matroids",
+  publisher =    "Holt-Rinehart-Winston",
+  address =      "New York",
+  year =         "1976",
+}
+
+@Book{Lawler85,
+  editor =       "E. L. Lawler and J. K. Lenstra and A. H. G.
+                 {Rinnooy Kan} and D. B. Shmoys",
+  title =        "The Travelling Salesman Problem",
+  publisher =    "Wiley",
+  address =      "Chichester",
+  year =         "1985",
+}
+
+@InProceedings{Lawrence-Seeger-Herbrich-2003,
+  author =       "Neil Lawrence and Matthias Seeger and Ralf Herbrich",
+  editor =       NIPS15ed,
+  booktitle =    NIPS15,
+  title =        "Fast Sparse {G}aussian Process Methods: The Informative
+                 Vector Machine",
+  publisher =    "{MIT} Press",
+  pages =        "609--616",
+  year =         "2003",
+}
+
+@Article{Lawrence00,
+  author =       "S. Lawrence and C. L. Giles and S. Fong",
+  title =        "Natural Language Grammatical Inference with Recurrent
+                 Neural Networks",
+  journal =      "IEEE Trans. on Knowledge and Data Engineering",
+  pages =        "126--140",
+  year =         "2000",
+}
+
+@InCollection{Lawrence96,
+  author =       "S. Lawrence and S. Fong and C. L. Giles",
+  editor =       "S. Wermter and E. Riloff and G. Scheler",
+  booktitle =    "Lecture Notes on Artificial Intelligence,
+                 Connectionist, Statistical and Symbolic Approaches to
+                 Learning for Natural Language Processing",
+  title =        "Natural Language Grammatical Inference: {A} Comparison
+                 of Recurrent Neural Networks and Machine Learning
+                 Methods",
+  publisher =    "Springer-Verlag, NY",
+  year =         "1996",
+}
+
+@InCollection{LawrenceN2005,
+  author =       "Neil D. {Lawrence} and Michael I. {Jordan}",
+  editor =       NIPS17ed,
+  booktitle =    NIPS17,
+  title =        "Semi-supervised Learning via {G}aussian Processes",
+  publisher =    "MIT Press",
+  address =      "Cambridge, MA",
+  pages =        "753--760",
+  year =         "2005",
+  original =     "0753-257.PDF",
+}
+
+@TechReport{LeBaron95,
+  author =       "B. LeBaron and A. S. Weigend",
+  title =        "Evaluating Neural Network Predictors by
+                 Bootstrapping",
+  number =       "CU-CS-725-94",
+  institution =  "University of Colorado, Boulder",
+  year =         "1995",
+}
+
+@Article{LeCun+98,
+  author =       "Yann {LeCun} and L{\'e}on Bottou and Yoshua Bengio and
+                 Patrick Haffner",
+  title =        "Gradient-Based Learning Applied to Document
+                 Recognition",
+  journal =      "Proceedings of the {IEEE}",
+  volume =       "86",
+  number =       "11",
+  pages =        "2278--2324",
+  month =        nov,
+  year =         "1998",
+}
+
+@InCollection{LeCun+98backprop,
+  author =       "Yann {LeCun} and L{\'e}on Bottou and Genevieve B. Orr
+                 and Klaus-Robert M{\"u}ller",
+  title =        "Efficient Backprop",
+  booktitle =    "Neural Networks: Tricks of the Trade",
+  series =       "Lecture Notes in Computer Science LNCS~1524",
+  publisher =    "Springer Verlag",
+  year =         "1998",
+}
+  %URL =          "http://leon.bottou.org/papers/lecun-98x",
+
+
+@InCollection{LeCun+98backprop-small,
+  author =       "Y. {LeCun} and L. Bottou and G. B. Orr
+                 and K. M{\"{u}}ller",
+  title =        "Efficient Backprop",
+  booktitle =    "Neural Networks, Tricks of the Trade",
+  year =         "1998",
+}
+
+
+@InProceedings{lecun-04,
+  author =       "Yann {LeCun} and Fu-Jie Huang and L{\'e}on Bottou",
+  booktitle =    cvpr04,
+  title =        "Learning Methods for Generic Object Recognition with
+                 Invariance to Pose and Lighting",
+  volume = {2},
+  year =         "2004",
+  issn = {1063-6919},
+  pages = {97--104},
+  doi = {10.1109/CVPR.2004.144},
+  publisher = {IEEE Computer Society},
+  address = {Los Alamitos, CA, USA},
+}
+
+@InProceedings{LeCun-cp89,
+  author =       "Yann {LeCun}",
+  booktitle =    "Connectionism in Perspective",
+  title =        "Generalization and Network Design Strategies",
+  publisher =    "Elsevier Publishers",
+  year =         "1989",
+}
+
+@InCollection{LeCun-dsbo86,
+  author =       "Yann {LeCun}",
+  editor =       "F. Fogelman-Souli\'e and E. Bienenstock and G.
+                 Weisbuch",
+  booktitle =    "Disordered Systems and Biological Organization",
+  title =        "Learning Processes in an Asymmetric Threshold
+                 Network",
+  publisher =    "Springer-Verlag",
+  address =      "Les Houches, France",
+  pages =        "233--240",
+  year =         "1986",
+}
+
+@InProceedings{lecun-huang-05,
+  author =       "Yann {LeCun} and {Fu Jie} Huang",
+  editor =       aistats05ed,
+  booktitle =    aistats05,
+  title =        "Loss Functions for Discriminative Training of
+                 Energy-Based Models",
+  date =         "Jan 6-8, 2005",
+  location =     "Savannah Hotel, Barbados",
+  year =         "2005",
+}
+
+@Misc{LeCun-nips93-tutorial,
+  author =       "Yann {LeCun}",
+  title =        "Efficient learning and second-order methods",
+  year =         "1993",
+  note =         "Tutorial presented at NIPS'93, Denver, CO",
+}
+
+@PhdThesis{Lecun-these87,
+  author =       "Yann {LeCun}",
+  title =        "Mod{\`e}les connexionnistes de l'apprentissage",
+  school =       "Universit{\'e} de Paris VI",
+  year =         "1987",
+}
+
+@InCollection{lecun2006,
+  author =       "Yann {LeCun} and Sumit Chopra and Raia Hadsell and
+                 {Marc'Aurelio} Ranzato and Fu-Jie Huang",
+  editor =       "G. Bakir and T. Hofmann and B. Sch{\"o}lkopf and A. Smola
+                 and B. Taskar",
+  booktitle =    "Predicting Structured Data",
+  title =        "A Tutorial on Energy-Based Learning",
+  publisher =    "MIT Press",
+  pages =        "191--246",
+  year =         "2006",
+}
+
+@InProceedings{LeCun85,
+  author =       "Yann {LeCun}",
+  booktitle =    "Cognitiva 85: A la Fronti\`ere de l'Intelligence
+                 Artificielle, des Sciences de la Connaissance et des
+                 Neurosciences",
+  title =        "Une Proc\'edure d'Apprentissage pour {R}\'eseau \`a
+                 Seuil Assym\'etrique",
+  publisher =    "CESTA, Paris",
+  address =      "Paris 1985",
+  pages =        "599--604",
+  year =         "1985",
+}
+
+@InCollection{LeCun86,
+  author =       "Yann {LeCun}",
+  editor =       "E. Bienenstock and F. Fogelman-Souli\'e and G.
+                 Weisbuch",
+  booktitle =    "Disordered Systems and Biological Organization",
+  title =        "Learning Processes in an Asymmetric Threshold
+                 Network",
+  publisher =    "Springer-Verlag, Berlin",
+  address =      "Les Houches 1985",
+  pages =        "233--240",
+  year =         "1986",
+}
+
+@Article{LeCun89,
+  author =       "Yann {LeCun} and Bernhard Boser and John S. Denker and Donnie
+                 Henderson and Richard E. Howard and Wayne Hubbard and Lawrence D.
+                 Jackel",
+  title =        "Backpropagation Applied to Handwritten Zip Code
+                 Recognition",
+  journal =      nc,
+  volume =       "1",
+  number =       "4",
+  pages =        "541--551",
+  year =         "1989",
+}
+
+@TechReport{LeCun89a,
+  author =       "Yann {LeCun}",
+  key =          "LeCun",
+  title =        "Generalization and Network Design Strategies",
+  type =         "Technical Report",
+  number =       "CRG-TR-89-4",
+  institution =  "University of Toronto",
+  year =         "1989",
+}
+
+@Article{LeCun89d,
+  author =       "Yann {LeCun} and Lawrence D. Jackel and B. Boser and J.
+                 S. Denker and Hans P. Graf and I. Guyon and D.
+                 Henderson and R. E. Howard and W. Hubbard",
+  title =        "Handwritten Digit recognition: Applications of Neural
+                 Network Chips and Automatic Learning",
+  journal =      "IEEE Communications Magazine",
+  volume =       "27",
+  number =       "11",
+  pages =        "41--46",
+  month =        nov,
+  year =         "1989",
+}
+
+@InProceedings{LeCun90a,
+  author =       "Y. {LeCun} and B. Boser and J. S. Denker and D.
+                 Henderson and R. E. Howard and W. Hubbard and L. D.
+                 Jackel",
+  editor =       NIPS2ed,
+  booktitle =    NIPS2,
+  title =        "Handwritten Digit Recognition with a Back-Propagation
+                 Network",
+  publisher =    "Morgan Kaufmann, San Mateo",
+  address =      "Denver, CO",
+  pages =        "396--404",
+  year =         "1990",
+}
+
+@InProceedings{LeCun90b,
+  author =       "Y. {LeCun} and J. S. Denker and S. A. Solla",
+  editor =       NIPS2ed,
+  booktitle =    NIPS2,
+  title =        "Optimal Brain Damage",
+  publisher =    "Morgan Kaufmann, San Mateo",
+  address =      "Denver, CO",
+  pages =        "598--605",
+  year =         "1990",
+}
+
+@InProceedings{LeCun90c,
+  author =       "Y. {LeCun} and Y. Matan and B. Boser and J. S. Denker
+                 and D. Henderson and R. E. Howard and W. Hubbard and L.
+                 D. Jackel and H. S. Baird",
+  organization = "IAPR",
+  booktitle =    "International Conference on Pattern Recognition",
+  title =        "Handwritten Zip Code Recognition with Multilayer
+                 Networks",
+  publisher =    "IEEE",
+  address =      "Atlantic City",
+  year =         "1990",
+}
+
+@InProceedings{LeCun91,
+  author =       "Y. {LeCun} and I. Kanter and S. Solla",
+  editor =       NIPS3ed,
+  booktitle =    NIPS3,
+  title =        "Second order properties of error surfaces: learning
+                 time, generalization",
+  publisher =    "Morgan Kaufmann",
+  address =      "Denver, CO",
+  pages =        "918--924",
+  year =         "1991",
+}
+
+@InCollection{LeCun93,
+  author =       "Y. {LeCun} and P. Simard and B. Pearlmutter",
+  editor =       NIPS5ed,
+  booktitle =    NIPS5,
+  title =        "Automatic learning rate maximization by on-line
+                 estimation of the {Hessian}'s eigenvectors",
+  publisher =    "Morgan Kaufmann Publishers, San Mateo, CA",
+  pages =        "156--163",
+  year =         "1993",
+}
+
+@InProceedings{LeCun94b,
+  author =       "Yann LeCun and Yoshua Bengio",
+  editor =       "IEEE",
+  booktitle =    ICPR94,
+  title =        "Word-Level Training of a Handwritten Word Recognizer
+                 based on Convolutional Neural Networks",
+  address =      "Jerusalem 1994",
+  year =         "1994",
+}
+
+@Article{LeCun98-small,
+  author =       "Y. {LeCun} and L. Bottou and Y. Bengio and
+                 P. Haffner",
+  title =        "Gradient-Based Learning Applied to Document
+                 Recognition",
+  journal =      "Proc. {IEEE}",
+  volume =       "86",
+  number =       "11",
+  pages =        "2278--2324",
+  month =        nov,
+  year =         "1998",
+}
+
+@InCollection{LeCun98-tricks,
+  author =       "Y. {LeCun} and L. Bottou and G. B. Orr and K.-R.
+                 M{\"u}ller",
+  editor =       "G. B. Orr and K.-R. M{\"u}ller",
+  booktitle =    "Neural Networks: Tricks of the Trade",
+  title =        "Efficient {BackProp}",
+  publisher =    "Springer",
+  pages =        "9--50",
+  year =         "1998",
+}
+
+@TechReport{LeCun-TR,
+  author =       "Yann {LeCun}",
+  key =          "Lecun",
+  title =        "Generalization and Network Design Strategies",
+  number =       "CRG-TR-89-4",
+  institution =  "Department of Computer Science, University of
+                 Toronto",
+  year =         "1989",
+}
+
+@Article{Lee+Hon89,
+  author =       "Kai-Fu Lee and Hsiao-Wuen Hon",
+  title =        "Speaker-independent phone recognition using hidden
+                 {Markov} models",
+  journal =      "IEEE Trans. on Acoustics, Speech and Signal
+                 Processing",
+  volume =       "37",
+  number =       "11",
+  pages =        "1641--1648",
+  month =        nov,
+  year =         "1989",
+}
+
+@Article{Lee+Lewicki-2002,
+  author =       "T-W. Lee and M. S. Lewicki",
+  title =        "Unsupervised classification segmentation and
+                 enhancement of images using {ICA} mixture models",
+  journal =      "IEEE Trans. Image Proc.",
+  volume =       "11",
+  number =       "3",
+  pages =        "270--279",
+  year =         "2002",
+}
+
+@InCollection{Lee-2008,
+  author =       "Honglak Lee and Chaitanya Ekanadham and Andrew Ng",
+  editor =       NIPS20ed,
+  booktitle =    NIPS20,
+  title =        "Sparse deep belief net model for visual area {V}2",
+  publisher =    "MIT Press",
+  address =      "Cambridge, MA",
+  pages = {873--880},
+  year =         "2008",
+}
+
+@Book{Lee91,
+  author =       "Kai-Fu Lee",
+  title =        "Automatic Speech Recognition: the development of the
+                 {SPHINX} system",
+  publisher =    "Kluwer Academic Publ.",
+  year =         "1989",
+}
+
+@article{Lee-1996,
+    author = "Tai Sing Lee",
+    title = "Image Representation Using {2D} {Gabor} Wavelets",
+    journal = "IEEE Transactions on Pattern Analysis and Machine Intelligence",
+    volume = "18",
+    number = "10",
+    pages = "959--971",
+    year = "1996",
+}
+
+@InProceedings{Lee99a,
+  author =       "Lillian Lee",
+  booktitle =    "ACL99",
+  title =        "Measures of Distributional Similarity",
+  pages =        "25--32",
+  year =         "1999",
+}
+
+@InProceedings{Lee99b,
+  author =       "Lillian Lee and Fernando Pereira",
+  title =        "Distributional Similarity Models: Clustering vs.
+                 Nearest Neighbours",
+  booktitle =    "ACL99",
+  pages =        "33--40",
+  year =         "1999",
+}
+
+@article{Lee+Mumford-2003,
+ author = {Tai-Sing Lee and David Mumford},
+ title = {Hierarchical Bayesian inference in the visual cortex},
+ year = 2003,  
+ journal = {Journal of Optical Society of America, A},
+ volume = 20,
+ number = 7,
+ pages = {1434--1448},
+}
+
+
+@Article{Leitch91,
+  author =       "G. Leitch and J. E. Tanner",
+  title =        "Economic Forecast Evaluation: Profits Versus The
+                 Conventional Error Measures",
+  journal =      "The American Economic Review",
+  pages =        "580--590",
+  year =         "1991",
+}
+
+@Article{Lengelle+Denoeux96,
+  author =       "R{\'e}gis Lengell{\'e} and Thierry Denoeux",
+  title =        "Training {MLP}s layer by layer using an objective
+                 function for internal representations",
+  journal =      "Neural Networks",
+  volume =       "9",
+  pages =        "83--97",
+  year =         "1996",
+}
+
+@InProceedings{Leprieur95,
+  author =       "H. Leprieur and P. Haffner",
+  booktitle =    "EUROSPEECH'95",
+  title =        "Discriminant learning with minimum memory loss for
+                 improved non-vocabulary rejection",
+  address =      "Madrid, Spain",
+  year =         "1995",
+}
+
+@Book{lerdahl+jackendoff-1983,
+  author =       "F. Lerdahl and R. Jackendoff",
+  title =        "A {Generative} {Theory} of {Tonal} {Music}",
+  publisher =    "MIT Press",
+  address =      "Cambridge, Mass.",
+  year =         "1983",
+}
+
+@InCollection{LeRoux+al-tonga-2008,
+  author =       "Nicolas {Le Roux} and Pierre-Antoine Manzagol and
+                 Yoshua Bengio",
+  editor =       NIPS20ed,
+  booktitle =    NIPS20,
+  title =        "Topmoumoute online natural gradient algorithm",
+  publisher =    "{MIT} Press",
+  address =      "Cambridge, MA",
+  pages =        "849--856",
+  year =         "2008",
+}
+
+@InCollection{LeRoux+al-tonga-2008-small,
+  author =       "Nicolas {Le Roux} and Pierre-Antoine Manzagol and
+                 Yoshua Bengio",
+  booktitle =    "NIPS 20",
+  title =        "Topmoumoute online natural gradient algorithm",
+  pages =        "849--856",
+  year =         "2008",
+}
+
+@TechReport{LeRoux-comb-dens-2005,
+  author =       "Nicolas {Le Roux} and Yoshua Bengio and R{\'e}jean
+                 Ducharme",
+  title =        "Combining density estimators to improve classification
+                 accuracy",
+  number =       "1261",
+  institution =  "D{\'e}partement d'informatique et recherche
+                 op{\'e}rationnelle, Universit{\'e} de Montr{\'e}al",
+  year =         "2005",
+}
+
+@InProceedings{LeRoux-continuous-short,
+  author =       "Nicolas {Le Roux} and Yoshua Bengio",
+  booktitle =    aistats07,
+  title =        "Continuous Neural Networks",
+  year =         "2007",
+  date =         "March 21-24, 2007",
+}
+
+@InProceedings{Lesk1986,
+  author =       "Michael E. Lesk",
+  booktitle =    "SIGDOC Conference",
+  title =        "Automatic sense disambiguation using machine readable
+                 dictionaries: How to tell a pine cone from an ice cream
+                 cone",
+  address =      "Toronto, Canada",
+  year =         "1986",
+}
+
+@InProceedings{Leung92,
+  author =       "H. C. Leung and I. L. Hetherington and V. W. Zue",
+  booktitle =    icassp,
+  title =        "Speech recognition using stochastic segment neural
+                 networks",
+  volume =       "1",
+  institution =  "Lab. for Comput. Sci., MIT, Cambridge, MA, USA",
+  publisher =    "IEEE",
+  address =      "New York, NY, USA",
+  pages =        "613--616",
+  year =         "1992",
+}
+
+@Article{Levenberg44,
+  author =       "K. Levenberg",
+  title =        "A method for the solution of certain non-linear
+                 problems in least squares",
+  journal =      "Quarterly Journal of Applied Mathematics",
+  volume =       "II",
+  number =       "2",
+  pages =        "164--168",
+  year =         "1944",
+}
+
+@InProceedings{Levin90,
+  author =       "E. Levin",
+  booktitle =    icassp,
+  title =        "Word Recognition using Hidden Control Neural
+                 Architecture",
+  address =      "Albuquerque, NM",
+  pages =        "433--436",
+  year =         "1990",
+}
+
+@InProceedings{Levin92,
+  author =       "E. Levin and R. Pieraccini and E. Bocchieri",
+  editor =       NIPS4ed,
+  booktitle =    NIPS4,
+  title =        "Time-Warping Network: a Hybrid Framework for Speech
+                 Recognition",
+  address =      "Denver, CO",
+  pages =        "151--158",
+  year =         "1992",
+}
+
+@Article{Levinson83,
+  author =       "S. E. Levinson and L. R. Rabiner and M. M. Sondhi",
+  title =        "An Introduction to the Application of the Theory of
+                 Probabilistic Functions of a {Markov} Process to
+                 Automatic Speech Recognition",
+  journal =      "Bell System Technical Journal",
+  volume =       "62",
+  number =       "4",
+  pages =        "1035--1074",
+  year =         "1983",
+}
+
+@InCollection{Levinson96,
+  author =       "S. E. Levinson",
+  editor =       "R. A. Cole and J. Mariani and H. Uszkoreit and A.
+                 Zaenen and V. Zue",
+  booktitle =    "Survey of the State of the Art in Human Language
+                 Technology",
+  title =        "Statistical Modeling and Classification",
+  publisher =    "Cambridge University Press",
+  URL =          "http://www.cse.ogi.edu/CSLU/HLTsurvey/HLTsurvey.html",
+  pages =        "395--401",
+  year =         "1996",
+}
+
+@phdthesis{Levner2008,
+  author = {Ilya Levner},
+  title = {Data Driven Object Segmentation},
+  school = {Department of Computer Science, University of Alberta},
+  year = 2008,
+}
+
+@InProceedings{Lewicki+Sejnowski-97,
+  author =       "Michael Lewicki and Terry Sejnowski",
+  editor =       NIPS10ed,
+  booktitle =    NIPS10,
+  title =        "Learning nonlinear overcomplete representations for
+                 efficient coding",
+  publisher =    "MIT Press",
+  isbn = {0-262-10076-2},
+  location = {Denver, Colorado, United States},
+  address = {Cambridge, MA, USA},
+  pages =        "556--562",
+  year =         "1998",
+}
+
+@article{Lewicki+Sejnowski-2000,
+    author = {Michael S. Lewicki and Terrence J. Sejnowski},
+    title = {Learning Overcomplete Representations},
+    journal = {Neural Computation},
+    volume = {12},
+    number = {2},
+    year = {2000},
+    issn = {0899-7667},
+    pages = {337--365},
+    doi = {10.1162/089976600300015826},
+    publisher = {MIT Press},
+    address = {Cambridge, MA, USA},
+}
+
+@InProceedings{LewisC62,
+    author =       "P. M. {Lewis II} and C. L. Coates",
+    title =        "A realization procedure for threshold gate networks",
+    crossref =     "FOCS3",
+    pages =        "159--168",
+    url =          "http://theory.lcs.mit.edu/~dmjones/FOCS/focs.bib",
+}
+
+@Article{lheureux-04-small,
+  author =       "P.-J. {L'Heureux} and J. Carreau and Y. Bengio and O.
+                 Delalleau and S. Y. Yue",
+  title =        "Locally Linear Embedding for dimensionality reduction
+                 in {QSAR}",
+  journal =      "J. Computer-Aided Molecular Design",
+  volume =       "18",
+  pages =        "475--482",
+  year =         "2004",
+}
+
+@Book{Li93,
+  author =       "Ming Li and Paul Vitanyi",
+  title =        "An Introduction to Kolmogorov Complexity and Its
+                 Applications",
+  publisher =    "Springer",
+  address =      "New York, NY",
+  edition =      "Second",
+  year =         "1997",
+}
+
+@Article{li99face,
+  author =       "S. Z. Li and J. W. Lu",
+  title =        "Face recognition using the nearest feature line
+                 method",
+  journal =      "IEEE Transactions on Neural Networks",
+  volume =       "10",
+  number =       "2",
+  pages =        "439--443",
+  year =         "1999",
+  URL =          "citeseer.nj.nec.com/li99face.html",
+}
+
+@inproceedings{Li+al-2005,
+    author    = {Hongyu Li and Wenbin Chen and I-Fan Shen},
+    title     = {Supervised Local Tangent Space Alignment for Classification},
+    booktitle = {IJCAI},
+    year      = {2005},
+    pages     = {1620--1621},
+    ee        = {http://www.ijcai.org/papers/post-0505.pdf},
+    bibsource = {DBLP, http://dblp.uni-trier.de}
+}
+
+@inproceedings{Li+Guo-2006,
+    author = {Chun-Guang Li and Jun Guo},
+    title = {Supervised {Isomap} with Explicit Mapping},
+    booktitle = {First International Conference on Innovative Computing, Information and Control},
+    volume = {3},
+    year = {2006},
+    isbn = {0-7695-2616-0},
+    pages = {345--348},
+    doi = {10.1109/ICICIC.2006.530},
+    publisher = {IEEE Computer Society},
+    address = {Los Alamitos, CA, USA},
+}
+
+@inproceedings{lischuurmans08a,
+author = "Li, Y. and Schuurmans, D.",
+title = "Policy iteration for learning an exercise policy for {American} 
+options",
+booktitle = "Proceedings of the European Workshop on Reinforcement 
+Learning (EWRL)",
+year = 2008,
+internal-note = "Acceptance rate 33\%; all authors from my research group"
+}
+
+@inproceedings{lischuurmans08b,
+author = "Li, Y. and Schuurmans, D.",
+title = "Learning an exercise policy for {American} options on real data",
+booktitle = "Proceedings of the International Symposium on Financial 
+Engineering and Risk Management (FERM)",
+year = 2008,
+internal-note = "All authors from my research group; unrefereed publication"
+}
+
+@inproceedings{Li+al-2007,
+    author    = {Jun-Bao Li and Shu-Chuan Chu and Jeng-Shyang Pan},
+    title     = {Locally Discriminant Projection with Kernels for Feature Extraction},
+    booktitle = {Proceedings of the Third International Conference on Advanced Data Mining and Applications},
+    editor    = {Reda Alhajj and Hong Gao and Xue Li and Jianzhong Li and Osmar R. Za{\"\i}ane},
+    publisher = {Springer},
+    year      = {2007},
+    pages     = {586--593},
+    ee        = {http://dx.doi.org/10.1007/978-3-540-73871-8_56},
+    bibsource = {DBLP, http://dblp.uni-trier.de}
+}
+
+@InCollection{Liang83,
+  author =       "F. M. Liang",
+  editor =       "D. E. Knuth",
+  booktitle =    "The \TeX Book",
+  title =        "Ph.{D}.\ Thesis",
+  publisher =    "Addison-Wesley",
+  address =      "Reading",
+  year =         "1986",
+}
+
+@inproceedings{LiangP2008,
+ author = {Percy Liang and Michael I. Jordan},
+ title = {An asymptotic analysis of generative, discriminative, and pseudolikelihood estimators},
+ booktitle =    ICML08,
+ editor =       ICML08ed,
+ publisher =    ICML08publ,
+ year = {2008},
+ isbn = {978-1-60558-205-4},
+ pages = {584--591},
+ location = {Helsinki, Finland},
+ doi = {10.1145/1390156.1390230},
+ address = {New York, NY, USA},
+ }
+
+@Article{Liberman67,
+  author =       "A. M. Liberman and F. S. Cooper and D. P. Shankweiler
+                 and M. Studdert-Kennedy",
+  title =        "Perception of the speech code",
+  journal =      "Psychological Review",
+  volume =       "74",
+  pages =        "431--461",
+  year =         "1967",
+}
+
+@Article{Lin+al-1991,
+  author =       "W.-M. Lin and V. K. Prasanna and K. W. Przytula",
+  title =        "Algorithmic mapping of neural network Models onto
+                 Parallel {SIMD} Machines",
+  journal =      "IEEE Transactions on Computers",
+  volume =       "40",
+  number =       "12",
+  publisher =    "IEEE Computer Society",
+  address =      "Los Alamitos, CA, USA",
+  pages =        "1390--1401",
+  year =         "1991",
+  ISSN =         "0018-9340",
+  doi =          "10.1109/12.106224",
+}
+
+@Article{Lin-2000,
+  author =       "Dekang Lin",
+  title =        "Word sense disambiguation with a similarity based
+                 smoothed library",
+  journal =      "Computers and the Humanities: special issue on
+                 {SENSEVAL}",
+  volume =       "34",
+  pages =        "147--152",
+  year =         "2000",
+}
+
+@InProceedings{Lin-99,
+  author =       "Dekang Lin",
+  booktitle =    "Proceedings of the Conference of the Pacific
+                 Association for Computational Linguistics",
+  title =        "A case-based algorithm for word sense disambiguation",
+  address =      "Waterloo, Canada",
+  year =         "1999",
+}
+
+@Article{Lin73,
+  author =       "S. Lin and B. W. Kernighan",
+  title =        "An Effective Heuristic Algorithm for the Travelling
+                 Salesman Problem",
+  journal =      opres,
+  volume =       "21",
+  pages =        "498--516",
+  year =         "1973",
+}
+
+@TechReport{Lin95,
+  author =       "T. Lin and B. G. Horne and P. Tino and C. L. Giles",
+  title =        "Learning long-term dependencies is not as difficult
+                 with {NARX} recurrent neural networks",
+  number =       "UMIACS-TR-95-78",
+  institution =  "Institute for Advanced Computer Studies, University of
+                 Maryland",
+  year =         "1995",
+}
+
+@InProceedings{Lin96,
+  author =       "C. Lin and S-C. Chang and K-J. Lin",
+  booktitle =    nipc-hmit96,
+  title =        "Simulation of the Balance of Plant of a Nuclear Power
+                 Plant by Neural Networks",
+  volume =       "1",
+  publisher =    ans,
+  pages =        "251--255",
+  year =         "1996",
+}
+
+@Article{Linde80,
+  author =       "Y. Linde and A. Buzo and R. M. Gray",
+  title =        "An algorithm for vector quantizer design",
+  journal =      "IEEE Transactions on Communications",
+  volume =       "COM-28",
+  number =       "1",
+  pages =        "84--95",
+  month =        jan,
+  year =         "1980",
+}
+
+@Article{Lindgren78,
+  author =       "G. Lindgren",
+  title =        "{Markov} Regime Models for Mixed Distributions and
+                 Switching Regressions",
+  journal =      "Scand. J. Statist.",
+  volume =       "5",
+  pages =        "81--91",
+  year =         "1978",
+}
+
+@Article{Linial93,
+  author =       "Nathan Linial and Yishay Mansour and Noam Nisan",
+  title =        "Constant depth circuits, {Fourier} transform, and
+                 learnability",
+  journal =      "J. ACM",
+  volume =       "40",
+  number =       "3",
+  publisher =    "ACM Press",
+  address =      "New York, NY, USA",
+  pages =        "607--620",
+  year =         "1993",
+}
+
+@Article{Linsker86,
+  author =       "R. Linsker",
+  title =        "From Basic Network Principles to Neural Architecture",
+  journal =      PNAS,
+  volume =       "83",
+  pages =        "7508--7512, 8390--8394, 8779--8783",
+  year =         "1986",
+}
+
+@Article{Linsker88,
+  author =       "R. Linsker",
+  title =        "Self-Organization in a Perceptual Network",
+  journal =      computer,
+  pages =        "105--117",
+  month =        mar,
+  year =         "1988",
+}
+
+@TechReport{liporace-76,
+  author =       "L. A. Liporace",
+  title =        "{PTAH} on Continuous Multivariate Functions of
+                 {Markov} Chains",
+  number =       "80193",
+  institution =  "Institute for Defense Analysis, Communication Research
+                 Department",
+  month =        feb,
+  year =         "1976",
+}
+
+@Article{Lippmann87,
+  author =       "R. P. Lippmann",
+  title =        "An Introduction to Computing with Neural Nets",
+  journal =      ieeeassp,
+  pages =        "4--22",
+  month =        apr,
+  year =         "1987",
+}
+
+@InProceedings{Lippmann87b,
+  author =       "R. P. Lippmann and B. Gold",
+  booktitle =    "IEEE Proc. First Intl. Conf. on Neural Networks",
+  title =        "Neural Classifiers Useful for Speech Recognition",
+  volume =       "IV",
+  address =      "San Diego, CA",
+  pages =        "417--422",
+  year =         "1987",
+}
+
+@Article{Lippmann89,
+  author =       "R. P. Lippmann",
+  title =        "Review of Neural Networks for Speech Recognition",
+  journal =      nc,
+  volume =       "1",
+  pages =        "1--38",
+  year =         "1989",
+}
+
+@InProceedings{Lister90,
+  author =       "R. Lister",
+  booktitle =    ijcnn,
+  title =        "Segment Reversal and the {TSP}",
+  volume =       "1",
+  publisher =    "Lawrence Erlbaum, Hillsdale",
+  address =      "Washington 1990",
+  pages =        "424--427",
+  year =         "1990",
+}
+
+@Article{Litkowski-2000,
+  author =       "K. Litkowski",
+  title =        "{SENSEVAL}: The {CL}-research experience",
+  journal =      "Computers and the Humanities: special issue on
+                 SENSEVAL",
+  volume =       "34",
+  pages =        "153--158",
+  year =         "2000",
+}
+
+@Book{Little+Rubin-2002,
+  author =       "R. J. A. Little and D. B. Rubin",
+  title =        "Statistical Analysis with Missing Data",
+  publisher =    "Wiley",
+  address =      "New York",
+  edition =      "Second",
+  year =         "2002",
+}
+
+@Book{Little-Rubin,
+  author =       "R. J. A. Little and D. B. Rubin",
+  title =        "Statistical Analysis with Missing Data",
+  publisher =    "Wiley",
+  address =      "New York",
+  year =         "1987",
+}
+
+@Article{Little74,
+  author =       "W. A. Little",
+  title =        "The Existence of Persistent States in the Brain",
+  journal =      mbio,
+  volume =       "19",
+  pages =        "101--120",
+  year =         "1974",
+}
+
+@Article{Little75,
+  author =       "W. A. Little and G. L. Shaw",
+  title =        "A Statistical Theory of Short and Long Term Memory",
+  journal =      behbio,
+  volume =       "14",
+  year =         "1975",
+}
+
+@Article{Little78,
+  author =       "W. A. Little and G. L. Shaw",
+  title =        "Analytic Study of the Memory Storage Capacity of a
+                 Neural Network",
+  journal =      mbio,
+  volume =       "39",
+  pages =        "281--290",
+  year =         "1978",
+}
+
+@Article{littlestone-warmuth94,
+  author =       "N. Littlestone and M. K. Warmuth",
+  title =        "The weighted majority algorithm",
+  journal =      "Information and Computation",
+  volume =       "108",
+  number =       "2",
+  pages =        "212--261",
+  year =         "1994",
+}
+
+@Misc{Littlestone86,
+  author =       "N. Littlestone and M. Warmuth",
+  title =        "Relating data compression and learnability",
+  year =         "1986",
+  note =         "Unpublished manuscript. University of California Santa
+                 Cruz. An extended version can be found in (Floyd and
+                 Warmuth 95)",
+}
+
+@InCollection{Liu2001,
+  author =       "J. S. Liu and R. Chen and T. Logvinenko",
+  editor =       "A. Doucet and N. de Freitas and N. Gordon",
+  booktitle =    "Sequential Monte Carlo Methods in Practice",
+  title =        "A theoretical framework for sequential importance
+                 sampling and resampling",
+  publisher =    "Springer-Verlag",
+  year =         "2001",
+}
+
+@Book{Ljung+Soderstrom83,
+  author =       "L. Ljung and T. Soderstrom",
+  title =        "Theory and Practice of recursive identification",
+  publisher =    "MIT Press",
+  year =         "1983",
+}
+
+@Book{Ljung-86,
+  author =       "L. Ljung and T. S{\"o}derstr{\"o}m",
+  title =        "Theory and Practice of Recursive Identification",
+  publisher =    "MIT Press",
+  address =      "Cambridge, MA",
+  year =         "1986",
+}
+
+@article{LloydS1982,
+	author = {Stuart P. Lloyd},
+	booktitle = {Information Theory, IEEE Transactions on},
+	journal = {Information Theory, IEEE Transactions on},
+	number = {2},
+	pages = {129--137},
+	title = {Least squares quantization in {PCM}},
+	volume = {28},
+	year = {1982}
+}
+
+@Article{Loader96,
+  author =       "C. R. Loader",
+  title =        "Local likelihood density estimation",
+  journal =      "Annals of Statistics",
+  volume =       "24",
+  number =       "4",
+  pages =        "1602--1618",
+  year =         "1996",
+}
+
+@Article{Loftsgaarden+Quesenberry-65,
+  author =       "D. O. Loftsgaarden and C. P. Quesenberry",
+  title =        "A nonparametric estimate of a multivariate density
+                 function",
+  journal =      "Annals of Mathematical Statistics",
+  volume =       "36",
+  pages =        "1049--1051",
+  year =         "1965",
+}
+
+@InCollection{lognormal-A-85,
+  author =       "C. E. Antle",
+  booktitle =    "Encyclopedia of Statistical Sciences",
+  title =        "Lognormal Distribution",
+  volume =       "5",
+  publisher =    "John Wiley \& Sons",
+  pages =        "134--136",
+  year =         "1985",
+}
+
+@Article{Loh-Shih97,
+  author =       "Wei-Yin Loh and Yu-Shan Shih",
+  title =        "Split selection methods for classification trees",
+  journal =      "Statistica Sinica",
+  volume =       "7",
+  pages =        "815--840",
+  year =         "1997",
+}
+
+@incollection{loosli-canu-bottou-2006,
+  author = {Loosli, Ga{\"e}lle and Canu, St{\'e}phane and Bottou, L{\'e}on},
+  title = {Training Invariant Support Vector Machines using Selective Sampling},
+  pages = {301--320},
+  editor = {Bottou, L{\'e}on and Chapelle, Olivier and {DeCoste}, Dennis and Weston, Jason},
+  booktitle = {Large Scale Kernel Machines},
+  publisher = {MIT Press},
+  address = {Cambridge, MA},
+  year = {2007},
+  url = {http://leon.bottou.org/papers/loosli-canu-bottou-2006},
+}
+
+@Article{Lowe04,
+  author =       "D. G. Lowe",
+  title =        "Distinctive Image Features from Scale-Invariant
+                 Keypoints",
+  journal =      "International Journal of Computer Vision",
+  volume =       "60",
+  number =       "2",
+  pages =        "91--110",
+  year =         "2004",
+}
+
+@Article{Lowe95,
+  author =       "D. G. Lowe",
+  title =        "Similarity metric learning for a variable-kernel
+                 classifier",
+  journal =      "Neural Computation",
+  volume =       "7",
+  number =       "1",
+  pages =        "72--85",
+  year =         "1995",
+}
+
+@InProceedings{lu04,
+  author =       "Wen-Cong Lu and Nian-Yi Chen and Guo-Zheng Li and Jie
+                 Yang",
+  editor =       "Per Svensson and Johan Schubert",
+  booktitle =    "Proceedings of the Seventh International Conference on
+                 Information Fusion",
+  title =        "Multitask learning using partial least square method",
+  volume =       "I",
+  publisher =    "International Society of Information Fusion",
+  address =      "Mountain View, CA",
+  pages =        "79--84",
+  month =        jun,
+  year =         "2004",
+  location =     "Stockholm, Sweden",
+}
+
+@Book{Lue84,
+  author =       "D. G. Luenberger",
+  title =        "Linear and Nonlinear Programming",
+  publisher =    "Addison Wesley",
+  year =         "1984",
+}
+
+@Book{Luenberger86,
+  author =       "D. G. Luenberger",
+  title =        "Linear and Nonlinear Programming",
+  publisher =    "Addison-Wesley",
+  address =      "Reading",
+  year =         "1986",
+}
+
+@InProceedings{Lyu09,
+  author =       "Siwei Lyu",
+  booktitle =    "The proceedings of the 25th Conference on Uncertainty in Artificial Intelligence",
+  title =        "Interpretation and Generalization of Score Matching",
+  year =         "2009",
+}
+
+@Book{Ma85,
+  author =       "S.-K. Ma",
+  title =        "Statistical Mechanics",
+  publisher =    "World Scientific",
+  address =      "Philadelphia",
+  year =         "1985",
+}
+
+@InProceedings{Ma09,
+ author = {Justin Ma and Lawrence K. Saul and Stefan Savage and Geoffrey M. Voelker},
+ title = {Identifying Suspicious {URLs}: An Application of Large-Scale Online Learning},
+ booktitle = {Proceedings of the International Conference on Machine Learning},
+ year = {2009},
+ pages = {681--688},
+ location = {Montreal, Canada},
+}
+
+@Misc{MacKay+Neal94,
+  author =       "D. MacKay and R. Neal",
+  title =        "Automatic Relevance Determination",
+  year =         "1994",
+  note =         "Unpublished report. See also MacKay D., 1995, Probable
+                 Networks and Plausible Predictions -- A Review of
+                 Practical {Bayesian} Methods for Supervised Neural
+                 Networks, in {\em Network: Computation in Neural
+                 Systems}, v. 6, pp. 469--505",
+}
+
+@Book{MacKay03,
+  author =       "David MacKay",
+  title =        "Information Theory, Inference and Learning
+                 Algorithms",
+  publisher =    "Cambridge University Press",
+  year =         "2003",
+}
+
+@Misc{MacKay2001,
+  author =       "David MacKay",
+  title =        "Failures of the One-Step Learning Algorithm",
+  year =         "2001",
+  note =         "Unpublished report",
+}
+
+@Article{MacKay90,
+  author =       "D. J. C. MacKay and K. D. Miller",
+  title =        "Analysis of Linsker's Simulation of Hebbian Rules",
+  journal =      nc,
+  volume =       "2",
+  pages =        "173--187",
+  year =         "1990",
+}
+
+@PhdThesis{MacKay91,
+  author =       "D. J. C. MacKay",
+  title =        "Bayesian methods for adaptive models",
+  school =       "California Institute of Technology",
+  year =         "1991",
+}
+
+@Article{MacKay92a,
+  author =       "David {J. C}. MacKay",
+  title =        "Bayesian interpolation",
+  journal =      "Neural Computation",
+  volume =       "4",
+  number =       "3",
+  pages =        "415--447",
+  year =         "1992",
+}
+
+@Article{MacKay92b,
+  author =       "D. J. C. MacKay",
+  title =        "The evidence framework applied to classification
+                 networks",
+  journal =      "Neural Computation",
+  volume =       "4",
+  number =       "5",
+  pages =        "698--714",
+  year =         "1992",
+}
+
+@Article{MacKay92c,
+  author =       "David {J. C}. MacKay",
+  title =        "A practical {Bayesian} framework for backpropagation
+                 networks",
+  journal =      "Neural Computation",
+  volume =       "4",
+  number =       "3",
+  pages =        "448--472",
+  year =         "1992",
+}
+
+@Article{MacKay98,
+  author =       "D. J. C. MacKay and R. J. McEliece and J-F. Cheng",
+  title =        "Turbo-decoding as an instance of Pearl's belief
+                 propagation algorithm",
+  journal =      "IEEE Journal on Selected Areas in Communications",
+  year =         "1998",
+  note =         "In press",
+}
+
+@TechReport{MacKay98b,
+  author =       "D. J. C. MacKay",
+  title =        "Introduction to {G}aussian Processes",
+  institution =  "Cambridge University",
+  year =         "1998",
+  URL =          "http://wol.ra.phy.cam.ac.uk/mackay/gpB.pdf",
+}
+
+@Article{Mackey77,
+  author =       "M. C. Mackey and L. Glass",
+  title =        "Oscillation and Chaos in Physiological Control
+                 Systems",
+  journal =      science,
+  volume =       "197",
+  pages =        "287",
+  year =         "1977",
+}
+
+@InProceedings{Maclin-iwml91,
+  author =       "R. Maclin and J. W. Shavlik",
+  editor =       "L. Birnbaum and G. Collins",
+  booktitle =    "Machine Learning: Proceedings of the Eighth
+                 International Workshop",
+  title =        "Refining Domain Theories Expressed as Finite-State
+                 Automata",
+  publisher =    "Morgan Kaufmann",
+  address =      "San Mateo CA",
+  year =         "1991",
+}
+
+@Article{Maclin-ml,
+  author =       "R. Maclin and J. W. Shavlik",
+  title =        "Using Knowledge-Based Neural Networks to Improve
+                 Algorithms: Refining the Chou-Fasman Algorithm for
+                 Protein Folding",
+  journal =      mlearn,
+}
+
+@InProceedings{MacQueen67,
+  author =       "James B. MacQueen",
+  booktitle =    "Proceedings of the Fifth Berkeley Symposium on
+                 Mathematical Statistics and Probability",
+  title =        "Some Methods for Classification and Analysis of
+                 Multivariate Observations",
+  volume =       "1",
+  pages =        "281--296",
+  year =         "1967",
+}
+
+@Article{Mahapatra+al-1997,
+  author =       "S. Mahapatra and R. N. Mahapatra and B. N. Chatterji",
+  title =        "A parallel formulation of back-propagation learning on
+                 distributed memory multiprocessors",
+  journal =      "Parallel Computing",
+  volume =       "22",
+  number =       "12",
+  publisher =    "Elsevier Science Publishers",
+  address =      "Amsterdam, The Netherlands",
+  pages =        "1661--1675",
+  year =         "1997",
+  ISSN =         "0167-8191",
+  doi =          "10.1016/S0167-8191(96)00051-8",
+}
+
+@incollection{Mairal-2009,
+ title = {Supervised Dictionary Learning},
+ author = {Julien Mairal and Francis Bach and Jean Ponce and Guillermo Sapiro and Andrew Zisserman},
+ booktitle = NIPS21,
+ editor = NIPS21ed,
+ pages = {1033--1040},
+ publisher = {NIPS Foundation},
+ year = {2009}
+}
+@book{Maimon+Rokach-2005,
+    author = {Maimon, O. and Rokach, L.},
+    howpublished = {Hardcover},
+    isbn = {0387244352},
+    month = sep,
+    publisher = {Springer},
+    title = {Data Mining and Knowledge Discovery Handbook},
+    year = {2005}
+}
+
+@InProceedings{Makram-Ebeid89,
+  author =       "S. Makram-Ebeid and J.-A. Sirat and J.-R. Viala",
+  booktitle =    ijcnn,
+  title =        "A Rationalized Back-Propagation Learning Algorithm",
+  volume =       "2",
+  publisher =    "IEEE, New York",
+  address =      "Washington 1989",
+  pages =        "373--380",
+  year =         "1989",
+}
+
+@Article{mallat93matching,
+  author =       "S. Mallat and Z. Zhang",
+  title =        "Matching pursuit with time-frequency dictionaries",
+  journal =      "IEEE Trans. Signal Proc.",
+  volume =       "41",
+  number =       "12",
+  pages =        "3397--3415",
+  month =        dec,
+  year =         "1993",
+}
+
+@InProceedings{malouf2002conll,
+  author =       "Robert Malouf",
+  editor =       "Dan Roth and Antal van den Bosch",
+  booktitle =    "Proceedings of CoNLL-2002",
+  title =        "A comparison of algorithms for maximum entropy
+                 parameter estimation",
+  address =      "Taipei, Taiwan",
+  pages =        "49--55",
+  year =         "2002",
+}
+
+@Book{Mandelbrot82,
+  author =       "B. B. Mandelbrot",
+  title =        "The Fractal Geometry of Nature",
+  publisher =    "Freeman",
+  address =      "San Francisco",
+  year =         "1982",
+}
+
+@Book{Manning+Schutze99,
+  author =       "Christopher Manning and Hinrich Sch{\"u}tze",
+  title =        "Foundations of Statistical Natural Language
+                 Processing",
+  publisher =    "MIT Press",
+  year =         "1999",
+}
+
+@InProceedings{Mantysalo92firenze,
+  author =       "Jyri M{\"{a}}ntysalo and Kari Torkkola and Teuvo
+                 Kohonen",
+  booktitle =    "Proc. of the Second Workshop on Neural Networks for
+                 Speech Processing",
+  title =        "Experiments on the use of {LVQ} in phoneme-level
+                 segmentation of speech",
+  publisher =    "LINT",
+  address =      "Firenze (Italy)",
+  year =         "1992",
+}
+
+@article{Marcelja-1980,
+    author = {Marcelja, S.},
+    journal = {Journal of the Optical Society of America},
+    month = nov,
+    number = {11},
+    pages = {1297--1300},
+    title = {Mathematical description of the responses of simple cortical cells},
+    url = {http://view.ncbi.nlm.nih.gov/pubmed/7463179},
+    volume = {70},
+    year = {1980}
+}
+
+@Article{Marchand90,
+  author =       "M. Marchand and M. Golea and P. Ruj\'an",
+  title =        "A Convergence Theorem for Sequential Learning in
+                 Two-Layer Perceptrons",
+  journal =      eul,
+  volume =       "11",
+  pages =        "487--492",
+  year =         "1990",
+}
+
+@Article{Marcotte-92,
+  author =       "P. Marcotte and G. Savard",
+  title =        "Novel approaches to the discrimination problem",
+  journal =      "Zeitschrift f{\"u}r Operations Research (Theory)",
+  volume =       "36",
+  pages =        "517--545",
+  year =         "1992",
+}
+
+@Article{Marcus91,
+  author =       "C. M. Marcus and F. R. Waugh and R. M. Westervelt",
+  title =        "Nonlinear Dynamics and Stability of Analog Neural
+                 Networks",
+  journal =      "Physica D",
+  volume =       "51",
+  pages =        "234--247",
+  year =         "1991",
+  note =         "(special issue)",
+}
+
+@Article{Marcus-et-al91,
+  author =       "C. M. Marcus and F. R. Waugh and R. M. Westervelt",
+  title =        "Nonlinear Dynamics and Stability of Analog Neural
+                 Networks",
+  journal =      physicaD,
+  volume =       "51",
+  pages =        "234--247",
+  year =         "1991",
+  note =         "(special issue)",
+}
+
+@Article{Markov13,
+  author =       "A. A. Markov",
+  title =        "An example of statistical investigation in the text of
+                 `Eugene Onyegin' illustrating coupling of `tests' in
+                 chains",
+  journal =      "Proceedings of the Academy of Science, St.
+                 Petersburg",
+  volume =       "7",
+  pages =        "153--162",
+  year =         "1913",
+}
+
+@Article{Markovitz-52,
+  author =       "H. M. Markowitz",
+  title =        "Portfolio Selection",
+  journal =      "Journal of Finance",
+  volume =       "7",
+  number =       "1",
+  pages =        "77--91",
+  year =         "1952",
+}
+
+@InProceedings{maron98,
+  author =       "Oded Maron and Tom\'{a}s Lozano-P\'{e}rez",
+  editor =       NIPS10ed,
+  booktitle =    NIPS10,
+  title =        "A Framework for Multiple-Instance Learning",
+  volume =       "10",
+  publisher =    "{MIT} Press",
+  year =         "1998",
+}
+
+@Article{Marquardt63,
+  author =       "D. W. Marquardt",
+  title =        "An algorithm for least-squares estimation of
+                 non-linear parameters",
+  journal =      "Journal of the Society for Industrial and Applied
+                 Mathematics",
+  volume =       "11",
+  number =       "2",
+  pages =        "431--441",
+  year =         "1963",
+}
+
+@Article{Marr69,
+  author =       "D. Marr",
+  title =        "A Theory of Cerebellar Cortex",
+  journal =      jphysiol,
+  volume =       "202",
+  pages =        "437--470",
+  year =         "1969",
+}
+
+@Article{Marr70,
+  author =       "D. Marr",
+  title =        "A Theory for Cerebral Neocortex",
+  journal =      PRSLB,
+  volume =       "176",
+  pages =        "161--234",
+  year =         "1970",
+}
+
+@Article{Marr71,
+  author =       "D. Marr",
+  title =        "Simple Memory: {A} Theory for Archicortex",
+  journal =      PTRSL,
+  volume =       "262",
+  pages =        "23--81",
+  year =         "1971",
+}
+
+@Article{Marr76,
+  author =       "D. Marr and T. Poggio",
+  title =        "Cooperative Computation of Stereo Disparity",
+  journal =      science,
+  volume =       "194",
+  year =         "1976",
+}
+
+@Book{Marr82,
+  author =       "D. Marr",
+  title =        "Vision",
+  publisher =    "Freeman",
+  address =      "San Francisco",
+  year =         "1982",
+}
+
+@Article{Martin91,
+  author =       "G. L. Martin and J. A. Pittman",
+  title =        "Recognizing hand-printed letters and digits using
+                 backpropagation learning",
+  journal =      nc,
+  volume =       "3",
+  number =       "2",
+  pages =        "258--267",
+  year =         "1991",
+}
+
+@Article{Mashouk+Reed91,
+  author =       "K. A. Al-Mashouq and I. S. Reed",
+  title =        "Including Hints in Training Neural Nets",
+  journal =      nc,
+  volume =       "3",
+  number =       "4",
+  pages =        "418",
+  year =         "1991",
+}
+
+@InProceedings{Mason98,
+  author =       "L. Mason and P. L. Bartlett and J. Baxter",
+  editor =       NIPS12ed,
+  booktitle =    NIPS12,
+  title =        "Direct Optimization of Margins Improves Generalization
+                 in Combined Classifiers",
+  year =         "1999",
+}
+
+@InProceedings{Mason99,
+  author =       "L. Mason and J. Baxter and P. Bartlett and M. Frean",
+  editor =       NIPS12ed,
+  booktitle =    NIPS12,
+  title =        "Boosting Algorithms as Gradient Descent",
+  publisher =    "MIT Press",
+  pages =        "512--518",
+  year =         "2000",
+}
+
+@InProceedings{Matan92,
+  author =       "O. Matan and C. J. C. Burges and Y. {LeCun} and J. S.
+                 Denker",
+  editor =       NIPS4ed,
+  booktitle =    NIPS4,
+  title =        "Multi-Digit Recognition Using a Space Displacement
+                 Neural Network",
+  publisher =    "Morgan Kaufmann",
+  address =      "San Mateo CA",
+  pages =        "488--495",
+  year =         "1992",
+}
+
+@InProceedings{matic-92a,
+  author =       "N. Mati\'{c} and I. Guyon and L. Bottou and J. Denker
+                 and V. Vapnik",
+  booktitle =    "11th International Conference on Pattern Recognition",
+  title =        "Computer Aided Cleaning of Large Databases for
+                 Character Recognition",
+  volume =       "II",
+  pages =        "330--333",
+  year =         "1992",
+}
+
+@Misc{matrix-cookbook,
+  author =       "K. B. Petersen and M. S. Pedersen",
+  title =        "The Matrix Cookbook",
+  publisher =    "Technical University of Denmark",
+  address =      "",
+  month =        feb,
+  year =         "2006",
+  note =         "Version 20051003",
+  abstract =     "Matrix identities, relations and approximations. A
+                 desktop reference for quick overview of mathematics of
+                 matrices.",
+  keywords =     "Matrix identity, matrix relations, inverse, matrix
+                 derivative",
+}
+
+@Article{Mattis76,
+  author =       "D. Mattis",
+  title =        "Solvable Spin Systems with Random Interactions",
+  journal =      plettA,
+  volume =       "56",
+  pages =        "421--422",
+  year =         "1976",
+}
+
+@Article{MaxEnt96,
+  author =       "Adam L. Berger and Vincent J. {Della Pietra} and Stephen A. {Della
+                 Pietra}",
+  title =        "A maximum entropy approach to natural language
+                 processing",
+  journal =      "Computational Linguistics",
+  volume =       "22",
+  pages =        "39--71",
+  year =         "1996",
+}
+
+@Article{Mayraz+Hinton-2002,
+  author =       "G. Mayraz and G. E. Hinton",
+  title =        "Recognizing handwritten digits using hierarchical
+                 products of experts",
+  journal =      "IEEE Transactions on Pattern Analysis and Machine
+                 Intelligence",
+  volume =       "24",
+  pages =        "189--197",
+  year =         "2002",
+}
+
+@InProceedings{Mazaika87,
+  author =       "P. K. Mazaika",
+  editor =       "M. Caudill and C. Butler",
+  booktitle =    icnn,
+  title =        "A Mathematical Model of the {Boltzmann} Machine",
+  volume =       "3",
+  publisher =    "IEEE, New York",
+  address =      "San Diego 1987",
+  pages =        "157--163",
+  year =         "1987",
+}
+
+@InProceedings{mbbf-bagd-00,
+  author =       "L. Mason and J. Baxter and P. L. Bartlett and M.
+                 Frean",
+  editor =       NIPS12ed,
+  booktitle =    NIPS12,
+  title =        "Boosting algorithms as gradient descent",
+  pages =        "512--518",
+  year =         "2000",
+}
+
+@InProceedings{McCallum+Nigam-1998,
+  author =       "A. {McCallum} and K. Nigam",
+  booktitle =    "Proceedings of the Fifteenth International Conference
+                 on Machine Learning ({ICML} 1998)",
+  editor =       "Jude W. Shavlik",
+  publisher =    "Morgan Kaufmann",
+  title =        "Employing {EM} and pool-based active learning for text
+                 classification",
+  pages =        "350--358",
+  year =         "1998",
+}
+
+@InProceedings{McCallumA2006,
+  author =       "Andrew McCallum and Chris Pal and Gregory Druck and
+                 Xuerui Wang",
+  booktitle =    "Twenty-first National Conference on Artificial
+                 Intelligence (AAAI-06)",
+  title =        "Multi-Conditional Learning: Generative/Discriminative
+                 Training for Clustering and Classification",
+  publisher =    "AAAI Press",
+  year =         "2006",
+  OPTbibsource = "DBLP, http://dblp.uni-trier.de",
+  OPTcrossref =  "DBLP:conf/aaai/2006",
+}
+
+@article{McClelland+Rumelhart-81,
+ author = {James L. {McClelland} and David E. Rumelhart},
+ title = {An interactive activation model of context effects in letter perception},
+ journal = {Psychological Review},
+ volume = 88,
+ pages = {375--407},
+ year = 1981,
+}
+
+@Book{McClelland86a,
+  author =       "James L. McClelland and David E. Rumelhart and the PDP
+                 Research Group",
+  title =        "Parallel Distributed Processing: Explorations in the
+                 Microstructure of Cognition",
+  volume =       "2",
+  publisher =    "MIT Press",
+  address =      "Cambridge",
+  year =         "1986",
+}
+
+@InCollection{McClelland86b,
+  author =       "J. L. McClelland and J. L. Elman",
+  editor =       "J. L. McClelland and D. E. Rumelhart",
+  booktitle =    pdp,
+  title =        "Interactive Processes in Speech Perception: The
+                 {TRACE} Model",
+  chapter =      "15",
+  volume =       "2",
+  publisher =    "MIT Press",
+  address =      "Cambridge",
+  pages =        "58--121",
+  year =         "1986",
+}
+
+@Book{McClelland88,
+  author =       "J. L. McClelland and D. E. Rumelhart",
+  title =        "Explorations in Parallel Distributed Processing",
+  publisher =    "MIT Press",
+  address =      "Cambridge",
+  year =         "1988",
+}
+
+@Article{McCulloch43,
+  author =       "W. S. McCulloch and W. Pitts",
+  title =        "A Logical Calculus of Ideas Immanent in Nervous
+                 Activity",
+  journal =      bmbiophys,
+  volume =       "5",
+  pages =        "115--133",
+  year =         "1943",
+}
+
+@InProceedings{Mcdermott89,
+  author =       "E. McDermott and S. Katagiri",
+  booktitle =    icassp,
+  title =        "Shift-Invariant, Multi-Category Phoneme Recognition
+                 using {Kohonen's} {LVQ2}",
+  volume =       "1",
+  organization = "IEEE",
+  address =      "Glasgow, Scotland",
+  pages =        "81--84",
+  year =         "1989",
+}
+
+@Article{Mcdermott91,
+  author =       "E. McDermott and S. Katagiri",
+  title =        "{LVQ}-based shift-tolerant phoneme recognition",
+  journal =      "IEEE Transactions on Signal Processing",
+  volume =       "39",
+  number =       "6",
+  pages =        "1398--1411",
+  year =         "1991",
+  OPTmonth =     "June",
+}
+
+@Article{McEliece87,
+  author =       "R. J. McEliece and E. C. Posner and E. R. Rodemich and
+                 S. S. Venkatesh",
+  title =        "The Capacity of the Hopfield Associative Memory",
+  journal =      ieeeit,
+  volume =       "33",
+  pages =        "461--482",
+  year =         "1987",
+}
+
+@InProceedings{McInerny89,
+  author =       "J. M. McInerny and K. G. Haines and S. Biafore and R.
+                 Hecht-Nielsen",
+  booktitle =    ijcnn,
+  title =        "Back Propagation Error Surfaces Can Have Local
+                 Minima",
+  volume =       "2",
+  publisher =    "IEEE, New York",
+  address =      "Washington 1989",
+  pages =        "627",
+  year =         "1989",
+}
+
+@Book{McLachlan2000,
+  author =       "G. J. McLachlan and D. Peel",
+  title =        "Finite Mixture Models",
+  publisher =    "Wiley",
+  address =      "New York",
+  year =         "2000",
+}
+
+@Book{McLachlan88,
+  author =       "G. J. McLachlan and K. E. Basford",
+  title =        "Mixture models: Inference and applications to
+                 clustering",
+  publisher =    "Marcel Dekker",
+  year =         "1988",
+}
+
+@book{Mclachlan-2004,
+    author = {Geoffrey J. McLachlan},
+    howpublished = {Paperback},
+    isbn = {0471691151},
+    month = aug,
+    publisher = {Wiley-Interscience},
+    title = {Discriminant Analysis and Statistical Pattern Recognition},
+    year = {2004}
+}
+
+@Article{McLoone+Irwin-1997,
+  author =       "S. McLoone and G. W. Irwin",
+  title =        "Fast Parallel Off-Line Training of Multilayer
+                 Perceptrons",
+  journal =      "IEEE Transactions on Neural Networks",
+  volume =       "8",
+  number =       "3",
+  pages =        "646--653",
+  year =         "1997",
+}
+
+@Book{Mead89,
+  author =       "C. Mead",
+  title =        "Analog {VLSI} and Neural Systems",
+  publisher =    "Addison Wesley",
+  address =      "Reading",
+  year =         "1989",
+}
+
+@InProceedings{Meila96,
+  author =       "M. Meila and M. I. Jordan",
+  editor =       NIPS8ed,
+  booktitle =    NIPS8,
+  title =        "Learning fine motion by {Markov} mixtures of experts",
+  publisher =    "MIT Press",
+  address =      "Cambridge, MA",
+  year =         "1996",
+}
+
+@InProceedings{Mel+Koch90,
+  author =       "Bartlett W. Mel and Christof Koch",
+  editor =       NIPS2ed,
+  booktitle =    NIPS2,
+  title =        "{Sigma}-{Pi} Learning: On Radial Basis Functions and
+                 Cortical Associative Learning",
+  publisher =    "Morgan Kaufmann",
+  pages =        "474--481",
+  year =         "1990",
+}
+
+@InProceedings{Melvilleetal,
+  author =       "P. Melville and R. J. Mooney and R. Nagarajan",
+  booktitle =    "Proceedings of the ACM SIGIR Workshop on Recommender
+                 Systems",
+  title =        "Content-boosted collaborative filtering",
+  month =        sep,
+  year =         "2001",
+  keywords =     "boosted collaborative filtering content",
+  location =     "New Orleans, LA",
+}
+
+@InProceedings{Memisevic+Hinton-2007,
+  author =       "Roland Memisevic and Geoffrey E. Hinton",
+  booktitle =    cvpr07,
+  title =        "Unsupervised learning of image transformations",
+  year =         "2007",
+}
+
+@PhdThesis{Memisevic-thesis,
+  author =       "Roland Memisevic",
+  title =        "Non-linear latent factor models for revealing
+                 structure in high-dimensional data",
+  school =       "Department of Computer Science, University of
+                 Toronto",
+  address =      "Toronto, Ontario, Canada",
+  year =         "2007",
+}
+
+@Book{Mendelson97,
+  author =       "E. Mendelson",
+  title =        "Introduction to Mathematical Logic",
+  edition =      "Fourth",
+  publisher =    "Chapman \& Hall",
+  year =         "1997",
+}
+
+@InProceedings{Merkel-1994,
+  author =       "Magnus Merkel and Bernt Nilsson and Lars Ahrenberg",
+  booktitle =    "Proceedings of the 4th Workshop on Very Large
+                 Corpora",
+  title =        "A Phrase-Retrieval System Based on Recurrence",
+  address =      "Tokyo, Japan",
+  year =         "1994",
+}
+
+@InProceedings{Merkel-2000,
+  author =       "Magnus Merkel and Mikael Andersson",
+  booktitle =    "Proceedings of RIAO'2000",
+  title =        "Knowledge-lite extraction of multi-word units with
+                 language filters and entropy thresholds",
+  volume =       "1",
+  pages =        "737--746",
+  year =         "2000",
+}
+
+@InProceedings{Merlo86,
+  author =       "E. Merlo and R. De Mori and G. Mercier and M.
+                 Palakal",
+  booktitle =    icassp,
+  title =        "A continuous parameter and frequency domain based
+                 {Markov} model",
+  pages =        "1597--1600",
+  year =         "1986",
+}
+
+@article{Merzenich-2000,
+    title = {Seeing in the Sound Zone},
+    author = {M. Merzenich},
+    journal = {Nature},
+    pages = {820--821},
+    volume = {404},
+    year = {2000},
+}
+
+@Article{Metropolis53,
+  author =       "N. Metropolis and A. W. Rosenbluth and M. N.
+                 Rosenbluth and A. H. Teller and E. Teller",
+  title =        "Equation of State Calculations by Fast Computing
+                 Machines",
+  journal =      jcp,
+  volume =       "21",
+  pages =        "1087--1092",
+  year =         "1953",
+}
+
+@Article{Mezard85,
+  author =       "M. M\'ezard and G. Parisi",
+  title =        "Replicas and Optimization",
+  journal =      jppl,
+  volume =       "46",
+  pages =        "771--778",
+  year =         "1985",
+}
+
+@Article{Mezard86,
+  author =       "M. M\'ezard and G. Parisi",
+  title =        "A Replica Analysis of the Travelling Salesman
+                 Problem",
+  journal =      jpp,
+  volume =       "47",
+  pages =        "1285--1296",
+  year =         "1986",
+}
+
+@Book{Mezard87,
+  author =       "M. M\'ezard and G. Parisi and M. A. Virasoro",
+  title =        "Spin Glass Theory and Beyond",
+  publisher =    "World Scientific",
+  address =      "Singapore",
+  year =         "1987",
+}
+
+@Article{Mezard88,
+  author =       "M. M\'ezard and G. Parisi",
+  title =        "The Euclidean Matching Problem",
+  journal =      jpp,
+  volume =       "49",
+  pages =        "2019--2025",
+  year =         "1988",
+}
+
+@Article{Mezard89,
+  author =       "M. M\'ezard and J.-P. Nadal",
+  title =        "Learning in Feedforward Layered Networks: The Tiling
+                 Algorithm",
+  journal =      jpa,
+  volume =       "22",
+  pages =        "2191--2204",
+  year =         "1989",
+}
+
+@Article{Micchelli-1986,
+  author =       "C. A. Micchelli",
+  title =        "Interpolation of scattered data: distance matrices and
+                 conditionally positive definite functions",
+  journal =      "Constructive Approximation",
+  volume =       "2",
+  pages =        "11--22",
+  year =         "1986",
+}
+
+@InProceedings{micchelli05,
+  author =       "Charles A. {Micchelli} and Massimiliano {Pontil}",
+  editor =       NIPS17ed,
+  booktitle =    NIPS17,
+  title =        "Kernels for Multi-task Learning",
+  publisher =    "MIT Press",
+  address =      "Cambridge, MA",
+  pages =        "921--928",
+  year =         "2005",
+}
+
+@InProceedings{Mihalcea2002,
+  author =       "Rada Mihalcea",
+  booktitle =    "Proceedings of the 6th Conference on Natural Language
+                 Learning",
+  title =        "Instance Based Learning with Automatic Feature
+                 Selection Applied to Word Sense Disambiguation",
+  year =         "2002",
+  URL =          "http://citeseer.nj.nec.com/587173.html",
+}
+
+@Article{Miikkulainen91,
+  author =       "R. Miikkulainen and M. G. Dyer",
+  title =        "Natural language processing with modular {PDP}
+                 networks and distributed lexicon",
+  journal =      "Cognitive Science",
+  volume =       "15",
+  pages =        "343--399",
+  year =         "1991",
+}
+
+@Article{Miller+Sachs83,
+  author =       "M. M. Miller and M. B. Sachs",
+  title =        "Representation of stop consonants in the discharge
+                 patterns of auditory nerve fibers",
+  journal =      jasa,
+  volume =       "74",
+  number =       "2",
+  pages =        "502--517",
+  year =         "1983",
+}
+
+@PhdThesis{miller02,
+  author =       "Erik G. Miller",
+  title =        "Learning from one example in machine vision by sharing
+                 probability densities",
+  school =       "Massachusetts Institute of Technology",
+  year =         "2002",
+}
+
+@PhdThesis{miller02one,
+  author =       "Erik Miller",
+  title =        "Learning from one example in machine vision by sharing
+                 probability densities",
+  school =       "Massachusetts Institute of Technology, Department of
+                 Electrical Engineering and Computer Science",
+  year =         "2002",
+}
+
+@InProceedings{Miller89,
+  author =       "G. F. Miller and P. M. Todd and S. U. Hegde",
+  editor =       "J. D. Schaffer",
+  booktitle =    "Proceedings of the Third International Conference on
+                 Genetic Algorithms",
+  title =        "Designing Neural Networks Using Genetic Algorithms",
+  publisher =    "Morgan Kaufmann, San Mateo",
+  address =      "Arlington 1989",
+  pages =        "379--384",
+  year =         "1989",
+}
+
+@Article{MillerD1996,
+  author =       "David Miller and Kenneth Rose",
+  title =        "Hierarchical, unsupervised learning with growing via
+                 phase transitions",
+  journal =      "Neural Computation",
+  volume =       "8",
+  number =       "2",
+  publisher =    "MIT Press",
+  address =      "Cambridge, MA, USA",
+  pages =        "425--450",
+  year =         "1996",
+  ISSN =         "0899-7667",
+}
+
+@Article{Miller-ijprai93,
+  author =       "C. B. Miller and C. L. Giles",
+  title =        "Experimental Comparison of the Effect of Order in
+                 Recurrent Neural Networks",
+  journal =      "Int. Journal of Pattern Recognition and Artificial
+                 Intelligence",
+  pages =        "205--228",
+  year =         "1993",
+  note =         "Special Issue on Applications of Neural Networks to
+                 Pattern Recognition (I. Guyon Ed.)",
+}
+
+@Book{Minc-88,
+  author =       "H. Minc",
+  title =        "Nonnegative Matrices",
+  publisher =    "John Wiley \& Sons",
+  address =      "New York",
+  year =         "1988",
+}
+
+@Book{Minsky67,
+  author =       "M. L. Minsky",
+  title =        "Computation: Finite and Infinite Machines",
+  publisher =    "Prentice-Hall",
+  address =      "Englewood Cliffs",
+  year =         "1967",
+}
+
+@Book{Minsky69,
+  author =       "M. L. Minsky and S. A. Papert",
+  title =        "Perceptrons",
+  publisher =    "MIT Press",
+  address =      "Cambridge",
+  year =         "1969",
+}
+
+@Article{Misra-1997,
+  author =       "Manavendra Misra",
+  title =        "Parallel Environments for Implementing Neural
+                 Networks",
+  journal =      "Neural Computing Surveys",
+  volume =       "1",
+  pages =        "48--60",
+  year =         "1997",
+}
+
+@Article{Mitchison89,
+  author =       "G. J. Mitchison and R. M. Durbin",
+  title =        "Bounds on the Learning Capacity of Some Multi-Layer
+                 Networks",
+  journal =      biocyb,
+  volume =       "60",
+  pages =        "345--356",
+  year =         "1989",
+}
+
+@Article{ML:Bauer:boost,
+  author =       "Eric Bauer and Ron Kohavi",
+  title =        "An empirical comparison of voting classification
+                 algorithms: Bagging, Boosting, and variants",
+  journal =      "Machine Learning",
+  year =         "1998",
+}
+
+@Article{ML:Breiman:bagging,
+  author =       "Leo Breiman",
+  title =        "Bagging Predictors",
+  journal =      "Machine Learning",
+  volume =       "24",
+  number =       "2",
+  pages =        "123--140",
+  year =         "1996",
+}
+
+@Article{ML:Dietterich:adaboost+noise,
+  author =       "Thomas G. Dietterich",
+  title =        "An experimental comparison of three methods for
+                 constructing ensembles of decision trees: Bagging,
+                 Boosting, and randomization",
+  journal =      "Machine Learning",
+  year =         "1998",
+  note =         "Submitted. Available at {\tt
+                 ftp://ftp.cs.orst.edu/pub/tgd/papers/tr-randomized-c4.ps.gz}",
+}
+
+@Article{ML:Schapire:weaklearn,
+  author =       "Robert E. Schapire",
+  title =        "The strength of weak learnability",
+  journal =      "Machine Learning",
+  volume =       "5",
+  number =       "2",
+  pages =        "197--227",
+  year =         "1990",
+}
+
+@Misc{MLJ-model-selection-combination-2001,
+  author =       "Y. Bengio and D. Schuurmans",
+  title =        "Special Issue on New methods for model selection and
+                 model combination",
+  year =         "2002",
+  note =         "{\em Machine Learning}, 48(1)",
+}
+
+@InProceedings{Mnih+Hinton-2007,
+  author =       "Andriy Mnih and Geoffrey E. Hinton",
+  booktitle =    ICML07,
+  editor =       ICML07ed,
+  publisher =    ICML07publ,
+  title =        "Three New Graphical Models for Statistical Language
+                 Modelling",
+  pages =        "641--648",
+  year =         "2007",
+}
+
+@InProceedings{Mnih+Hinton-2007-small,
+  author =       "Andriy Mnih and Geoffrey E. Hinton",
+  booktitle =    "ICML 2007",
+  title =        "Three New Graphical Models for Statistical Language
+                 Modelling",
+  year =         "2007",
+}
+
+@InProceedings{Mnih+Hinton-2009,
+  author =       "Andriy Mnih and Geoffrey E. Hinton",
+  booktitle =    NIPS21,
+  editor =       NIPS21ed,
+  title =        {A Scalable Hierarchical Distributed Language Model},
+  pages =        {1081--1088},
+  year =         "2009",
+}
+
+@InProceedings{mohri-pereira-riley96,
+  author =       "M. Mohri and F. C. N. Pereira and M. D. Riley",
+  booktitle =    "ECAI 96, 12th European Conference on Artificial
+                 Intelligence",
+  title =        "Weighted automata in text and speech processing",
+  pages =        "",
+  year =         "1996",
+}
+
+@Article{Mohri96,
+  author =       "M. Mohri",
+  title =        "Finite-State Transducers in Language and Speech
+                 Processing",
+  journal =      "Computational Linguistics",
+  volume =       "23",
+  number =       "2",
+  pages =        "269--311",
+  year =         "1997",
+}
+
+@InProceedings{Molina02,
+  author =       "A. Molina and F. Pla and E. Segarra and L. Moreno",
+  booktitle =    "Proceedings of 3rd International Conference on
+                 Language Resources and Evaluation, {LREC2002}",
+  title =        "Word Sense Disambiguation using Statistical Models
+                 and {WordNet}",
+  address =      "Las Palmas de Gran Canaria, Spain",
+  year =         "2002",
+}
+
+@PhdThesis{moller,
+  author =       "M. M{\o}ller",
+  title =        "Efficient Training of Feed-Forward Neural Networks",
+  school =       "Aarhus University",
+  address =      "Aarhus, Denmark",
+  year =         "1993",
+}
+
+@InProceedings{moller-92,
+  author =       "M. M{\o}ller",
+  booktitle =    "Neural Networks for Signal Processing 2",
+  title =        "Supervised Learning on Large Redundant Training Sets",
+  publisher =    "IEEE press",
+  year =         "1992",
+}
+
+@InProceedings{Momma2003,
+  author =       "M. Momma and K. P. Bennett",
+  booktitle =    colt03,
+  title =        "Sparse Kernel Partial Least Squares Regression",
+  year =         "2003",
+}
+
+@InProceedings{Montana89,
+  author =       "D. J. Montana and L. Davis",
+  editor =       "N. S. Sridharan",
+  booktitle =    "Eleventh International Joint Conference on Artificial
+                 Intelligence",
+  title =        "Training Feedforward Networks Using Genetic
+                 Algorithms",
+  publisher =    "Morgan Kaufmann, San Mateo",
+  address =      "Detroit 1989",
+  pages =        "762--767",
+  year =         "1989",
+}
+
+@InProceedings{Moody88,
+  author =       "J. Moody and C. Darken",
+  editor =       "D. Touretzky and G. Hinton and T. Sejnowski",
+  booktitle =    cmss88,
+  title =        "Learning with Localized Receptive Fields",
+  publisher =    "Morgan Kaufmann, San Mateo",
+  address =      "Pittsburgh 1988",
+  pages =        "133--143",
+  year =         "1988",
+}
+
+@Article{Moody89,
+  author =       "J. Moody and C. Darken",
+  title =        "Fast Learning in Networks of Locally-Tuned Processing
+                 Units",
+  journal =      nc,
+  volume =       "1",
+  pages =        "281--294",
+  year =         "1989",
+}
+
+@InProceedings{Moody92,
+  author =       "J. E. Moody",
+  editor =       NIPS4ed,
+  booktitle =    NIPS4,
+  title =        "The Effective Number of Parameters: An Analysis of
+                 Generalization and Regularization in Nonlinear Learning
+                 Systems",
+  publisher =    "Morgan Kaufmann",
+  address =      "San Mateo, CA",
+  pages =        "847--854",
+  year =         "1992",
+}
+
+@InProceedings{Moody92b,
+  author =       "J. Moody and J. Utans",
+  editor =       NIPS4ed,
+  booktitle =    NIPS4,
+  title =        "Principled architecture selection for neural networks:
+                 application to corporate bond rating prediction",
+  publisher =    "Morgan Kaufmann",
+  address =      "San Mateo, CA",
+  pages =        "683--690",
+  year =         "1992",
+}
+
+@Article{moody93,
+  author =       "J. Moody and U. Levin and S. Rehfuss",
+  title =        "Predicting the {U.S.} Index of Industrial Production",
+  journal =      "Neural Network World",
+  volume =       "3",
+  number =       "6",
+  pages =        "791--794",
+  year =         "1993",
+}
+
+@InCollection{Moody94,
+  author =       "J. Moody",
+  booktitle =    "From Statistics to Neural Networks: Theory and Pattern
+                 Recognition Applications",
+  title =        "Prediction Risk and Architecture Selection for Neural
+                 Networks",
+  publisher =    "Springer",
+  year =         "1994",
+}
+
+@InCollection{Moody98,
+  author =       "J. Moody",
+  editor =       "G. B. Orr and K.-R. M{\"u}ller",
+  booktitle =    "Neural Networks: Tricks of the Trade",
+  title =        "Forecasting the economy with neural nets: a survey of
+                 challenges",
+  publisher =    "Springer",
+  pages =        "347--372",
+  year =         "1998",
+}
+
+@InProceedings{Moore88,
+  author =       "B. Moore",
+  editor =       "D. Touretzky and G. Hinton and T. Sejnowski",
+  booktitle =    cmss88,
+  title =        "{ART}1 and Pattern Clustering",
+  publisher =    "Morgan Kaufmann, San Mateo",
+  address =      "Pittsburgh 1988",
+  pages =        "174--185",
+  year =         "1988",
+}
+
+@InProceedings{MoosmannF2007,
+  author =       "Frank Moosmann and Bill Triggs and Frederic Jurie",
+  editor =       NIPS19ed,
+  booktitle =    NIPS19,
+  title =        "Fast Discriminative Visual Codebooks using Randomized
+                 Clustering Forests",
+  publisher =    "MIT Press",
+  address =      "Cambridge, MA",
+  pages =        "985--992",
+  year =         "2007",
+}
+
+@InCollection{More+Wu-1996,
+  author =       "Jorge More and Zhijun Wu",
+  editor =       "G. Di Pillo and F. Giannessi",
+  booktitle =    "Nonlinear Optimization and Applications",
+  title =        "Smoothing techniques for macromolecular global
+                 optimization",
+  publisher =    "Plenum Press",
+  year =         "1996",
+}
+
+@InProceedings{Morgan+Bourlard90b,
+  author =       "N. Morgan and H. Bourlard",
+  editor =       NIPS2ed,
+  booktitle =    NIPS2,
+  title =        "Generalization and parameter estimation in feedforward
+                 nets: some experiments",
+  publisher =    "Morgan Kaufmann",
+  address =      "San Mateo, CA",
+  pages =        "630--637",
+  year =         "1990",
+}
+
+@InProceedings{Morgan90,
+  author =       "N. Morgan and H. Bourlard",
+  booktitle =    icassp,
+  title =        "Continuous Speech Recognition Using Multilayer
+                 Perceptrons with Hidden {Markov} Models",
+  address =      "Albuquerque, NM",
+  pages =        "413--416",
+  year =         "1990",
+}
+
+@InProceedings{Morgan93,
+  author =       "M. Cohen and H. Franco and N. Morgan and D. Rumelhart
+                 and V. Abrash",
+  editor =       NIPS5ed,
+  booktitle =    NIPS5,
+  title =        "Context-Dependent Multiple Distribution Phonetic
+                 Modeling with {MLP}s",
+  publisher =    "Morgan Kaufmann",
+  address =      "San Mateo, CA",
+  pages =        "649--657",
+  year =         "1993",
+}
+
+@InProceedings{Morgan95,
+  author =       "N. Morgan and Y. Konig and S. L. Wu and H. Bourlard",
+  booktitle =    "Proceedings of IEEE Automatic Speech Recognition
+                 Workshop (Snowbird)",
+  title =        "Transition-based Statistical Training for {ASR}",
+  pages =        "133--134",
+  year =         "1995",
+}
+
+@InProceedings{Morin+Bengio-2005,
+  author =       "Fr{\'e}d{\'e}ric Morin and Yoshua Bengio",
+  editor =       aistats05ed,
+  booktitle =    aistats05,
+  title =        "Hierarchical Probabilistic Neural Network Language
+                 Model",
+  publisher =    "Society for Artificial Intelligence and Statistics",
+  date =         "Jan 6-8, 2005",
+  location =     "Savannah Hotel, Barbados",
+  pages =        "246--252",
+  year =         "2005",
+}
+
+@Article{Mosesova-2006,
+  author =       "S. A. Mosesova and H. A. Chipman and R. J. MacKay and
+                 S. H. Steiner",
+  title =        "Profile monitoring using mixed effects models",
+  journal =      "Submitted to Technometrics",
+  year =         "2006",
+}
+
+@Article{MosesY1996,
+  author =       "Y. Moses and S. Ullman and S. Edelman",
+  title =        "Generalization to novel images in upright and inverted
+                 faces",
+  journal =      "Perception",
+  volume =       "25",
+  number =       "4",
+  pages =        "443--461",
+  year =         "1996",
+  OPTannote =    "",
+  OPTkey =       "",
+  OPTmonth =     "",
+  OPTnote =      "",
+}
+
+@Article{Movellan-2002,
+  author =       "Javier R. Movellan and Paul Mineiro and R. J. Williams",
+  title =        "A Monte-Carlo {EM} approach for partially observable
+                 diffusion processes: theory and applications to neural
+                 networks",
+  journal =      "Neural Computation",
+  volume =       "14",
+  pages =        "1501--1544",
+  year =         "2002",
+}
+
+@TechReport{Movelland+McClelland91,
+  author =       "Javier R. Movellan and James L. McClelland",
+  title =        "Learning Continuous Probability Distributions with the
+                 Contrastive {Hebbian} Algorithm",
+  number =       "PDP.CNS.91.2",
+  institution =  "Carnegie Mellon University, Dept. of Psychology",
+  address =      "Pittsburgh, PA",
+  year =         "1991",
+}
+
+@InCollection{Mozer+Smolensky89,
+  author =       "M. C. Mozer and P. Smolensky",
+  editor =       NIPS1ed,
+  booktitle =    NIPS1,
+  title =        "Skeletonization: {A} technique for trimming the fat
+                 from a network via relevance assessment",
+  publisher =    "Morgan Kaufmann",
+  pages =        "107--115",
+  year =         "1989",
+}
+
+@InProceedings{Mozer-nips92,
+  author =       "M. C. Mozer",
+  editor =       NIPS4ed,
+  booktitle =    NIPS4,
+  title =        "The induction of Multiscale Temporal Structure",
+  publisher =    "Morgan Kaufmann",
+  address =      "San Mateo, CA",
+  pages =        "275--282",
+  year =         "1992",
+}
+
+@Article{mozer-smolensky-89,
+  author =       "M. C. Mozer and P. Smolensky",
+  key =          "Mozer",
+  title =        "Using relevance to reduce network size automatically",
+  journal =      "Connection Science",
+  volume =       "1",
+  number =       "1",
+  pages =        "3--16",
+  year =         "1989",
+}
+
+@Article{Mozer-trnn2000,
+  author =       "M. C. Mozer and R. Wolniewicz and D. B. Grimes and E.
+                 Johnson and H. Kaushansky",
+  title =        "Predicting Subscriber Dissatisfaction and Improving
+                 Retention in the Wireless Telecommunications Industry",
+  journal =      "IEEE Transactions on Neural Networks, special issue on
+                 Data Mining and Knowledge Discovery",
+  volume =       "11",
+  number =       "3",
+  year =         "2000",
+}
+
+@Article{Mozer89,
+  author =       "M. C. Mozer",
+  title =        "A Focused Back-Propagation Algorithm for Temporal
+                 Pattern Recognition",
+  journal =      cs,
+  volume =       "3",
+  pages =        "349--381",
+  year =         "1989",
+}
+
+@InCollection{Mozer93,
+  author =       "M. C. Mozer",
+  editor =       "A. Weigend and N. Gershenfeld",
+  booktitle =    "Predicting the Future and Understanding the Past",
+  title =        "Neural net architectures for temporal sequence
+                 processing",
+  publisher =    "Addison-Wesley",
+  address =      "Redwood City, CA",
+  pages =        "243--264",
+  year =         "1993",
+}
+
+@TechReport{MPIforum,
+  author =       "Jack Dongarra and David Walker and {The Message
+                 Passing Interface Forum}",
+  title =        "{MPI}: {A} Message Passing Interface Standard",
+  url =          "http://www-unix.mcs.anl.gov/mpi",
+  institution =  "University of Tennessee",
+  year =         "1995",
+}
+
+@Article{multidimensional-FGS-83,
+  author =       "J. H. Friedman and E. Grosse and W. Stuetzle",
+  title =        "Multidimensional additive spline approximation",
+  journal =      "SIAM Journal on Scientific and Statistical Computing",
+  volume =       "4",
+  number =       "2",
+  pages =        "291--301",
+  year =         "1983",
+}
+
+@InProceedings{Munro87,
+  author =       "P. Munro",
+  booktitle =    "The Ninth Annual Conference of the Cognitive Science
+                 Society",
+  title =        "A Dual Back-Propagation Scheme for Scalar Reward
+                 Learning",
+  publisher =    "Lawrence Erlbaum, Hillsdale",
+  address =      "Seattle 1987",
+  pages =        "165--176",
+  year =         "1987",
+}
+
+@InProceedings{MurraySal09,
+author=         "Iain Murray and Ruslan Salakhutdinov",
+title=          "Evaluating probabilities under high-dimensional latent variable models",
+editor =        NIPS21ed,
+booktitle=      NIPS21,
+volume=         "21",
+pages =         "1137--1144",
+year=           "2009"
+}
+
+@InProceedings{Murveit93,
+  author =       "H. Murveit and J. Butzberger and V. Digalakis and M.
+                 Weintraub",
+  booktitle =    icassp,
+  title =        "Large-vocabulary dictation using {SRI}'s {DECIPHER}
+                 speech recognition system: Progressive search
+                 techniques knowledge for continuous speech
+                 recognition",
+  address =      "Minneapolis, Minnesota",
+  pages =        "319--322",
+  year =         "1993",
+}
+
+@Article{Muselli97,
+  author =       "M. Muselli",
+  title =        "On convergence properties of pocket algorithm",
+  journal =      "IEEE Transactions on Neural Networks",
+  volume =       "8",
+  pages =        "623--629",
+  year =         "1997",
+}
+
+@article{Mutch-Lowe-2008,
+ author = {Jim Mutch and David G. Lowe}, 
+ title = {Object class recognition and localization using sparse features with limited receptive fields}, 
+ journal = {International Journal of Computer Vision}, 
+ volume = 80, 
+ number = 1,
+ year = 2008, 
+ pages = {45--57},
+}
+
+@Article{myles90multiclass,
+  author =       "J. Myles and D. Hand",
+  title =        "The Multi-Class Measure Problem in Nearest Neighbour
+                 Discrimination Rules",
+  journal =      "Pattern Recognition",
+  volume =       "23",
+  pages =        "1291--1297",
+  year =         "1990",
+}
+
+@Article{Nadal86,
+  author =       "J.-P. Nadal and J.-P. Changeux and G. Toulouse and S.
+                 Dehaene",
+  title =        "Networks of Formal Neurons and Memory Palimpsests",
+  journal =      eul,
+  volume =       "1",
+  pages =        "535--542",
+  year =         "1986",
+}
+
+@Article{Nadaraya64,
+  author =       "E. A. Nadaraya",
+  title =        "On estimating regression",
+  journal =      "Theory of Probability and its Applications",
+  volume =       "9",
+  pages =        "141--142",
+  year =         "1964",
+}
+
+@Article{Nadaraya65,
+  author =       "E. A. Nadaraya",
+  title =        "On nonparametric estimates of density functions and
+                 regression curves",
+  journal =      "Theory of Probability and its Applications",
+  volume =       "10",
+  pages =        "186--190",
+  year =         "1965",
+}
+
+@Article{Nadas85,
+  author =       "Arthur N{\'a}das",
+  title =        "On {Turing's} Formula for Word Probabilities",
+  journal =      "IEEE Transactions on Acoustics, Speech, and Signal
+                 Processing",
+  volume =       "33",
+  number =       "6",
+  pages =        "1415--1417",
+  month =        dec,
+  year =         "1985",
+  copy =         yes,
+}
+
+@Article{Nadas85-small,
+  author =       "Arthur N{\'a}das",
+  title =        "On {Turing's} Formula for Word Probabilities",
+  journal =      "ASSP",
+  volume =       "33",
+  number =       "6",
+  pages =        "1415--1417",
+  month =        dec,
+  year =         "1985",
+  copy =         yes,
+}
+
+@Article{Nadas88,
+  author =       "A. Nadas and D. Nahamoo and M. A. Picheny",
+  title =        "On a model-robust training method for speech
+                 recognition",
+  journal =      "IEEE Transactions on Acoustics, Speech and Signal
+                 Processing",
+  volume =       "ASSP-36",
+  number =       "9",
+  pages =        "1432--1436",
+  year =         "1988",
+}
+
+@Article{Nadeau-Bengio-2003,
+  author =       "Claude Nadeau and Yoshua Bengio",
+  title =        "Inference for the Generalization Error",
+  journal =      "Machine Learning",
+  volume =       "52",
+  number =       "3",
+  pages =        "239--281",
+  year =         "2003",
+}
+
+@Article{Nadeau-Bengio-2003-small,
+  author =       "Claude Nadeau and Yoshua Bengio",
+  title =        "Inference for the Generalization Error",
+  journal =      "Machine Learning",
+  volume =       "52(3)",
+  pages =        "239--281",
+  year =         "2003",
+}
+
+@InProceedings{Nadeau00-nips,
+  author =       "Claude Nadeau and Yoshua Bengio",
+  editor =       NIPS12ed,
+  booktitle =    NIPS12,
+  title =        "Inference for the Generalization Error",
+  publisher =    "MIT Press",
+  pages =        "307--313",
+  year =         "2000",
+}
+
+@InProceedings{Bonneville+al-1998,
+  author =       "M. Bonneville and J. Meunier and Y. Bengio and J. P. Soucy",
+  booktitle =    "SPIE Medical Imaging 1998",
+  title =        "Support Vector Machines for Improving the classification of Brain {PET} Images",
+  address =      "San Diego",
+  year =         "1998",
+}
+
+@TechReport{Nadeau99-TR,
+  author =       "Claude Nadeau and Yoshua Bengio",
+  title =        "Inference for the Generalization Error",
+  institution =  "CIRANO",
+  address =      "Montreal, Quebec, Canada",
+  year =         "1999",
+}
+
+@InProceedings{nag86,
+  author =       "R. Nag and K. H. Wong and F. Fallside",
+  booktitle =    icassp,
+  title =        "Script recognition using hidden {Markov} models",
+  address =      "Tokyo",
+  pages =        "2071--2074",
+  year =         "1986",
+}
+
+@MastersThesis{Nahm-2005,
+ author = {E. Nahm},
+ title = {Classification models for transactional graph data},
+ school = {Department of Mathematics and Statistics, Acadia University},
+ year = 2005,
+}
+
+@article{Naka-Rushton-1966a,
+ author = {K.I. Naka and W.A.H. Rushton},
+ year = 1966,
+ title = {{S}-potentials from colour units in the retina of fish (Cyprinidae)},
+ journal = {J. Physiol.},
+ volume = 185,
+ pages = {536--555},
+}
+
+@article{Naka-Rushton-1966b,
+ author = {K.I. Naka and W.A.H. Rushton},
+ year = 1966,
+ title = {An attempt to analyse colour perception by electrophysiology},
+ journal = {J. Physiol.},
+ volume = 185,
+ pages = {556--586},
+}
+
+
+@InProceedings{NakagawaT04,
+  author =       "Tetsuji Nakagawa and Taku Kudoh and Yuji Matsumoto",
+  booktitle =    "Proceedings of the Sixth Natural Language Processing
+                 Pacific Rim Symposium",
+  title =        "Unknown Word Guessing and Part-of-Speech Tagging Using
+                 Support Vector Machines",
+  address =      "Tokyo, Japan",
+  pages =        "325--331",
+  year =         "2001",
+}
+
+@Article{Naradraya70,
+  author =       "E. A. Nadaraya",
+  title =        "Remarks on nonparametric estimates for density
+                 functions and regression curves",
+  journal =      "Theory of Probability and its Applications",
+  volume =       "15",
+  pages =        "134--137",
+  year =         "1970",
+}
+
+@Book{Narendra89,
+  author =       "K. Narendra and M. A. L. Thathachar",
+  title =        "Learning Automata: An Introduction",
+  publisher =    "Prentice-Hall",
+  address =      "Englewood Cliffs",
+  year =         "1989",
+}
+
+@Book{narendra:1989,
+  author =       "K. S. Narendra and M. A. L. Thathachar",
+  title =        "Learning Automata: an introduction",
+  publisher =    "Prentice Hall",
+  year =         "1989",
+}
+
+@Article{Nasrabadi88a,
+  author =       "N. M. Nasrabadi and R. A. King",
+  title =        "Image Coding Using Vector Quantization: {A} Review",
+  journal =      ieeetcomm,
+  volume =       "36",
+  pages =        "957--971",
+  year =         "1988",
+}
+
+@InProceedings{Nasrabadi88b,
+  author =       "N. M. Nasrabadi and Y. Feng",
+  booktitle =    icnn,
+  title =        "Vector Quantization of Images Based upon the Kohonen
+                 Self-Organizing Feature Maps",
+  volume =       "1",
+  publisher =    "IEEE, New York",
+  address =      "San Diego 1988",
+  pages =        "101--108",
+  year =         "1988",
+}
+
+@Article{Nass75,
+  author =       "M. M. Nass and L. N. Cooper",
+  title =        "A Theory for the Development of Feature Detecting
+                 Cells in Visual Cortex",
+  journal =      biocyb,
+  volume =       "19",
+  pages =        "1--18",
+  year =         "1975",
+}
+
+@Article{Naylor88,
+  author =       "J. Naylor and K. P. Li",
+  title =        "Analysis of a Neural Network Algorithm for Vector
+                 Quantization of Speech Parameters",
+  journal =      nnsupp,
+  volume =       "1",
+  pages =        "310",
+  year =         "1988",
+}
+
+@Article{NC:Baldi93,
+  author =       "P. Baldi and Y. Chauvin",
+  title =        "Neural Networks for Fingerprint Recognition",
+  journal =      "Neural Computation",
+  volume =       "5",
+  type =         "Letter",
+  number =       "3",
+  pages =        "402--418",
+  year =         "1993",
+}
+
+@Article{nc:Geman+Bienenstock+Doursat:1992,
+  author =       "S. Geman and E. Bienenstock and R. Doursat",
+  title =        "Neural Networks and the Bias/Variance Dilemma",
+  journal =      "Neural Computation",
+  volume =       "4",
+  type =         "View",
+  number =       "1",
+  pages =        "1--58",
+  year =         "1992",
+}
+
+@Article{nc:Poggio+Girosi:1998,
+  author =       "Tomaso Poggio and Federico Girosi",
+  title =        "A Sparse Representation for Function Approximation",
+  journal =      "Neural Computation",
+  volume =       "10",
+  number =       "6",
+  pages =        "1445--1454",
+  year =         "1998",
+}
+
+@TechReport{Neal-GP97,
+  author =       "Radford M. Neal",
+  title =        "Monte Carlo implementation of {G}aussian process models
+                 for {Bayesian} regression and classification",
+  number =       "9702",
+  institution =  "University of Toronto, Department of Statistics",
+  year =         "1997",
+}
+
+@Article{Neal92,
+  author =       "Radford M. Neal",
+  title =        "Connectionist learning of belief networks",
+  journal =      "Artificial Intelligence",
+  volume =       "56",
+  pages =        "71--113",
+  year =         "1992",
+}
+
+@InProceedings{Neal93a,
+  author =       "Radford M. Neal",
+  editor =       NIPS5ed,
+  booktitle =    NIPS5,
+  title =        "Bayesian learning via stochastic dynamics",
+  address =      "Denver, CO",
+  pages =        "475--482",
+  year =         "1993",
+}
+
+@TechReport{Neal93b,
+  author =       "Radford M. Neal",
+  title =        "Probabilistic inference using {Markov} chain
+                 {Monte-Carlo} methods",
+  number =       "{CRG-TR}-93-1",
+  institution =  "Dept. of Computer Science, University of Toronto",
+  year =         "1993",
+}
+
+@PhdThesis{Neal94,
+  author =       "Radford M. Neal",
+  title =        "Bayesian Learning for Neural Networks",
+  school =       "Dept. of Computer Science, University of Toronto",
+  year =         "1994",
+}
+
+@TechReport{Neal94b,
+  author =       "Radford M. Neal",
+  title =        "Sampling from Multimodal Distributions Using Tempered Transitions",
+  number =       "9421",
+  institution =  "Dept. of Statistics, University of Toronto",
+  year =         "1994",
+}
+
+@InCollection{Neal98,
+  author =       "Radford M. Neal",
+  editor =       "C. M. Bishop",
+  booktitle =    "Neural Networks and Machine Learning",
+  title =        "Assessing relevance determination methods using
+                 {DELVE}",
+  publisher =    "Springer-Verlag",
+  pages =        "97--129",
+  year =         1998,
+}
+
+@Misc{neal98assessing,
+  author =       "Radford M. Neal",
+  title =        "Assessing Relevance Determination Methods Using
+                 {DELVE} Generalization in Neural Networks and Machine
+                 Learning",
+  year =         "1998",
+  text =         "Neal, R. N. (1998). Assessing Relevance Determination
+                 Methods Using DELVE Generalization in Neural Networks
+                 and Machine Learning, C. M. Bishop (editor),
+                 SpringerVerlag.",
+}
+
+@article{Neal-2001,
+  author =      "Radford M. Neal",
+  journal =     "Statistics and Computing",
+  month =       apr,
+  number =      "2",
+  pages =       "125--139",
+  title =       "Annealed importance sampling",
+  url =         "http://dx.doi.org/10.1023/A:1008923215028",
+  volume =      "11",
+  year =        "2001"
+}
+
+@Article{Needleman+Wunsch70,
+  author =       "S. B. Needleman and C. D. Wunsch",
+  title =        "A general method applicable to the search of
+                 similarities in the amino acid sequence of two
+                 proteins",
+  journal =      "Journal of Molecular Biology",
+  volume =       "48",
+  pages =        "443--453",
+  year =         "1970",
+}
+
+@Article{NeweyWest1987,
+  author =       "W. Newey and K. West",
+  title =        "A Simple, Positive Semi-Definite, Heteroscedasticity
+                 and Autocorrelation Consistent Covariance Matrix",
+  journal =      "Econometrica",
+  volume =       "55",
+  pages =        "703--708",
+  year =         "1987",
+}
+
+@InProceedings{Ney+Kneser93,
+  author =       "Hermann Ney and Reinhard Kneser",
+  booktitle =    "European Conference on Speech Communication and
+                 Technology (Eurospeech)",
+  title =        "Improved clustering techniques for class-based
+                 statistical language modelling",
+  address =      "Berlin",
+  pages =        "973--976",
+  year =         "1993",
+}
+
+@Article{Ney92,
+  author =       "H. Ney and D. Mergel and A. Noll and A. Paesler",
+  title =        "Data driven search organization for continuous speech
+                 recognition",
+  journal =      "IEEE Transactions on Signal Processing",
+  volume =       "40",
+  number =       "2",
+  pages =        "272--281",
+  month =        feb,
+  year =         "1992",
+}
+
+@InProceedings{Ng1996,
+  author =       "Hwee Tou Ng and Hian Beng Lee",
+  editor =       "Aravind Joshi and Martha Palmer",
+  booktitle =    "Proceedings of the Thirty-Fourth Annual Meeting of the
+                 Association for Computational Linguistics",
+  title =        "Integrating Multiple Knowledge Sources to Disambiguate
+                 Word Sense: An Exemplar-Based Approach",
+  publisher =    "Morgan Kaufmann Publishers",
+  address =      "San Francisco",
+  pages =        "40--47",
+  year =         "1996",
+  URL =          "citeseer.nj.nec.com/ng96integrating.html",
+}
+
+@InProceedings{Ng1997,
+  author =       "Hwee Tou Ng",
+  booktitle =    SIGLEX97,
+  title =        "Getting Serious about Word Sense Disambiguation",
+  address =      "Washington",
+  pages =        "1--7",
+  year =         "1997",
+}
+
+@InProceedings{Ng2002,
+  author =       "Andrew Y. Ng and Michael I. Jordan and Yair Weiss",
+  editor =       NIPS14ed,
+  booktitle =    NIPS14,
+  title =        "On Spectral Clustering: analysis and an algorithm",
+  publisher =    "MIT Press",
+  address =      "Cambridge, MA",
+  year =         "2002",
+  original =     "orig/AA35.ps",
+}
+
+@InProceedings{Ng2008,
+  author =       "Honglak Lee and Chaitanya Ekanadham and Andrew Y. Ng",
+  editor =       NIPS20ed,
+  booktitle =    NIPS20,
+  title =        "Sparse deep belief net model for visual area {V2}",
+  publisher =    "MIT Press",
+  address =      "Cambridge, MA",
+  year =         "2008",
+}
+
+@InProceedings{NgJ02,
+  author =       "Andrew Y. Ng and Michael I. Jordan",
+  booktitle =    NIPS14,
+  editor =       NIPS14ed,
+  title =        {On Discriminative vs. Generative Classifiers: A
+                 comparison of logistic regression and naive Bayes},
+  pages =        "841--848",
+  year =         "2002",
+}
+
+%% Fred: the following entry is deprecated because its key carries the year of the conference rather than the year of the paper.
+@InProceedings{NgJ01,
+  author =       "Andrew Y. Ng and Michael I. Jordan",
+  booktitle =    NIPS14,
+  editor =       NIPS14ed,
+  title =        {On Discriminative vs. Generative Classifiers: A
+                 comparison of logistic regression and naive Bayes},
+  pages =        "841--848",
+  year =         "2002",
+}
+
+@InProceedings{Nie99,
+  author =       "J. Y. Nie and M. Simard and P. Isabelle and R.
+                 Durand",
+  booktitle =    "22nd ACM-SIGIR",
+  title =        "Cross-Language Information Retrieval based on Parallel
+                 Texts and Automatic Mining of Parallel Texts in the
+                 Web",
+  address =      "Berkeley",
+  pages =        "74--81",
+  year =         "1999",
+}
+
+@INPROCEEDINGS{Niebles+Fei-Fei-2007,
+  AUTHOR =       "Niebles, J.C. and Fei-Fei, L.",
+  TITLE =        "A hierarchical model of shape and appearance for human action classification",
+  BOOKTITLE =    cvpr07,
+  YEAR =         "2007",
+}
+
+@Article{Nielsen96,
+  author =       "H. Nielsen and J. Engelbrecht and G. {von Heijne} and
+                 S. Brunak",
+  title =        "Defining a similarity threshold for a functional
+                 protein sequence pattern: the signal peptide cleavage
+                 site",
+  journal =      "Proteins",
+  pages =        "316--320",
+  year =         "1996",
+  volume =       "24",
+}
+
+@Article{Nielsen97,
+  author =       "H. Nielsen and J. Engelbrecht and S. Brunak and G.
+                 {von Heijne}",
+  title =        "Identification of prokaryotic and eukaryotic signal
+                 peptides and prediction of their cleavage sites",
+  journal =      "Prot. Eng.",
+  pages =        "1--6",
+  year =         "1997",
+  volume =       "10",
+}
+
+@InProceedings{Niesler98,
+  author =       "T. R. Niesler and E. W. D. Whittaker and P. C.
+                 Woodland",
+  booktitle =    icassp,
+  title =        "Comparison of part-of-speech and automatically derived
+                 category-based language models for speech recognition",
+  pages =        "177--180",
+  year =         "1998",
+}
+
+@InProceedings{Niles90,
+  author =       "L. T. Niles and H. F. Silverman",
+  booktitle =    icassp,
+  title =        "Combining Hidden {Markov} Models and Neural Network
+                 Classifiers",
+  address =      "Albuquerque, NM",
+  pages =        "417--420",
+  year =         "1990",
+}
+
+@Book{Nilsson-65,
+  author =       "N. J. Nilsson",
+  title =        "Learning Machines",
+  publisher =    "McGraw-Hill",
+  address =      "New York",
+  year =         "1965",
+}
+
+@Book{Nilsson-71,
+  author =       "N. J. Nilsson",
+  title =        "Problem-Solving Methods in Artificial Intelligence",
+  publisher =    "McGraw-Hill",
+  address =      "New York",
+  year =         "1971",
+}
+
+@InProceedings{nips-10:Baxter+Bartlett:1998,
+  author =       "Jonathan Baxter and Peter Bartlett",
+  editor =       NIPS10ed,
+  booktitle =    NIPS10,
+  title =        "The Canonical Distortion Measure in Feature Space and
+                 1-{NN} Classification",
+  publisher =    "MIT Press",
+  year =         "1998",
+}
+
+@InProceedings{nips-10:Holger+Yoshua:1998,
+  author =       "Holger Schwenk and Yoshua Bengio",
+  editor =       NIPS10ed,
+  booktitle =    NIPS10,
+  title =        "Training Methods for Adaptive Boosting of Neural
+                 Networks",
+  publisher =    "MIT Press",
+  pages =        "647--653",
+  year =         "1998",
+}
+
+@InProceedings{nips-6:Perrone:1994,
+  author =       "Michael P. Perrone",
+  editor =       NIPS6ed,
+  booktitle =    NIPS6,
+  title =        "Putting It All Together: Methods for Combining Neural
+                 Networks",
+  publisher =    "Morgan Kaufmann Publishers, Inc.",
+  pages =        "1188--1189",
+  year =         "1994",
+}
+
+@InProceedings{nips-9:Burges+Schoelkopf:1997,
+  author =       "Chris J. C. Burges and B. Sch{\"o}lkopf",
+  editor =       NIPS9ed,
+  booktitle =    NIPS9,
+  title =        "Improving the Accuracy and Speed of Support Vector
+                 Machines",
+  publisher =    "MIT Press",
+  pages =        "375",
+  year =         "1997",
+}
+
+@InProceedings{nips02-LT09,
+  author =       "G. Lebanon and J. Lafferty",
+  editor =       NIPS14ed,
+  booktitle =    NIPS14,
+  title =        "Boosting and Maximum Likelihood for Exponential
+                 Models",
+  publisher =    "MIT Press",
+  address =      "Cambridge, MA",
+  year =         "2002",
+  original =     "orig/LT09.ps",
+}
+
+@InCollection{NIPS2005-207,
+  author =       "Jian Zhang and Zoubin Ghahramani and Yiming Yang",
+  editor =       NIPS18ed,
+  booktitle =    NIPS18,
+  title =        "Learning Multiple Related Tasks using Latent
+                 Independent Component Analysis",
+  publisher =    "MIT Press",
+  address =      "Cambridge, MA",
+  pages =        "1587--1594",
+  year =         "2006",
+}
+
+@InCollection{NIPS2007-812-small,
+  author =       "Nicolas Chapados and Yoshua Bengio",
+  booktitle =    "NIPS 20",
+  title =        "Augmented Functional Time Series Representation and
+                 Forecasting with {G}aussian Processes",
+  pages =        "265--272",
+  year =         "2008",
+}
+
+@InCollection{NIPS2007-925-small,
+  author =       "Nicolas {Le Roux} and Yoshua Bengio and Pascal Lamblin
+                 and Marc Joliveau and Balazs Kegl",
+  booktitle =    "NIPS 20",
+  title =        "Learning the 2-{D} Topology of Images",
+  pages =        "841--848",
+  year =         "2008",
+}
+
+@InProceedings{NIPS8:Drucker:AdaBoost-Trees,
+  author =       "Harris Drucker and Corinna Cortes",
+  editor =       NIPS8ed,
+  booktitle =    NIPS8,
+  title =        "Boosting decision trees",
+  publisher =    "MIT Press",
+  pages =        "479--485",
+  year =         "1996",
+}
+
+@InProceedings{NIPS8:Hofmann-Tresp,
+  author =       "Reimar Hofmann and Volker Tresp",
+  editor =       NIPS8ed,
+  booktitle =    NIPS8,
+  title =        "Discovering structure in continuous variables using
+                 {Bayesian} networks",
+  publisher =    "MIT Press",
+  pages =        "500--506",
+  year =         "1996",
+}
+
+@InProceedings{NIPS9:Monti-Cooper,
+  author =       "Stefano Monti and Gregory F. Cooper",
+  editor =       NIPS9ed,
+  booktitle =    NIPS9,
+  title =        "Learning {Bayesian} belief networks with neural
+                 network estimators",
+  publisher =    "MIT Press",
+  pages =        "578--584",
+  year =         "1997",
+}
+
+@Article{Niranjan90,
+  author =       "M. Niranjan and F. Fallside",
+  title =        "Neural Networks and Radial Basis Functions in
+                 Classifying Static Speech Patterns",
+  journal =      cspla,
+  volume =       "4",
+  pages =        "275--289",
+  year =         "1990",
+}
+
+@Article{Nishimori90,
+  author =       "H. Nishimori and T. Nakamura and M. Shiino",
+  title =        "Retrieval of Spatio-Temporal Sequence in Asynchronous
+                 Neural Network",
+  journal =      prA,
+  volume =       "41",
+  pages =        "3346--3354",
+  year =         "1990",
+}
+
+@book{Nixon+Aguado+2007,
+    author = {Nixon, M. S. and Aguado, A. S.},
+    publisher = {Academic Press},
+    edition = {Second},
+    title = {Feature Extraction and Image Processing},
+    year = {2007}
+}
+
+@Article{nonparametric-LZ-95,
+  author =       "G. Lugosi and K. Zeger",
+  title =        "Nonparametric Estimation via Empirical Risk
+                 Minimization",
+  journal =      "IEEE Trans. on Information Theory",
+  volume =       "41",
+  number =       "3",
+  pages =        "677--687",
+  year =         "1995",
+}
+
+@Article{nonparametric-SK-96,
+  author =       "M. Smith and R. Kohn",
+  title =        "Nonparametric regression using {Bayesian} variable
+                 selection",
+  journal =      "Journal of Econometrics",
+  volume =       "75",
+  pages =        "317--344",
+  year =         "1996",
+}
+
+@InProceedings{nonparametric-W-91,
+  author =       "H. White",
+  booktitle =    "Proceedings of 23rd Symposium on the Interface,
+                 Computer Science and Statistics",
+  title =        "Nonparametric Estimation of Conditional Quantiles Using Neural Networks",
+  publisher =    "Springer-Verlag",
+  address =      "New York",
+  pages =        "190--199",
+  year =         "1992",
+}
+
+@Article{NordStrom,
+  author =       "T. Nordstrom and B. Svensson",
+  title =        "Using and Designing Massively Parallel Computers for
+                 Artificial Neural Networks",
+  journal =      "Journal of Parallel and Distributed Computing",
+  volume =       "14",
+  number =       "3",
+  pages =        "260--285",
+  year =         "1992",
+  OPTnote =      "",
+}
+
+@Article{Normandin94,
+  author =       "Y. Normandin and R. Cardin and R. {DeMori}",
+  title =        "High-performance connected digit recognition using
+                 maximum mutual information estimation",
+  journal =      "IEEE Transactions on Speech and Audio Processing",
+  volume =       "2",
+  number =       "2",
+  pages =        "299--311",
+  year =         "1994",
+}
+
+@InProceedings{Nowlan-nips90,
+  author =       "S. J. Nowlan",
+  editor =       NIPS2ed,
+  booktitle =    NIPS2,
+  title =        "Maximum Likelihood Competitive Learning",
+  publisher =    "Morgan Kaufmann",
+  address =      "San Mateo, CA",
+  pages =        "574--582",
+  year =         "1990",
+}
+
+@inproceedings{Nowlan-nips92,
+  author    = {S. J. Nowlan and G. E. Hinton},
+  title     = {Adaptive Soft Weight Tying using {G}aussian Mixtures},
+  editor    = NIPS4ed,
+  booktitle = NIPS4,
+  publisher = {Morgan Kaufmann},
+  address   = {San Mateo, CA},
+  pages     = {993--1000},
+  year      = {1992},
+}
+
+@phdthesis{Nowlan-PhD,
+  author  = {S. J. Nowlan},
+  title   = {Soft Competitive Adaptation: Neural Network Learning
+             Algorithms based on Fitting Statistical Mixtures},
+  school  = {School of Computer Science, Carnegie Mellon
+             University},
+  address = {Pittsburgh, PA},
+  type    = {{C}{M}{U}-{C}{S}-91-126},
+  month   = apr # " 14",
+  year    = {1991},
+}
+
+@article{Nowlan88,
+  author  = {S. J. Nowlan},
+  title   = {Gain Variation in Recurrent Error Propagation
+             Networks},
+  journal = cs,
+  year    = {1988},
+  volume  = {2},
+  pages   = {305--320},
+}
+
+@techreport{Nowlan90,
+  author      = {S. J. Nowlan},
+  title       = {Competing Experts: {An} experimental investigation of
+                 associative mixture models},
+  institution = {University of Toronto},
+  type        = {Technical Report},
+  number      = {CRG-TR-90-5},
+  year        = {1990},
+  key         = {Nowlan},
+  annote      = {In CRG Library},
+}
+
+@article{Nowlan92,
+  author  = {S. J. Nowlan and G. E. Hinton},
+  title   = {Simplifying Neural Networks by Soft Weight-Sharing},
+  journal = {Neural Computation},
+  year    = {1992},
+  volume  = {4},
+  number  = {4},
+  pages   = {473--493},
+  type    = {Letter},
+}
+
+@InProceedings{nsvnijcnn,
+  author =       "Pascal Vincent and Yoshua Bengio",
+  booktitle =    ijcnn,
+  title =        "A Neural Support Vector Network Architecture with
+                 Adaptive Kernels",
+  volume =       "5",
+  pages =        "187--192",
+  year =         "2000",
+}
+
+@book{NumOptBook,
+  author    = {J. Nocedal and S. Wright},
+  title     = {Numerical Optimization},
+  year      = {2006},
+  publisher = {Springer},
+}
+
+@Article{Nystrom-1928,
+  author =       "E. J. Nystr{\"o}m",
+  title =        "{\"{U}}ber die praktische {Aufl{\"o}sung} von linearen
+                 {Integralgleichungen} mit {Anwendungen} auf
+                 {Randwertaufgaben} der {Potentialtheorie}",
+  journal =      "Commentationes Physico-Mathematicae",
+  volume =       "4",
+  number =       "15",
+  pages =        "1--52",
+  year =         "1928",
+}
+
+@book{O'Shaughnessy87,
+  author    = {D. O'Shaughnessy},
+  title     = {Speech Communication --- Human and Machine},
+  year      = {1987},
+  publisher = {Addison-Wesley},
+}
+
+@article{Oja82,
+  author  = {E. Oja},
+  title   = {A Simplified Neuron Model As a Principal Component
+             Analyzer},
+  journal = jmathb,
+  year    = {1982},
+  volume  = {15},
+  pages   = {267--273},
+}
+
+@article{Oja85,
+  author  = {E. Oja and J. Karhunen},
+  title   = {On Stochastic Approximation of the Eigenvectors and
+             Eigenvalues of the Expectation of a Random Matrix},
+  journal = jama,
+  year    = {1985},
+  volume  = {106},
+  pages   = {69--84},
+}
+
+@article{Oja89,
+  author  = {E. Oja},
+  title   = {Neural Networks, Principal Components, and Subspaces},
+  journal = {International Journal of Neural Systems},
+  year    = {1989},
+  volume  = {1},
+  pages   = {61--68},
+}
+
+@article{Olshausen+Field-1996,
+  author  = {Bruno A. Olshausen and David J. Field},
+  title   = {Emergence of simple-cell receptive field properties by learning a sparse code for natural images},
+  journal = {Nature},
+  year    = {1996},
+  volume  = {381},
+  pages   = {607--609},
+}
+
+@Article{Olshausen-97,
+  author =       "B. A. Olshausen and D. J. Field",
+  title =        "Sparse coding with an overcomplete basis set: a
+                 strategy employed by {V}1?",
+  journal =      "Vision Research",
+  volume =       "37",
+  pages =        "3311--3325",
+  year =         "1997",
+  url =          {http://view.ncbi.nlm.nih.gov/pubmed/9425546},
+  keywords = {sparse-coding, v1, vision},
+  month = dec,
+}
+
+@article{olshausen:2005,
+    author = {Bruno Olshausen and David J. Field},
+    title = {How Close are We to Understanding {V1}?},
+    journal = {Neural Computation},
+    volume = {17},
+    pages = {1665--1699},
+    year = {2005},
+}
+
+
+@InProceedings{Omlin-ml92,
+  author =       "C. W. Omlin and C. L. Giles",
+  editor =       "D. Sleeman and P. Edwards",
+  booktitle =    "Machine Learning: Proc. of the Ninth Int. Conference",
+  title =        "Training Second-Order Recurrent Neural Networks using
+                 Hints",
+  publisher =    "Morgan Kaufmann",
+  address =      "San Mateo, CA",
+  year =         "1992",
+}
+
+@inproceedings{Omohundro96,
+  author    = {S. Omohundro},
+  title     = {Family Discovery},
+  editor    = NIPS8ed,
+  booktitle = NIPS8,
+  publisher = {MIT Press, Cambridge, MA},
+  year      = {1996},
+}
+
+@inproceedings{Ong-Smola-2003,
+  author    = {C. S. Ong and A. J. Smola},
+  title     = {Machine learning using hyperkernels},
+  editor    = ICML03ed,
+  booktitle = ICML03,
+  publisher = ICML03publ,
+  year      = {2003},
+}
+
+@article{Opper90,
+  author  = {M. Opper and W. Kinzel and J. Kleinz and R. Nehl},
+  title   = {On the Ability of the Optimal Perceptron to
+             Generalize},
+  journal = jpa,
+  year    = {1990},
+  volume  = {23},
+  pages   = {L581--L586},
+}
+
+@article{Orland85,
+  author  = {H. Orland},
+  title   = {Mean-Field Theory for Optimization Problems},
+  journal = jppl,
+  year    = {1985},
+  volume  = {46},
+  pages   = {763--770},
+}
+
+@inproceedings{ormo-nips99,
+  author    = {D. Ormoneit and T. Hastie},
+  title     = {Optimal Kernel Shapes for Local Linear Regression},
+  editor    = NIPS12ed,
+  booktitle = NIPS12,
+  publisher = {MIT Press},
+  year      = {2000},
+}
+
+@Article{Orponen94,
+  author =       "Pekka Orponen",
+  title =        "Computational complexity of neural networks: a
+                 survey",
+  journal =      "Nordic Journal of Computing",
+  volume =       "1",
+  number =       "1",
+  pages =        "94--110",
+  month =        "Spring",
+  year =         "1994",
+  URL =          "http://citeseer.ist.psu.edu/article/orponen95computational.html",
+}
+
+@Book{Ortega70,
+  author =       "J. M. Ortega and W. C. Rheinboldt",
+  title =        "Iterative Solution of Nonlinear Equations in Several
+                 Variables",
+  publisher =    "Academic Press",
+  address =      "New York",
+  year =         "1970",
+  OPTnote =      "",
+}
+
+@Book{Ortega70a,
+  author =       "J. M. Ortega and W. C. Rheinboldt",
+  title =        "Iterative Solution of Nonlinear Equations in Several
+                 Variables",
+  publisher =    "Academic Press",
+  address =      "New York",
+  year =         "1970",
+}
+
+@InProceedings{Osindero+Hinton-2008,
+  author =       "Simon Osindero and Geoffrey E. Hinton",
+  editor =       NIPS20ed,
+  booktitle =    NIPS20,
+  title =        {Modeling image patches with a directed hierarchy of
+                 {Markov} random fields},
+  publisher =    {MIT Press},
+  address =      {Cambridge, MA},
+  pages =        {1121--1128},
+  year =         "2008",
+}
+
+@InProceedings{Osindero+Hinton-2008-small,
+  author =       "S. Osindero and G. Hinton",
+  booktitle =    "NIPS 20",
+  title =        {Modeling image patches with a directed hierarchy of
+                 {Markov} random fields},
+  year =         "2008",
+}
+
+@Article{Osindero+Welling+Hinton-05,
+  author =       "Simon Osindero and Max Welling and Geoffrey E. Hinton",
+  title =        "Topographic Product Models Applied To Natural Scene
+                 Statistics",
+  journal =      "Neural Computation",
+  volume =       "18",
+  pages =        "381--414",
+  year =         "2006",
+}
+
+@article{OsinderoS2006,
+  author    = {Simon Osindero and Max Welling and Geoffrey E.
+               Hinton},
+  title     = {Topographic Product Models Applied to Natural Scene
+               Statistics},
+  journal   = {Neural Computation},
+  year      = {2006},
+  volume    = {18},
+  number    = {2},
+  pages     = {381--414},
+  publisher = {MIT Press},
+  address   = {Cambridge, MA, USA},
+  ISSN      = {0899-7667},
+}
+
+@article{OsinderoS2006-small,
+  author  = {Simon Osindero and Max Welling and Geoffrey E. Hinton},
+  title   = {Topographic Product Models Applied to Natural Scene
+             Statistics},
+  journal = {Neural Computation},
+  year    = {2006},
+  volume  = {18},
+  number  = {2},
+  pages   = {381--414},
+}
+
+@inproceedings{Ott76,
+  author    = {R. Ott},
+  title     = {Construction of quadratic polynomial classifiers},
+  booktitle = {Third International Joint Conference on Pattern
+               Recognition},
+  publisher = {IEEE, CA},
+  address   = {Coronado, CA},
+  pages     = {161--165},
+  year      = {1976},
+}
+
+@article{OttJ1976b,
+     title = {Some Classification Procedures for Multivariate Binary Data Using Orthogonal Functions},
+     author = {Ott, Jurg and Kronmal, Richard A.},
+     journal = {Journal of the American Statistical Association},
+     volume = {71},
+     number = {354},
+     pages = {391--399},
+     year = {1976},
+     publisher = {American Statistical Association},
+     copyright = {Copyright {\copyright} 1976 American Statistical Association},
+    }
+
+
+@InProceedings{Ouimet+Bengio-2005,
+  author =       "Marie Ouimet and Yoshua Bengio",
+  editor =       aistats05ed,
+  booktitle =    aistats05,
+  title =        "Greedy Spectral Embedding",
+  publisher =    "Society for Artificial Intelligence and Statistics",
+  date =         "Jan 6-8, 2005",
+  location =     "Savannah Hotel, Barbados",
+  pages =        "253--260",
+  year =         "2005",
+}
+
+@inproceedings{Owens89,
+  author    = {A. J. Owens and D. L. Filkin},
+  title     = {Efficient Training of the Back Propagation Network by
+               Solving a System of Stiff Ordinary Differential
+               Equations},
+  booktitle = ijcnn,
+  volume    = {2},
+  publisher = {IEEE, New York},
+  address   = {Washington 1989},
+  pages     = {381--386},
+  year      = {1989},
+}
+
+@inproceedings{Paccanaro2000,
+  author    = {A. Paccanaro and G. E. Hinton},
+  title     = {Extracting Distributed Representations of Concepts and
+               Relations from Positive and Negative Propositions},
+  booktitle = ijcnn,
+  publisher = {IEEE, New York},
+  address   = {Como, Italy},
+  year      = {2000},
+}
+
+@Article{Packard80,
+  author =       "N. H. Packard and J. P. Crutchfield and J. D. Farmer
+                 and R. S. Shaw",
+  title =        "Geometry from a Time Series",
+  journal =      prl,
+  volume =       "45",
+  pages =        "712--716",
+  year =         "1980",
+}
+
+@misc{Pal+al-2006,
+  author = "Chris Pal and Michael Kelm and Xuerui Wang and Greg Druck and Andrew McCallum",
+  title  = "On Discriminative and Semi-Supervised Dimensionality Reduction",
+  note   = "Workshop on Novel Applications of Dimensionality Reduction, NIPS'06",
+  year   = "2006",
+}
+
+@incollection{Palmer88,
+  author    = {R. G. Palmer},
+  title     = {Statistical Mechanics Approaches to Complex
+               Optimization Problems},
+  editor    = {P. W. Anderson and K. J. Arrow and D. Pines},
+  booktitle = {The Economy As an Evolving Complex System},
+  series    = {SFI Studies in the Sciences of Complexity:
+               Proceedings},
+  volume    = {5},
+  publisher = {Addison-Wesley},
+  address   = {Redwood City},
+  pages     = {177--193},
+  year      = {1988},
+}
+
+@incollection{Palmer89,
+  author    = {R. G. Palmer},
+  title     = {Neural Nets},
+  editor    = {D. L. Stein},
+  booktitle = {Lectures in the Sciences of Complexity},
+  series    = {SFI Studies in the Sciences of Complexity: Lectures},
+  volume    = {1},
+  publisher = {Addison-Wesley},
+  address   = {Redwood City},
+  pages     = {439--461},
+  year      = {1989},
+}
+
+@Book{Papadimitriou,
+  author =       "C. H. Papadimitriou and K. Steiglitz",
+  title =        "Combinatorial Optimization: Algorithms and
+                 Complexity",
+  publisher =    "Prentice-Hall",
+  address =      "Englewood Cliffs, NJ",
+  year =         "1982",
+}
+
+@book{Papadimitriou82,
+  author    = {C. H. Papadimitriou and K. Steiglitz},
+  title     = {Combinatorial Optimization: Algorithms and
+               Complexity},
+  year      = {1982},
+  publisher = {Prentice-Hall},
+  address   = {Englewood Cliffs},
+}
+
+@article{Parga86,
+  author  = {N. Parga and M. A. Virasoro},
+  title   = {The Ultrametric Organization of Memories in a Neural
+             Network},
+  journal = jpp,
+  year    = {1986},
+  volume  = {47},
+  pages   = {1857--1864},
+}
+
+@article{Parisi86,
+  author  = {G. Parisi},
+  title   = {Asymmetric Neural Networks and the Process of
+             Learning},
+  journal = jpa,
+  year    = {1986},
+  volume  = {19},
+  pages   = {L675--L680},
+}
+
+@book{Parisi88,
+  author    = {G. Parisi},
+  title     = {Statistical Field Theory},
+  year      = {1988},
+  publisher = {Addison-Wesley},
+  address   = {Redwood City, CA},
+}
+
+@article{Park-nc91,
+  author  = {J. Park and I. W. Sandberg},
+  title   = {Universal Approximation Using Radial-Basis-Function
+             Networks},
+  journal = nc,
+  year    = {1991},
+  volume  = {3},
+  number  = {2},
+  pages   = {246--257},
+}
+
+@TechReport{Parker85,
+  author =       "D. B. Parker",
+  title =        "Learning Logic",
+  number =       "TR-47",
+  institution =  "Center for Computational Research in Economics and
+                 Management Science, Massachusetts Institute of
+                 Technology",
+  address =      "Cambridge, MA",
+  year =         "1985",
+}
+
+@inproceedings{Parker87,
+  author    = {D. B. Parker},
+  title     = {Optimal Algorithms for Adaptive Networks: Second Order
+               Back Propagation, Second Order Direct Propagation, and
+               Second Order Hebbian Learning},
+  editor    = {M. Caudill and C. Butler},
+  booktitle = icnn,
+  volume    = {2},
+  publisher = {IEEE, New York},
+  address   = {San Diego 1987},
+  pages     = {593--600},
+  year      = {1987},
+}
+
+@inproceedings{Parks87,
+  author    = {M. Parks},
+  title     = {Characterization of the {Boltzmann} Machine Learning
+               Rate},
+  editor    = {M. Caudill and C. Butler},
+  booktitle = icnn,
+  volume    = {3},
+  publisher = {IEEE, New York},
+  address   = {San Diego 1987},
+  pages     = {715--719},
+  year      = {1987},
+}
+
+@article{Parlos94,
+  author  = {A. G. Parlos and J. Muthusami and A. F. Atiya},
+  title   = {Incipient Fault Detection and Identification in
+             Process Systems using Accelerated Neural Network
+             Learning},
+  journal = {Nuclear Technology},
+  year    = {1994},
+  volume  = {105},
+  pages   = {145},
+}
+
+@Article{Parzen62,
+  author =       "Emanuel Parzen",
+  title =        "On Estimation of a Probability Density Function
+                 and Mode",
+  journal =      "Annals of Mathematical Statistics",
+  volume =       "33",
+  pages =        "1065--1076",
+  year =         "1962",
+}
+
+@InProceedings{pati93orthogonal,
+  author =       "Y. Pati and R. Rezaiifar and P. Krishnaprasad",
+  booktitle =    "Proceedings of the 27th Annual Asilomar Conference on
+                 Signals, Systems, and Computers",
+  title =        "Orthogonal Matching Pursuit: Recursive Function
+                 Approximation with Applications to Wavelet
+                 Decomposition",
+  pages =        "40--44",
+  month =        nov,
+  year =         "1993",
+}
+
+@InProceedings{Paugam-Moisy-1992,
+  author =       "H{\'e}l{\`e}ne {Paugam-Moisy}",
+  booktitle =    ijcnn,
+  title =        "On the Convergence of a Block-Gradient Algorithm for
+                 Back-Propagation Learning",
+  volume =       "3",
+  publisher =    "IEEE",
+  address =      "New York",
+  pages =        "919--924",
+  year =         "1992",
+}
+
+@InProceedings{Paugam-Moisy-1992b,
+  author =       "H{\'e}l{\`e}ne {Paugam-Moisy}",
+  booktitle =    "CONPAR '92/ VAPP V: Proceedings of the Second Joint
+                 International Conference on Vector and Parallel
+                 Processing",
+  title =        "Optimal Speedup Conditions for a Parallel
+                 Back-Propagation Algorithm",
+  publisher =    "Springer-Verlag",
+  address =      "London, UK",
+  pages =        "719--724",
+  year =         "1992",
+  ISBN =         "3-540-55895-0",
+}
+
+@InCollection{Paugam-Moisy-1993,
+  author =       "H{\'e}l{\`e}ne {Paugam-Moisy}",
+  editor =       "I. Pitas",
+  booktitle =    "Parallel Algorithms for Digital Image Processing,
+                 Computer Vision and Neural Networks",
+  title =        "Parallel Neural Computing Based on Network
+                 Duplicating",
+  publisher =    "John Wiley",
+  pages =        "305--340",
+  year =         "1993",
+}
+
+@InProceedings{Pavlovic-2001,
+  author    = {Vladimir Pavlovic and James M. Rehg and John MacCormick},
+  title     = {Learning Switching Linear Models of Human Motion},
+  booktitle = NIPS13,
+  editor    = NIPS13ed,
+  year      = {2001},
+  pages     = {981--987},
+  publisher = {{MIT} Press},
+}
+ 
+
+@Book{PdpManual,
+  author =       "D. E. Rumelhart and J. L. McClelland",
+  title =        "Explorations in Parallel Distributed Processing",
+  volume =       "3",
+  publisher =    "MIT Press",
+  year =         "1988",
+}
+
+@InProceedings{Pearl-Verma91,
+  author =       "J. Pearl and T. S. Verma",
+  editor =       "J. A. Allen and R. Fikes and E. Sandewall",
+  booktitle =    "Principles of Knowledge Representation and Reasoning:
+                 Proceedings of the Second International Conference",
+  title =        "A theory of inferred causation",
+  publisher =    "Morgan Kaufmann, San Mateo, CA",
+  pages =        "441--452",
+  year =         "1991",
+}
+
+@book{Pearl88,
+  author    = {Judea Pearl},
+  title     = {Probabilistic Reasoning in Intelligent Systems:
+               Networks of Plausible Inference},
+  year      = {1988},
+  publisher = {Morgan Kaufmann},
+}
+
+@inproceedings{Pearlmutter+Parra-96,
+  author    = {Barak Pearlmutter and L. C. Parra},
+  title     = {A context-sensitive generalization of {ICA}},
+  editor    = {L. Xu},
+  booktitle = {International Conference On Neural Information
+               Processing},
+  address   = {Hong-Kong},
+  pages     = {151--157},
+  year      = {1996},
+}
+
+@inproceedings{Pearlmutter86,
+  author    = {B. A. Pearlmutter and G. E. Hinton},
+  title     = {{G}-Maximization: An Unsupervised Learning Procedure
+               for Discovering Regularities},
+  editor    = {J. S. Denker},
+  booktitle = snowbird,
+  publisher = {American Institute of Physics, New York},
+  address   = {Snowbird 1986},
+  pages     = {333--338},
+  year      = {1986},
+}
+
+@inproceedings{Pearlmutter89a,
+  author    = {B. A. Pearlmutter},
+  title     = {Learning State Space Trajectories in Recurrent Neural
+               Networks},
+  booktitle = ijcnn,
+  volume    = {2},
+  publisher = {IEEE, New York},
+  address   = {Washington 1989},
+  pages     = {365--372},
+  year      = {1989},
+}
+
+@article{Pearlmutter89b,
+  author  = {B. A. Pearlmutter},
+  title   = {Learning State Space Trajectories in Recurrent Neural
+             Networks},
+  journal = nc,
+  year    = {1989},
+  volume  = {1},
+  pages   = {263--269},
+}
+
+@Article{Pearson-1901,
+  author               = "Pearson, K. ",
+  title                = "On lines and planes of closest fit to systems of points in space",
+  journal              = "Philosophical Magazine",
+  year                 = "1901",
+  volume               = "2",
+  number               = "6",
+  pages                = "559--572",
+  keywords             = "pca",
+  citeulike-article-id = "2013414",
+  posted-at            = "2007-11-29 10:41:36",
+  priority             = "2"
+}
+
+@InProceedings{Pedersen2001,
+  author =       "Ted Pedersen",
+  booktitle =    "Proceedings of the Second Annual Meeting of the North
+                 American Chapter of the Association for Computational
+                 Linguistics",
+  title =        "A decision tree of bigrams is an accurate predictor of
+                 word sense",
+  pages =        "79--86",
+  year =         "2001",
+  URL =          "http://citeseer.nj.nec.com/pedersen01decision.html",
+}
+
+@inproceedings{Peeling86,
+  author    = {S. M. Peeling and R. K. Moore and M. J. Tomlinson},
+  title     = {The Multi-Layer Perceptron as a Tool for Speech
+               Pattern Processing Research},
+  booktitle = {Proceedings of the 10th Autumn Conference on Speech
+               and Hearing},
+  year      = {1986},
+}
+
+@inproceedings{peng04accurate,
+  author    = {F. Peng and A. McCallum},
+  title     = {Accurate information extraction from research papers
+               using conditional random fields},
+  booktitle = {Proceedings of Human Language Technology Conference /
+               North American Chapter of the Association for
+               Computational Linguistics annual meeting},
+  pages     = {329--336},
+  year      = {2004},
+}
+
+@inproceedings{Pennacchiotti+Pantel-2006,
+  author    = {Marco Pennacchiotti and Patrick Pantel},
+  title     = {Ontologizing Semantic Relations},
+  booktitle = {Proceedings of the 21st International Conference on
+               Computational Linguistics and 44th Annual Meeting of
+               the ACL},
+  address   = {Sydney},
+  pages     = {793--800},
+  year      = {2006},
+}
+
+@Article{Penrose55,
+  author =       "R. Penrose",
+  title =        "A generalized inverse for matrices",
+  journal =      "Proc. Cambridge Philos. Soc.",
+  volume =       "51",
+  pages =        "406--413",
+  year =         "1955",
+}
+
+@InProceedings{Pereira93,
+  author =       "F. Pereira and N. Tishby and L. Lee",
+  booktitle =    "31st Annual Meeting of the Association for
+                 Computational Linguistics",
+  title =        "Distributional Clustering of English Words",
+  address =      "Columbus, Ohio",
+  pages =        "183--190",
+  year =         "1993",
+}
+
+@inproceedings{Pereira94,
+  author    = {F. Pereira and M. Riley and R. Sproat},
+  title     = {Weighted rational transductions and their application
+               to human language processing},
+  booktitle = {ARPA Natural Language Processing Workshop},
+  year      = {1994},
+}
+
+@InCollection{Pereira97,
+  author =       "F. C. N. Pereira and M. D. Riley",
+  editor =       "Emmanuel Roche and Yves Schabes",
+  booktitle =    "Finite-State Language Processing",
+  title =        "Speech recognition by composition of weighted finite
+                 automata",
+  publisher =    "MIT Press, Cambridge, Massachusetts",
+  pages =        "431--453",
+  year =         "1997",
+}
+
+@article{Peretto84,
+  author  = {P. Peretto},
+  title   = {Collective Properties of Neural Networks: {A}
+             Statistical Physics Approach},
+  journal = biocyb,
+  year    = {1984},
+  volume  = {50},
+  pages   = {51--62},
+}
+
+@inproceedings{Peretto86,
+  author    = {P. Peretto and J. J. Niez},
+  title     = {Collective Properties of Neural Networks},
+  editor    = {E. Bienenstock and F. Fogelman-Souli\'e and G.
+               Weisbuch},
+  booktitle = {Disordered Systems and Biological Organization},
+  publisher = {Springer-Verlag, Berlin},
+  address   = {Les Houches 1985},
+  pages     = {171--185},
+  year      = {1986},
+}
+
+@article{Peretto88,
+  author  = {P. Peretto},
+  title   = {On Learning Rules and Memory Storage Abilities of
+             Asymmetrical Neural Networks},
+  journal = jpp,
+  year    = {1988},
+  volume  = {49},
+  pages   = {711--726},
+}
+
+@inproceedings{Perez+Rendell-1996,
+  author    = {Eduardo P\'erez and Larry A. Rendell},
+  title     = {Learning Despite Concept Variation by Finding
+               Structure in Attribute-based Data},
+  editor    = ICML96ed,
+  booktitle = ICML96,
+  publisher = ICML96publ,
+  pages     = {391--399},
+  year      = {1996},
+}
+
+@article{Perez75,
+  author  = {R. P\'erez and L. Glass and R. Shlaer},
+  title   = {Development of Specificity in the Cat Visual Cortex},
+  journal = jmathb,
+  year    = {1975},
+  volume  = {1},
+  pages   = {275--288},
+}
+
+@misc{Perez98markovrandom,
+  title  = "Markov Random Fields and Images",
+  author = "Patrick Perez",
+  year   = "1998"
+}
+
+@article{PerpinanM2000,
+ author = {Miguel {\'A}. Carreira-Perpi{\~n}{\'a}n and Steve Renals},
+ title = {Practical Identifiability of Finite Mixtures of Multivariate Bernoulli Distributions},
+ journal = {Neural Computation},
+ volume = {12},
+ number = {1},
+ year = {2000},
+ pages = {141--152},
+ publisher = {MIT Press},
+ address = {Cambridge, MA, USA},
+ }
+
+@InProceedings{Perpinan+Hinton-2005,
+  author =       "Miguel {\'A}. Carreira-Perpi{\~n}{\'a}n and Geoffrey E. Hinton",
+  editor =       aistats05ed,
+  booktitle =    aistats05,
+  title =        "On Contrastive Divergence Learning",
+  publisher =    "Society for Artificial Intelligence and Statistics",
+  date =         "Jan 6-8, 2005",
+  location =     "Savannah Hotel, Barbados",
+  pages =        "33--40",
+  year =         "2005",
+}
+
+@article{Personnaz85,
+  author  = {L. Personnaz and I. Guyon and G. Dreyfus},
+  title   = {Information Storage and Retrieval in Spin-Glass-Like
+             Neural Networks},
+  journal = jppl,
+  year    = {1985},
+  volume  = {46},
+  pages   = {359--365},
+}
+
+@article{Personnaz86,
+  author  = {L. Personnaz and I. Guyon and G. Dreyfus},
+  title   = {Collective Computational Properties of Neural
+             Networks: New Learning Mechanisms},
+  journal = prA,
+  year    = {1986},
+  volume  = {34},
+  pages   = {4217--4228},
+}
+
+@article{Peterson2004,
+  author  = {Gail B. Peterson},
+  title   = {A day of great illumination: {B. F.} {Skinner}'s
+             discovery of shaping},
+  journal = {Journal of the Experimental Analysis of Behavior},
+  year    = {2004},
+  volume  = {82},
+  number  = {3},
+  pages   = {317--328},
+}
+
+@article{Peterson87,
+  author  = {C. Peterson and J. R. Anderson},
+  title   = {A Mean Field Theory Learning Algorithm for Neural
+             Networks},
+  journal = cs,
+  year    = {1987},
+  volume  = {1},
+  pages   = {995--1019},
+}
+
+@article{Peterson89,
+  author  = {C. Peterson and B. S{\"o}derberg},
+  title   = {A New Method for Mapping Optimization Problems onto
+             Neural Networks},
+  journal = ijns,
+  year    = {1989},
+  volume  = {1},
+  pages   = {3--22},
+}
+
+@article{Peterson90,
+  author  = {C. Peterson and S. Redfield and J. D. Keeler and E.
+             Hartman},
+  title   = {An Optoelectronic Architecture for Multilayer Learning
+             in a Single Photorefractive Crystal},
+  journal = nc,
+  year    = {1990},
+  volume  = {2},
+  pages   = {25--34},
+}
+
+@PhdThesis{PhD:Perrone,
+  author =       "Michael P. Perrone",
+  title =        "Improving Regression Estimation: Averaging Methods for
+                 Variance Reduction with Extensions to General Convex
+                 Measure Optimization",
+  school =       "Brown University, Institute for Brain and Neural
+                 Systems",
+  month =        may,
+  year =         "1993",
+}
+
+@Book{Piaget1952,
+  author =       "J. Piaget",
+  title =        "The origins of intelligence in children",
+  publisher =    "International Universities Press",
+  address =      "New York",
+  year =         "1952",
+}
+
+@article{Pineda87,
+  author  = {F. J. Pineda},
+  title   = {Generalization of Back-Propagation to Recurrent Neural
+             Networks},
+  journal = prl,
+  year    = {1987},
+  volume  = {59},
+  pages   = {2229--2232},
+}
+
+@article{Pineda88,
+  author  = {F. J. Pineda},
+  title   = {Dynamics and Architecture for Neural Computation},
+  journal = jcomp,
+  year    = {1988},
+  volume  = {4},
+  pages   = {216--245},
+}
+
+@inproceedings{Pineda88-nips,
+  author       = {F. Pineda},
+  title        = {Generalization of Backpropagation to Recurrent and
+                  Higher Order Neural Networks},
+  editor       = nips87ed,
+  booktitle    = nips87,
+  organization = {American Institute of Physics},
+  address      = {New York, NY},
+  pages        = {602--611},
+  year         = {1988},
+}
+
+@article{Pineda89,
+  author  = {F. J. Pineda},
+  title   = {Recurrent Back-Propagation and the Dynamical Approach
+             to Adaptive Neural Computation},
+  journal = nc,
+  year    = {1989},
+  volume  = {1},
+  pages   = {161--172},
+}
+
+@incollection{PINN,
+  author    = {P. Frasconi and M. Gori and A. Tesi},
+  title     = {Successes and Failures of Backpropagation: {A}
+               Theoretical Investigation},
+  editor    = {O. Omidvar},
+  booktitle = {Progress in Neural Networks},
+  volume    = {5},
+  publisher = {Ablex Publishing},
+  year      = {1993},
+}
+
+@article{Pinto08,
+  author = {Pinto, Nicolas AND Cox, David D AND DiCarlo, James J},
+  journal = {PLoS Comput Biol},
+  publisher = {Public Library of Science},
+  title = {Why is Real-World Visual Object Recognition Hard?},
+  year = {2008},
+  month = jan,
+  volume = {4},
+}        
+
+@inproceedings{Pinto-DiCarlo-2008,
+ author = {Nicolas Pinto and James {DiCarlo} and David Cox},
+ title = {Establishing Good Benchmarks and Baselines for Face Recognition},
+ booktitle = {ECCV 2008 Faces in 'Real-Life' Images Workshop},
+ year = 2008,
+address={{M}arseille, {F}rance},
+organization={{E}rik {L}earned-{M}iller and {A}ndras {F}erencz and {F}r{\'e}d{\'e}ric {J}urie },
+audience={internationale },
+URL={http://hal.inria.fr/inria-00326732/en/},
+}
+
+@article{Pinto-2009,
+  author = {Pinto, Nicolas AND Doukhan, David AND DiCarlo, James J. AND Cox, David D.},
+  journal = {PLoS Comput Biol},
+  publisher = {Public Library of Science},
+  title = {A High-Throughput Screening Approach to Discovering Good Forms of Biologically Inspired Visual Representation},
+  year = {2009},
+  month = nov,
+  volume = {5},
+  pages = {e1000579},
+  number = {11},
+}        
+
+@InCollection{Platt2000,
+  author =       "J. Platt",
+  editor =       "A. Smola and P. Bartlett and B. Sch{\"o}lkopf and D.
+                 Schuurmans",
+  booktitle =    "Advances in Large Margin Classifiers",
+  title =        "Probabilities for support vector machines",
+  publisher =    "MIT Press",
+  year =         "2000",
+}
+
+@Article{Platt91,
+  author =       "J. Platt",
+  title =        "A Resource-Allocating Network for Function
+                 Interpolation",
+  journal =      "Neural Computation",
+  volume =       "3",
+  type =         "Letter",
+  number =       "2",
+  pages =        "213--225",
+  year =         "1991",
+}
+
+@InProceedings{Platt94,
+  author =       "R. Wolf and J. Platt",
+  editor =       NIPS6ed,
+  booktitle =    NIPS6,
+  title =        "Postal address block location using a convolutional
+                 locator network",
+  pages =        "745--752",
+  year =         "1994",
+}
+
+@Article{Plaut-csl87,
+  author =       "D. C. Plaut and G. E. Hinton",
+  title =        "Learning Sets of Filters Using Back-propagation",
+  journal =      cspla,
+  volume =       "2",
+  pages =        "35--61",
+  year =         "1987",
+}
+
+@TechReport{Plaut86,
+  author =       "D. Plaut and S. Nowlan and G. Hinton",
+  title =        "Experiments on Learning by Back-Propagation",
+  number =       "CMU-CS-86-126",
+  institution =  "Department of Computer Science, Carnegie Mellon
+                 University",
+  address =      "Pittsburgh, PA",
+  year =         "1986",
+}
+
+@Article{PLS-Frank-Friedman,
+  author =       "Ildiko E. Frank and Jerome H. Friedman",
+  title =        "A statistical view of some chemometrics regression
+                 tools",
+  journal =      "Technometrics",
+  volume =       "35",
+  number =       "2",
+  pages =        "109--148",
+  year =         "1993",
+}
+
+@Article{Podder-2006,
+  author =       "M. Podder and W. J. Welch and R. H. Zamar and S. J.
+                 Tebbutt",
+  title =        "Dynamic Variable Selection in {SNP} Genotype
+                 Autocalling from {APEX} Microarray Data",
+  journal =      "In revision for BMC Bioinformatics",
+  year =         "2006",
+}
+
+@Article{Poggio-ieee90,
+  author =       "T. Poggio and F. Girosi",
+  title =        "Networks for Approximation and Learning",
+  journal =      ieeeproc,
+  volume =       "78",
+  number =       "9",
+  pages =        "1481--1497",
+  year =         "1990",
+}
+
+@Article{Poggio75,
+  author =       "T. Poggio",
+  title =        "On Optimal NonLinear Associative Recall",
+  journal =      biocyb,
+  volume =       "19",
+  pages =        "201--209",
+  year =         "1975",
+}
+
+@Article{Poggio85,
+  author =       "T. Poggio and V. Torre and C. Koch",
+  title =        "Computational Vision and Regularization Theory",
+  journal =      "Nature",
+  volume =       "317",
+  number =       "6035",
+  pages =        "314--319",
+  year =         "1985",
+}
+
+@TechReport{Poggio89,
+  author =       "T. Poggio and F. Girosi",
+  title =        "A theory of networks for approximation and learning",
+  number =       "1140",
+  institution =  "MIT AI Laboratory",
+  address =      "Cambridge, MA",
+  year =         "1989",
+}
+
+@Article{Poggio90,
+  author =       "T. Poggio and F. Girosi",
+  title =        "Regularization Algorithms for Learning That Are
+                 Equivalent to Multilayer Networks",
+  journal =      science,
+  volume =       "247",
+  pages =        "978--982",
+  year =         "1990",
+}
+
+@Article{Pollack90,
+  author =       "Jordan B. Pollack",
+  title =        "Recursive Distributed Representations",
+  journal =      "Artificial Intelligence",
+  volume =       "46",
+  number =       "1",
+  pages =        "77--105",
+  year =         "1990",
+}
+
+@Article{Pollack91,
+  author =       "Jordan B. Pollack",
+  title =        "The Induction of Dynamical Recognizers",
+  journal =      mlearn,
+  volume =       "7",
+  number =       "2",
+  pages =        "227--252",
+  year =         "1991",
+}
+
+@Book{Pollard84,
+  author =       "D. Pollard",
+  title =        "Convergence of stochastic processes",
+  publisher =    "Springer-Verlag",
+  address =      "New York, NY",
+  year =         "1984",
+}
+
+@InProceedings{Pollit91,
+  author =       "M. D. Pollit and J. Peck",
+  booktitle =    "Proc. 2nd Canadian Conf. on Computer Applications in
+                 the Mineral Industry",
+  title =        "Recent advances in lithological recognition based on
+                 rotary blasthole drill responses",
+  address =      "Vancouver, Canada",
+  year =         "1991",
+}
+
+@InProceedings{Pomerleau89,
+  author =       "D. A. Pomerleau",
+  editor =       NIPS1ed,
+  booktitle =    NIPS1,
+  title =        "{ALVINN}: An Autonomous Land Vehicle in a Neural
+                 Network",
+  publisher =    "Morgan Kaufmann, San Mateo",
+  address =      "Denver, CO",
+  pages =        "305--313",
+  year =         "1989",
+}
+
+@TechReport{Pontil98,
+  author =       "M. Pontil and A. Verri",
+  title =        "Properties of Support Vector Machines",
+  number =       "AI Memo 1612",
+  institution =  "MIT",
+  year =         "1998",
+}
+
+@InProceedings{Poritz88,
+  author =       "A. B. Poritz",
+  booktitle =    "Proc. Int. Conf. Acoustics, Speech, and Signal
+                 Processing",
+  title =        "Hidden {Markov} models: a guided tour",
+  pages =        "7--13",
+  year =         "1988",
+}
+
+@InProceedings{Poston,
+  author =       "T. Poston and C. Lee and Y. Choie and Y. Kwon",
+  booktitle =    "Proc. of the IEEE-IJCNN91",
+  title =        "Local minima and Backpropagation",
+  address =      "Seattle, WA",
+  pages =        "173--176",
+  year =         "1991",
+}
+
+@InProceedings{Poston-ijcnn91,
+  author =       "T. Poston and C. Lee and Y. Choie and Y. Kwon",
+  booktitle =    ijcnn,
+  title =        "Local Minima and Backpropagation",
+  publisher =    "IEEE Press",
+  address =      "Seattle, WA",
+  pages =        "173--176",
+  year =         "1991",
+}
+
+@Article{Poterba+Summers,
+  author =       "J. M. Poterba and L. H. Summers",
+  title =        "Mean Reversion in Stock Prices",
+  journal =      "Journal of Financial Economics",
+  volume =       "22",
+  pages =        "27--59",
+  year =         "1988",
+}
+
+@Article{potvin:1995:orsajc,
+  author =       "J.-Y. Potvin and S. Bengio",
+  title =        "The Vehicle Routing Problem with Time Windows - Part
+                 {II}: Genetic Search",
+  journal =      "{ORSA} Journal on Computing",
+  year =         "1995",
+}
+
+@Misc{powell87radial,
+  author =       "M. Powell",
+  title =        "Radial basis functions for multivariable
+                 interpolation: {A} review",
+  year =         "1987",
+  text =         "M. J. D. Powell. Radial basis functions for
+                 multivariable interpolation: A review. In J. C. Mason
+                 and M. G. Cox, editors, Algorithms for Approximation of
+                 Functions and Data, pages 143--167. Oxford University
+                 Press, 1987.",
+}
+
+@InProceedings{Pratt+Kamm91,
+  author =       "L. Y. Pratt and C. A. Kamm",
+  booktitle =    ijcnn,
+  title =        "Improving a phoneme classification neural network
+                 through problem decomposition",
+  volume =       "2",
+  publisher =    "IEEE Press",
+  address =      "Seattle, WA",
+  pages =        "821--826",
+  year =         "1991",
+}
+
+@InProceedings{pratt93,
+  author =       "Lorien Y. Pratt",
+  editor =       NIPS5ed,
+  booktitle =    NIPS5,
+  title =        "Discriminability-Based Transfer between Neural
+                 Networks",
+  publisher =    "Morgan Kaufmann",
+  address =      "San Mateo, CA",
+  pages =        "204--211",
+  year =         "1993",
+}
+
+@Article{Presnell93,
+  author =       "S. R. Presnell and F. E. Cohen",
+  title =        "Artificial neural networks for pattern recognition in
+                 biochemical sequences",
+  journal =      "Ann. Rev. Biophys. Biomol. Struct.",
+  volume =       "22",
+  pages =        "283--298",
+  year =         "1993",
+}
+
+@Book{Press86,
+  author =       "W. H. Press and B. P. Flannery and S. A. Teukolsky and
+                 W. T. Vetterling",
+  title =        "Numerical Recipes",
+  publisher =    "Cambridge University Press",
+  address =      "Cambridge",
+  year =         "1986",
+}
+
+@Book{Press92,
+  author =       "W. H. Press and S. A. Teukolsky and W. T. Vetterling
+                 and B. P. Flannery",
+  title =        "Numerical Recipes in {C}: The art of scientific
+                 computing (2nd ed.)",
+  publisher =    "Cambridge University Press",
+  address =      "Cambridge",
+  year =         "1992",
+}
+
+@article{Priebe2005,
+ author = {C. E. Priebe and J. M. Conroy and D. J. Marchette and Y. Park},
+ title = {Scan Statistics on Enron Graphs},
+ journal = {Computational and Mathematical Organization Theory},
+ volume = 11,
+ number = 3,
+ pages = {229--247},
+ month = oct,
+ year = 2005,
+ publisher = {Springer},
+}
+
+@Book{Priestley81,
+  author =       "M. B. Priestley",
+  title =        "Spectral Analysis and Time Series, Vol.1: Univariate
+                 Series",
+  publisher =    "Academic Press",
+  year =         "1981",
+}
+
+@Article{Principe92,
+  author =       "B. {de Vries} and J. C. Principe",
+  title =        "The gamma model -- {A} new neural net model for
+                 temporal processing",
+  journal =      nn,
+  volume =       "5",
+  pages =        "565--576",
+  year =         "1992",
+  OPTnote =      "",
+}
+
+@Article{Psa88a,
+  author =       "D. Psaltis and C. H. Park and J. Hong",
+  title =        "Higher Order Associative Memories and Their Optical
+                 Implementations",
+  journal =      "Neural Networks",
+  volume =       "1",
+  number =       "2",
+  pages =        "149--163",
+  year =         "1988",
+}
+
+@InProceedings{Psaltis89,
+  author =       "D. Psaltis and D. Brady and K. Hsu",
+  booktitle =    ijcnn,
+  title =        "Learning in optical neural computers",
+  volume =       "2",
+  address =      "Washington D.C.",
+  pages =        "72--75",
+  year =         "1989",
+}
+
+@TechReport{publication-an,
+  author =       "Tomaso Poggio and Federico Girosi",
+  title =        "An Equivalence Between Sparse Approximation and
+                 Support Vector Machines",
+}
+
+@TechReport{publication-notes,
+  author =       "Tomaso Poggio and Federico Girosi",
+  title =        "Notes on {PCA}, Regularization, Sparsity and Support
+                 Vector Machines",
+}
+
+@Article{Qian+Sejnowski88,
+  author =       "N. Qian and T. J. Sejnowski",
+  title =        "Predicting the secondary structure of globular
+                 proteins using neural network models",
+  journal =      "Journal of Molecular Biology",
+  volume =       "202",
+  pages =        "865--884",
+  year =         "1988",
+}
+
+@Article{Qian88a,
+  author =       "N. Qian and T. J. Sejnowski",
+  title =        "Predicting the Secondary Structure of Globular
+                 Proteins Using Neural Network Models",
+  journal =      jmolecb,
+  volume =       "202",
+  pages =        "865--884",
+  year =         "1988",
+}
+
+@InProceedings{Qian88b,
+  author =       "N. Qian and T. J. Sejnowski",
+  editor =       "D. Touretzky and G. Hinton and T. Sejnowski",
+  booktitle =    cmss88,
+  title =        "Learning to Solve Random-Dot Stereograms of Dense
+                 Transparent Surfaces with Recurrent Back-Propagation",
+  publisher =    "Morgan Kaufmann, San Mateo",
+  address =      "Pittsburgh 1988",
+  pages =        "435--443",
+  year =         "1988",
+}
+
+@Article{quantiles-nc-2002,
+  author =       "Ichiro Takeuchi and Yoshua Bengio and Takafumi
+                 Kanamori",
+  title =        "Robust Regression with Asymmetric Heavy-Tail Noise Distributions",
+  journal =      "Neural Computation",
+  volume =       "14",
+  number =       "10",
+  pages =        "2469--2496",
+  year =         "2002",
+}
+
+@TechReport{quantiles-TR,
+  author =       "Ichiro Takeuchi and Yoshua Bengio and Takafumi
+                 Kanamori",
+  title =        "Robust Regression with Asymmetric Heavy-Tail Noise",
+  number =       "1198",
+  institution =  "Dept. IRO, Universit\'e de Montr\'eal",
+  year =         "2001",
+}
+
+@Article{Quinlan+Rivest89,
+  author =       "J. Ross Quinlan and Ronald L. Rivest",
+  title =        "Inferring Decision Trees Using the Minimum Description
+                 Length Principle",
+  journal =      "Information and Computation",
+  volume =       "80",
+  pages =        "227--248",
+  year =         "1989",
+}
+
+@Article{Quinlan86,
+  author =       "J. Ross Quinlan",
+  title =        "Induction of Decision Trees",
+  journal =      "Machine Learning",
+  volume =       "1",
+  number =       "1",
+  pages =        "81--106",
+  year =         "1986",
+}
+
+@Book{Quinlan93,
+  author =       "J. Ross Quinlan",
+  title =        "{C4}.5: Programs for Machine Learning",
+  publisher =    "Morgan Kaufmann",
+  year =         "1993",
+}
+
+@Book{Rabiner+Gold75,
+  author =       "L. R. Rabiner and B. Gold",
+  title =        "Theory and application of digital signal processing",
+  publisher =    "Prentice-Hall",
+  year =         "1975",
+}
+
+@Article{Rabiner85,
+  author =       "L. R. Rabiner and S. E. Levinson",
+  title =        "A speaker-independent, syntax-directed, connected word
+                 recognition system based on hidden {Markov} models and
+                 level building",
+  journal =      ieeetassp,
+  volume =       "33",
+  number =       "3",
+  pages =        "561--573",
+  year =         "1985",
+}
+
+@Article{Rabiner86,
+  author =       "L. R. Rabiner and B. H. Juang",
+  title =        "An Introduction to Hidden {Markov} Models",
+  journal =      ieeeassp,
+  pages =        "4--16",
+  month =        jan,
+  year =         "1986",
+}
+
+@Article{Rabiner89,
+  author =       "L. R. Rabiner",
+  title =        "A Tutorial on Hidden {Markov} Models and Selected
+                 Applications in Speech Recognition",
+  journal =      "Proceedings of the IEEE",
+  volume =       "77",
+  number =       "2",
+  pages =        "257--286",
+  year =         "1989",
+  OPTannote =    "",
+}
+
+@Article{Raetsch-2002,
+  author =       "Gunnar R{\"a}tsch and Ayhan Demiriz and Kristin P. Bennett",
+  title =        "Sparse Regression Ensembles in Infinite and Finite
+                 Hypothesis Spaces",
+  journal =      "Machine Learning",
+  publisher =    "Kluwer Academic Publishers",
+  year =         "2002",
+}
+
+@InCollection{Raftery1996,
+  author =       "A. Raftery",
+  editor =       "W. R. Gilks and S. Richardson and D. J. Spiegelhalter",
+  booktitle =    "MCMC in Practice",
+  title =        "Hypothesis Testing and Model Selection",
+  publisher =    "Chapman and Hall",
+  pages =        "163--188",
+  year =         "1996",
+}
+
+
+@inproceedings{RaginskyM2008,
+  author    = {Maxim Raginsky and
+               Svetlana Lazebnik and
+               Rebecca Willett and
+               Jorge Silva},
+  title     = {Near-minimax recursive density estimation on the binary
+               hypercube},
+  editor =       NIPS20ed,
+  booktitle =    NIPS20,
+  year      = {2008},
+  pages     = {1305--1312},
+}
+
+@INPROCEEDINGS{RainaR2003,
+    author = {Rajat Raina and Yirong Shen and Andrew Y. Ng and Andrew McCallum},
+    title = {Classification with hybrid generative/discriminative models},
+    editor = NIPS16ed,
+    booktitle = NIPS16,
+    year = {2003},
+    publisher = {MIT Press}
+}
+
+@Misc{raina+ng+koller-workshop-2005,
+  author =       "Rajat Raina and Andrew Y. Ng and Daphne Koller",
+  title =        "Transfer Learning by Constructing Informative Priors",
+  howpublished = "'Inductive Transfer: 10 Years Later' NIPS Workshop",
+  year =         "2005",
+  OPTkey =       "",
+}
+
+@InProceedings{RainaR2007,
+  author =       "Rajat Raina and Alexis Battle and Honglak Lee and
+                 Benjamin Packer and Andrew Y. Ng",
+  booktitle =    ICML07,
+  editor =       ICML07ed,
+  publisher =    ICML07publ,
+  title =        "Self-taught learning: transfer learning from unlabeled
+                 data",
+  pages =        "759--766",
+  year =         "2007",
+  bibsource =    "DBLP, http://dblp.uni-trier.de",
+  ee =           "http://doi.acm.org/10.1145/1273496.1273592",
+}
+
+@InProceedings{RainaR2007-small,
+  author =       "R. Raina and A. Battle and H. Lee and B. Packer and A.
+                 Y. Ng",
+  booktitle =    "ICML 2007",
+  title =        "Self-taught learning: transfer learning from unlabeled
+                 data",
+  year =         "2007",
+}
+
+@inproceedings{RainaICML09,
+  author = {Raina, Rajat and Madhavan, Anand and Ng, Andrew Y.},
+  title = {Large-scale deep unsupervised learning using graphics processors},
+  booktitle = ICML09,
+  editor =  ICML09ed,
+  publisher = ICML09publ,
+  year = {2009},
+  isbn = {978-1-60558-516-1},
+  pages = {873--880},
+  location = {Montreal, Quebec, Canada},
+  doi = {10.1145/1553374.1553486},
+  address = {New York, NY, USA},
+}
+
+@InProceedings{Ramanujam88,
+  author =       "J. Ramanujam and P. Sadayappan",
+  booktitle =    icnn,
+  title =        "Optimization by Neural Networks",
+  volume =       "2",
+  publisher =    "IEEE, New York",
+  address =      "San Diego 1988",
+  pages =        "325--332",
+  year =         "1988",
+}
+
+@InProceedings{ranzato-07,
+  author =       "{Marc'Aurelio} Ranzato and Christopher Poultney and
+                 Sumit Chopra and Yann {LeCun}",
+  editor =       NIPS19ed,
+  booktitle =    NIPS19,
+  title =        "Efficient Learning of Sparse Representations with an
+                 Energy-Based Model",
+  publisher =    "MIT Press",
+  pages = {1137--1144},
+  year =         "2007",
+}
+
+@InProceedings{ranzato-07-small,
+  author =       "M. Ranzato and C. Poultney and
+                 S. Chopra and Y. {LeCun}",
+  booktitle =    "NIPS 19",
+  title =        "Efficient Learning of Sparse Representations with an
+                 Energy-Based Model",
+  year =         "2007",
+}
+
+@InProceedings{ranzato-07-short,
+  author =       "M. Ranzato and C. Poultney and
+                 S. Chopra and Y. {LeCun}",
+  booktitle =    "Adv. Neural Inf. Proc. Sys. 19",
+  title =        "Efficient Learning of Sparse Representations with an
+                 Energy-Based Model",
+  pages = {1137--1144},
+  year =         "2007",
+}
+
+# Please do NOT use this citation as it is a duplicate of ranzato-07
+@InCollection{ranzato-06,
+  author =       "{Marc'Aurelio} Ranzato and Christopher Poultney and
+                 Sumit Chopra and Yann {LeCun}",
+  editor =       NIPS19ed,
+  booktitle =    NIPS19,
+  title =        "Efficient Learning of Sparse Representations with an
+                 Energy-Based Model",
+  publisher =    "{MIT} Press",
+  pages =        "1137--1144",
+  year =         "2007",
+}
+
+# Please do NOT use this citation as it is a duplicate of ranzato-07-small
+@InCollection{ranzato-06-small,
+  author =       "M. Ranzato and C. Poultney and
+                 S. Chopra and Y. {LeCun}",
+  booktitle =    "NIPS 19",
+  title =        "Efficient Learning of Sparse Representations with an
+                 Energy-Based Model",
+  year =         "2007",
+}
+
+
+@InProceedings{ranzato-08,
+  author =       "{Marc'Aurelio} Ranzato and Y-Lan Boureau and Yann
+                 {LeCun}",
+  editor =       NIPS20ed,
+  booktitle =    NIPS20,
+  title =        "Sparse feature learning for deep belief networks",
+  publisher =    "MIT Press",
+  address =      "Cambridge, MA",
+  pages =        "1185--1192",
+  year =         "2008",
+}
+  %url =          "http://www.cs.nyu.edu/~ranzato/publications/ranzato-nips07.pdf",
+
+@InProceedings{ranzato-08-small,
+  author =       "M. Ranzato and Y. Boureau and Y. {LeCun}",
+  booktitle =    "NIPS 20",
+  title =        "Sparse feature learning for deep belief networks",
+  year =         "2008",
+}
+
+@InProceedings{ranzato-08-short,
+  author =       "M. Ranzato and Y. Boureau and Y. {LeCun}",
+  booktitle =    "Adv. Neural Inf. Proc. Sys. 20",
+  title =        "Sparse feature learning for deep belief networks",
+  year =         "2008",
+  pages = {1185--1192},
+}
+
+@InProceedings{ranzato-cvpr-07,
+  author =       "{Marc'Aurelio} Ranzato and {Fu-Jie} Huang and {Y-Lan}
+                 Boureau and Yann {LeCun}",
+  booktitle =    cvpr07,
+  title =        "Unsupervised Learning of Invariant Feature Hierarchies
+                 with Applications to Object Recognition",
+  publisher =    "IEEE Press",
+  year =         "2007",
+  original =     "orig/ranzato-cvpr-07.pdf",
+}
+
+@InProceedings{ranzato-cvpr-07-small,
+  author =       "{Marc'Aurelio} Ranzato and {Fu-Jie} Huang and {Y-Lan}
+                 Boureau and Yann {LeCun}",
+  booktitle =    "CVPR'07",
+  title =        "Unsupervised Learning of Invariant Feature Hierarchies
+                 with Applications to Object Recognition",
+  year =         "2007",
+  original =     "orig/ranzato-cvpr-07.pdf",
+}
+
+@InProceedings{Ranzato-icdar07,
+  author =       "{Marc'Aurelio} Ranzato and Yann {LeCun}",
+  booktitle =    ICDAR07,
+  title =        "A Sparse and Locally Shift Invariant Feature Extractor
+                 Applied to Document Images",
+  year =         "2007",
+  isbn =         {0-7695-2822-8},
+  pages =        {1213--1217},
+  publisher =    {IEEE Computer Society},
+  address =      {Washington, DC, USA},
+
+}
+
+@InProceedings{ranzato-unsup-07,
+  author =       "{Marc'Aurelio} Ranzato and {Y-Lan} Boureau and Sumit
+                 Chopra and Yann {LeCun}",
+  booktitle =    aistats07,
+  title =        "A Unified Energy-Based Framework for Unsupervised
+                 Learning",
+  publisher =    "Omnipress",
+  date =         "March 21-24, 2007",
+  address =      "San Juan, Puerto Rico",
+  year =         "2007",
+}
+
+@InProceedings{Rao+Ruderman-99,
+  author =       "R. P. N. Rao and D. L. Ruderman",
+  editor =       NIPS11ed,
+  booktitle =    NIPS11,
+  title =        "Learning {Lie} Groups for Invariant Visual
+                 Perception",
+  publisher =    "MIT Press, Cambridge, MA",
+  pages =        "810--816",
+  year =         "1999",
+}
+
+@Book{Rao71,
+  author =       "C. R. Rao and S. K. Mitra",
+  title =        "Generalized Inverse of Matrices and Its Applications",
+  publisher =    "Wiley",
+  address =      "New York",
+  year =         "1971",
+}
+
+@Book{Rashevsky38,
+  author =       "N. Rashevsky",
+  title =        "Mathematical Biophysics",
+  publisher =    "University of Chicago Press",
+  address =      "Chicago",
+  year =         "1938",
+}
+
+@InProceedings{RasmussenC2000,
+  author =       "Carl Rasmussen",
+  editor =       NIPS12ed,
+  booktitle =    NIPS12,
+  title =        "The Infinite {G}aussian Mixture Model",
+  year =         "2000",
+}
+
+@Misc{Rasmussen2001,
+  author =       "Carl Edward Rasmussen",
+  title =        "Conjugate gradient for Matlab",
+  year =         "2001",
+  note =         "http://www.kyb.tuebingen.mpg.de/bs/people/carl/code/minimize/",
+}
+
+@Article{Ratnaparkhi99,
+  author =       "A. Ratnaparkhi",
+  title =        "Learning to parse natural language with maximum
+                 entropy models",
+  journal =      "Machine Learning",
+  volume =       "34",
+  number =       "1--3",
+  pages =        "151--175",
+  year =         "1999",
+}
+
+@Article{Rauch63,
+  author =       "H. E. Rauch",
+  title =        "Solutions to the linear smoothing problem",
+  journal =      "IEEE Transactions on Automatic Control",
+  volume =       "8",
+  pages =        "371--372",
+  year =         "1963",
+}
+
+@Article{Refenes-94,
+  author =       "A. N. Refenes",
+  title =        "Stock Performance Modeling Using Neural Networks: a
+                 Comparative Study with Regression Models",
+  journal =      "Neural Networks",
+  volume =       "7",
+  number =       "2",
+  pages =        "375--388",
+  year =         "1994",
+}
+
+@Article{regression-KB-78,
+  author =       "Koenker, R. and Bassett, Jr., G.",
+  title =        "Regression Quantiles",
+  journal =      "Econometrica",
+  volume =       "46",
+  number =       "1",
+  pages =        "33--50",
+  year =         "1978",
+}
+@inproceedings{reid:1989,
+    title = {Rapid Training of Higher-Order Neural Networks for Invariant Pattern
+        Recognition},
+    author = {Reid, M. B. and Spirkovska, L. and Ochoa, E.},
+    booktitle = ijcnn,
+    month   = jun,
+    year    = {1989},
+    address = {Washington, DC, USA},
+}
+
+@InCollection{Rescorla72,
+  author =       "R. A. Rescorla and A. R. Wagner",
+  editor =       "A. H. Black and W. F. Prokasy",
+  booktitle =    "Classical Conditioning II: Current Research and
+                 Theory",
+  title =        "A Theory of Pavlovian Conditioning: The Effectiveness
+                 of Reinforcement and Nonreinforcement",
+  publisher =    "Appleton-Century-Crofts",
+  address =      "New York",
+  pages =        "64--99",
+  year =         "1972",
+}
+
+@InProceedings{Resnik-2002,
+  author =       "Mona Diab and Philip Resnik",
+  booktitle =    "40th Annual Meeting of the {ACL}",
+  title =        "An unsupervised method for word sense tagging using
+                 parallel corpora",
+  year =         "2002",
+}
+
+@Article{Resnik-99,
+  author =       "Philip Resnik",
+  title =        "Semantic similarity in a taxonomy: an
+                 information-based measure and its application to
+                 problems of ambiguity in natural language",
+  journal =      "Journal of Artificial Intelligence Research",
+  volume =       "11",
+  pages =        "95--130",
+  year =         "1999",
+}
+
+@InProceedings{Resnik-99-web,
+  author =       "P. Resnik",
+  booktitle =    "37th Annual Meeting of the Association for
+                 Computational Linguistics (ACL'99)",
+  title =        "Mining the Web for Bilingual Text",
+  address =      "College Park, Maryland",
+  month =        jun,
+  year =         "1999",
+}
+
+@article{Rhodes-2008,
+ author = {Paul Rhodes},
+ title = {Recoding Patterns of Sensory Input: Higher-Order Features and the Function of Nonlinear Dendritic Trees},
+ journal = {Neural Computation},
+ volume = 20,
+ number=8,
+ pages = {2000--2036},
+ year = 2008,
+}
+
+@Article{RicLip91,
+  author =       "Michael D. Richard and Richard P. Lippmann",
+  title =        "Neural Network Classifiers Estimate {Bayesian}
+                 a-posteriori Probabilities",
+  journal =      "Neural Computation",
+  volume =       "3",
+  pages =        "461--483",
+  year =         "1991",
+  abstract =     "Theoretical argumentation under which circumstances
+                 nets can estimate correctly and what this means for
+                 network engineering methodology. Experimental
+                 evaluations with different cost functions (mean squared
+                 error, cross entropy, normalized likelihood) and
+                 network types (multi layer perceptron, radial basis
+                 function, high order polynomial) show how accuracy
+                 degrades with insufficient data or inadequate network
+                 size. Discusses practical consequences. Contains
+                 references to work on other cost functions (e.g.
+                 information measures)",
+  class =        "nn, learning, theory",
+}
+
+@InProceedings{Ricotti88,
+  author =       "L. P. Ricotti and S. Ragazzini and G. Martinelli",
+  booktitle =    icnn,
+  title =        "Learning of Word Stress in a Sub-Optimal Second Order
+                 Back-Propagation Neural Network",
+  volume =       "1",
+  publisher =    "IEEE, New York",
+  address =      "San Diego 1988",
+  pages =        "355--361",
+  year =         "1988",
+}
+
+@Article{Riedel88,
+  author =       "U. Riedel and R. K{\"u}hn and J. L. van Hemmen",
+  title =        "Temporal Sequences and Chaos in Neural Nets",
+  journal =      prA,
+  volume =       "38",
+  pages =        "1105--1108",
+  year =         "1988",
+}
+
+@Article{Riis96,
+  author =       "S. K. Riis and A. Krogh",
+  title =        "Improving prediction of protein secondary structure
+                 using structured neural networks and multiple sequence
+                 alignments",
+  journal =      "J. Comput. Biol.",
+  volume =       "3",
+  pages =        "163--183",
+  year =         "1996",
+}
+
+@Article{RiisKrogh1996,
+  author =       "S. Riis and A. Krogh",
+  title =        "Improving prediction of protein secondary structure
+                 using structured neural networks and multiple sequence
+                 alignments",
+  journal =      "Journal of Computational Biology",
+  pages =        "163--183",
+  year =         "1996",
+}
+
+@TechReport{Riley94,
+  author =       "M. D. Riley and F. C. N. Pereira",
+  title =        "Weighted-finite-automata tools with applications to
+                 speech and language processing",
+  number =       "Technical Memorandum 11222-931130-28TM",
+  institution =  "AT\&T Bell Laboratories",
+  year =         "1994",
+}
+
+@article{Rissanen79, 
+ author = {Rissanen, J. J. and Langdon, Jr., G. G.},
+ title = {Arithmetic coding},
+ journal = {IBM Journal of Research and Development},
+ volume = 23, 
+ number = 2,
+ pages = {149--162},
+ year = 1979,
+}
+
+@Article{rissanen83,
+  author =       "J. J. Rissanen",
+  title =        "A universal data compression system",
+  journal =      "IEEE Transactions on Information Theory",
+  volume =       "29",
+  pages =        "656--664",
+  year =         "1983",
+}
+
+@Article{Rissanen86,
+  author =       "J. Rissanen",
+  title =        "Stochastic complexity and modeling",
+  journal =      "Annals of Statistics",
+  volume =       "14",
+  pages =        "1080--1100",
+  year =         "1986",
+}
+
+@Book{RissanenBook,
+  author =       "J. Rissanen",
+  title =        "Stochastic Complexity in Statistical Inquiry",
+  publisher =    "World Scientific",
+  address =      "Singapore",
+  year =         "1990",
+}
+
+@Article{Ritter86,
+  author =       "H. Ritter and K. Schulten",
+  title =        "On the Stationary State of Kohonen's Self-Organizing
+                 Sensory Mapping",
+  journal =      biocyb,
+  volume =       "54",
+  pages =        "99--106",
+  year =         "1986",
+}
+
+@InProceedings{Ritter88a,
+  author =       "H. Ritter and K. Schulten",
+  editor =       "R. Eckmiller and Ch. von der Malsburg",
+  booktitle =    "Neural Computers",
+  title =        "Extending Kohonen's Self-Organizing Mapping Algorithm
+                 to Learn Ballistic Movements",
+  publisher =    "Springer-Verlag, Berlin",
+  address =      "Neuss 1987",
+  pages =        "393--406",
+  year =         "1988",
+}
+
+@Article{Ritter88b,
+  author =       "H. Ritter and K. Schulten",
+  title =        "Convergence Properties of Kohonen's Topology
+                 Conserving Maps: Fluctuations, Stability, and Dimension
+                 Selection",
+  journal =      biocyb,
+  volume =       "60",
+  pages =        "59--71",
+  year =         "1988",
+}
+
+@InProceedings{Ritter88c,
+  author =       "H. Ritter and K. Schulten",
+  booktitle =    icnn,
+  title =        "Kohonen's Self-Organizing Maps: Exploring Their
+                 Computational Capabilities",
+  volume =       "1",
+  publisher =    "IEEE, New York",
+  address =      "San Diego 1988",
+  pages =        "109--116",
+  year =         "1988",
+}
+
+@book{Robert-1999,
+  author    = {Christian P. Robert and George Casella},
+  title     = {Monte Carlo Statistical Methods},
+  year      = {1999},
+  publisher = {Springer},
+}
+
+@TechReport{Robinson+Fallside90,
+  author =       "A. J. Robinson and F. Fallside",
+  key =          "Robinson",
+  title =        "Phoneme recognition from the {TIMIT} database using
+                 recurrent error propagation networks",
+  type =         "Technical Report",
+  number =       "{CUED/F-INFENG/TR.42}",
+  institution =  "Cambridge University Engineering Department",
+  year =         "1990",
+}
+
+@Article{Robinson+Fallside91,
+  author =       "A. J. Robinson and F. Fallside",
+  title =        "A recurrent error propagation network speech
+                 recognition system",
+  journal =      "Computer Speech and Language",
+  volume =       "5",
+  number =       "3",
+  pages =        "259--274",
+  month =        jul,
+  year =         "1991",
+}
+
+@InProceedings{Robinson88,
+  author =       "A. J. Robinson and F. Fallside",
+  editor =       nips87ed,
+  booktitle =    nips87,
+  title =        "Static and Dynamic Error Propagation Networks with
+                 Application to Speech Coding",
+  publisher =    "American Institute of Physics, New York",
+  address =      "Denver, CO",
+  pages =        "632--641",
+  year =         "1988",
+}
+
+% Appears to be the same article as Robinson+Fallside91 (same volume,
+% number, pages, month and year); here the journal name comes from the
+% cspla string macro.
+@article{Robinson91,
+  author  = {T. Robinson and F. Fallside},
+  title   = {Recurrent Error Propagation Network Speech Recognition
+             System},
+  journal = cspla,
+  year    = {1991},
+  month   = jul,
+  volume  = {5},
+  number  = {3},
+  pages   = {259--274},
+}
+
+@InProceedings{Robinson92-icassp,
+  author =       "T. Robinson",
+  booktitle =    icassp,
+  title =        "A Real-Time Recurrent Error Propagation Network Word
+                 Recognition System",
+  volume =       "I",
+  pages =        "617--620",
+  year =         "1992",
+}
+
+@Article{robust-H-73,
+  author =       "P. J. Huber",
+  title =        "Robust regression: Asymptotics, Conjectures and
+                 {Monte} {Carlo}",
+  journal =      "Ann. Stat.",
+  volume =       "1",
+  pages =        "799--821",
+  year =         "1973",
+}
+
+@Book{robust-H-82,
+  author =       "P. J. Huber",
+  title =        "Robust Statistics",
+  publisher =    "John Wiley \& Sons Inc.",
+  year =         "1982",
+}
+
+@Book{robust-HRRS-86,
+  author =       "F. R. Hampel and E. M. Ronchetti and P. J. Rousseeuw
+                 and W. A. Stahel",
+  title =        "Robust Statistics, The Approach based on Influence
+                 Functions",
+  publisher =    "John Wiley \& Sons",
+  year =         "1986",
+}
+
+% "Van Aelst" and "Van Driessen" are compound surnames; the comma form keeps
+% "Van" from being read as a middle initial.
+@TechReport{robust-RAD-00,
+  author =       "P. J. Rousseeuw and Van Aelst, S. and Van Driessen, K.",
+  title =        "Robust Multivariate Regression",
+  institution =  "University of Antwerp",
+  year =         "2000",
+}
+
+@Book{robust-RL-87,
+  author =       "P. J. Rousseeuw and A. M. Leroy",
+  title =        "Robust Regression and Outlier Detection",
+  publisher =    "John Wiley \& Sons Inc.",
+  year =         "1987",
+}
+
+@InProceedings{Rohwer-nips90,
+  author =       "R. Rohwer",
+  editor =       NIPS2ed,
+  booktitle =    NIPS2,
+  title =        "The `Moving Targets' Training Algorithm",
+  publisher =    "Morgan Kaufmann",
+  address =      "San Mateo, CA",
+  pages =        "558--565",
+  year =         "1990",
+}
+
+@InProceedings{Rohwer87,
+  author =       "R. Rohwer and B. Forrest",
+  editor =       "M. Caudill and C. Butler",
+  booktitle =    icnn,
+  title =        "Training Time-Dependence in Neural Networks",
+  volume =       "2",
+  publisher =    "IEEE, New York",
+  address =      "San Diego 1987",
+  pages =        "701--708",
+  year =         "1987",
+}
+
+% Appears to be the same paper as Rohwer-nips90 (same author, title wording,
+% pages and year); the two differ in publisher/address layout and in the
+% quote style used in the title.
+@inproceedings{Rohwer90,
+  author    = {R. Rohwer},
+  title     = {The ``Moving Targets'' Training Algorithm},
+  booktitle = NIPS2,
+  editor    = NIPS2ed,
+  year      = {1990},
+  pages     = {558--565},
+  publisher = {Morgan Kaufmann, San Mateo},
+  address   = {Denver, CO},
+}
+
+@article{Rohde+Plaut-99,
+ author = {D.L.T. Rohde and D.C. Plaut},
+ title = {Language acquisition in the absence of explicit negative evidence: {H}ow important is starting small?},
+ journal = {Cognition}, 
+ volume = 72,
+ pages = {67--109},
+ year = 1999
+}
+
+@PhdThesis{Romeo89,
+  author =       "F. I. Romeo",
+  title =        "Simulated Annealing: Theory and Applications to Layout
+                 Problems",
+  school =       "University of California at Berkeley",
+  year =         "1989",
+  note =         "Memorandum UCB/ERL--M89/29",
+}
+
+@InProceedings{Romer+Frey2003,
+  author =       "R. Rosales and B. Frey",
+  booktitle =    UAI03,
+  title =        "Learning Generative Models of Affinity Matrices",
+  publisher =    "Morgan Kaufmann Publishers",
+  address =      "San Francisco, CA",
+  pages =        "485--492",
+  year =         "2003",
+}
+
+@InProceedings{Ron94,
+  author =       "D. Ron and Y. Singer and N. Tishby",
+  editor =       NIPS6ed,
+  booktitle =    NIPS6,
+  title =        "The power of amnesia",
+  publisher =    "Morgan Kaufmann",
+  address =      "San Mateo, CA",
+  pages =        "176--183",
+  year =         "1994",
+}
+
+% Journal article sharing authors and main title with the proceedings
+% entry Ron94.
+@Article{Ron96,
+  author =       "D. Ron and Y. Singer and N. Tishby",
+  title =        "The power of amnesia: Learning Probabilistic Automata
+                 with Variable Memory Length",
+  journal =      "Machine Learning",
+  volume =       "25",
+  pages =        "117--149",
+  year =         "1996",
+}
+
+% Three separate authors, joined with " and " so BibTeX can parse each name.
+@Article{Ron98,
+  author =       "Dana Ron and Yoram Singer and Naftali Tishby",
+  title =        "On the Learnability and Usage of Acyclic Probabilistic
+                 Finite Automata",
+  journal =      "Journal of Computer and System Sciences",
+  volume =       "56",
+  number =       "2",
+  pages =        "133--152",
+  year =         "1998",
+}
+
+% First author's surname carries an umlaut, written as a BibTeX special
+% character so sorting and labels treat it as a single letter.
+@InProceedings{Roscheisen-nips92,
+  author =       "M. R{\"o}scheisen and R. Hofmann and V. Tresp",
+  editor =       NIPS4ed,
+  booktitle =    NIPS4,
+  title =        "Neural Control for Rolling Mills: Incorporating Domain
+                 Theories to Overcome Data Deficiency",
+  publisher =    "Morgan Kaufmann",
+  address =      "San Mateo, CA",
+  pages =        "659--666",
+  year =         "1992",
+}
+
+@Book{Rose85,
+  editor =       "D. Rose and V. G. Dobson",
+  title =        "Models of the Visual Cortex",
+  publisher =    "Wiley",
+  address =      "Chichester",
+  year =         "1985",
+}
+
+@Book{Rosenberg-1997,
+  author =       "S. Rosenberg",
+  title =        "The Laplacian on a Riemannian Manifold",
+  publisher =    "Cambridge University Press",
+  address =      "Cambridge, UK",
+  year =         "1997",
+}
+
+@InCollection{Rosenberg88,
+  author =       "C. R. Rosenberg and G. Blelloch",
+  editor =       "D. Waltz and J. Feldman",
+  booktitle =    "Connectionist Models and their Implications",
+  title =        "An Implementation of Network Learning on the
+                 Connection Machine",
+  publisher =    "Ablex Pub. Corp",
+  address =      "Norwood, NJ",
+  year =         "1988",
+}
+
+@TechReport{Rosenblatt57,
+  author =       "Frank Rosenblatt",
+  title =        "The Perceptron --- a perceiving and recognizing
+                 automaton",
+  number =       "85-460-1",
+  institution =  "Cornell Aeronautical Laboratory",
+  address =      "Ithaca, N.Y.",
+  year =         "1957",
+}
+
+% Page range uses the ASCII "--" form; a literal Unicode en-dash is not safe
+% in a classic (8-bit) BibTeX database.
+@article{Rosenblatt-1958,
+    author = {Frank Rosenblatt},
+    title = {The perceptron: A probabilistic model for information storage and organization in the brain},
+    journal = {Psychological Review},
+    year = {1958},
+    volume = {65},
+    number = {6},
+    pages = {386--408},
+}
+
+@book{Rosenblatt62,
+  author    = {Frank Rosenblatt},
+  title     = {Principles of Neurodynamics},
+  year      = {1962},
+  publisher = {Spartan},
+  address   = {New York},
+}
+
+% Key says "02" but the journal issue is dated 2001; key kept so existing
+% citations keep working.
+@Article{rosenfeld02whole,
+  author =       "Ronald Rosenfeld and Stanley F. Chen and Xiaojin Zhu",
+  title =        "Whole-Sentence Exponential Language Models: {A}
+                 Vehicle For Linguistic-Statistical Integration",
+  journal =      CSL,
+  volume =       "15",
+  number =       "1",
+  pages =        "55--73",
+  year =         "2001",
+  URL =          "http://citeseer.nj.nec.com/448532.html",
+}
+
+@Article{Rosenfeld2000,
+  author =       "Ronald Rosenfeld",
+  title =        "Two decades of Statistical Language Modeling: Where Do
+                 We Go From Here?",
+  journal =      "Proceedings of the {IEEE}",
+  volume =       "88",
+  number =       "8",
+  pages =        "1270--1278",
+  year =         "2000",
+}
+
+@InProceedings{Rosipal2003,
+  author =       "R. Rosipal and L. J. Trejo and B. Matthews",
+  booktitle =    ICML03,
+  editor =       ICML03ed,
+  publisher =    ICML03publ,
+  title =        "Kernel {PLS}-{SVC} for Linear and Nonlinear
+                 Classification",
+  year =         "2003",
+}
+
+@phdthesis{Rossen89,
+  author = {M. L. Rossen},
+  title  = {Speech Syllable Recognition with a Neural Network},
+  year   = {1989},
+  school = {Brown University},
+}
+
+@Article{Rost93,
+  author =       "B. Rost and C. Sander",
+  title =        "Improved prediction of protein secondary structure by
+                 use of sequence profiles and neural networks",
+  journal =      "Proc. Nat. Ac. Sci. USA",
+  volume =       "90",
+  pages =        "7558--7562",
+  year =         "1993",
+}
+
+@Article{Rost94,
+  author =       "B. Rost and C. Sander",
+  title =        "Combining evolutionary information and neural networks
+                 to predict protein secondary structure",
+  journal =      "Proteins",
+  volume =       "19",
+  pages =        "55--72",
+  year =         "1994",
+}
+
+% No issue number is recorded; an empty number field would only produce an
+% "empty field" warning.
+@InProceedings{RothBlack2005,
+  author =       "Stefan Roth and Michael J. Black",
+  booktitle =    cvpr05,
+  title =        "Fields of Experts: a framework for learning image
+                 priors",
+  volume =       "2",
+  pages =        "860--867",
+  year =         "2005",
+}
+
+@InProceedings{Roweis+Saul+Hinton-2002,
+  author =       "S. Roweis and L. Saul and G. Hinton",
+  editor =       NIPS14ed,
+  booktitle =    NIPS14,
+  title =        "Global coordination of local linear models",
+  publisher =    "MIT Press",
+  address =      "Cambridge, MA",
+  year =         "2002",
+}
+
+% DEPRECATED, USE THE ONE BELOW
+@Article{roweis00lle,
+  author =       "Sam Roweis and Lawrence K. Saul",
+  title =        "Nonlinear dimensionality reduction by locally linear
+                 embedding",
+  journal =      "Science",
+  volume =       "290",
+  number =       "5500",
+  pages =        "2323--2326",
+  month =        dec,
+  year =         "2000",
+}
+
+@Article{Roweis2000-lle,
+  author =       "Sam Roweis and Lawrence K. Saul",
+  title =        "Nonlinear dimensionality reduction by locally linear
+                 embedding",
+  journal =      "Science",
+  volume =       "290",
+  number =       "5500",
+  pages =        "2323--2326",
+  month =        dec,
+  year =         "2000",
+}
+
+% NOTE(review): institution is inferred from the street address (University
+% of Toronto, Department of Computer Science) -- confirm before relying on it.
+@TechReport{roweis97unifying,
+  author =       "Sam Roweis and Zoubin Ghahramani",
+  title =        "A Unifying Review of Linear {G}aussian Models",
+  institution =  "Department of Computer Science, University of Toronto",
+  address =      "6 King's College Road, Toronto M5S 3H5, Canada",
+  year =         "1997",
+  URL =          "http://citeseer.nj.nec.com/article/roweis97unifying.html",
+}
+
+@InProceedings{roweis98em,
+  author =       "Sam Roweis",
+  editor =       NIPS10ed,
+  booktitle =    NIPS10,
+  title =        "{EM} Algorithms for {PCA} and {SPCA}",
+  volume =       "10",
+  publisher =    "{MIT} Press",
+  year =         "1998",
+  URL =          "citeseer.nj.nec.com/roweis98em.html",
+}
+
+@InProceedings{RoweisNCA2005,
+  author =       "Jacob Goldberger and Sam Roweis and Geoffrey E. Hinton and Ruslan
+                 Salakhutdinov",
+  editor =       NIPS17ed,
+  booktitle =    NIPS17,
+  title =        "Neighbourhood Components Analysis",
+  publisher =    "{MIT} Press",
+  year =         "2005",
+}
+
+@book{Rubinstein1981,
+  author    = {Reuven Y. Rubinstein},
+  title     = {Simulation and the Monte Carlo Method},
+  year      = {1981},
+  publisher = {John Wiley \& Sons},
+}
+
+@Article{Rubner89,
+  author =       "J. Rubner and P. Tavan",
+  title =        "A Self-Organizing Network for Principal-Component
+                 Analysis",
+  journal =      eul,
+  volume =       "10",
+  pages =        "693--698",
+  year =         "1989",
+}
+
+
+@Article{Rubner90,
+  author =       "J. Rubner and K. Schulten",
+  title =        "Development of Feature Detectors by
+                 Self-Organization",
+  journal =      biocyb,
+  volume =       "62",
+  pages =        "193--199",
+  year =         "1990",
+}
+
+@Article{Rumelhart85,
+  author =       "D. E. Rumelhart and D. Zipser",
+  title =        "Feature Discovery by Competitive Learning",
+  journal =      cogsci,
+  volume =       "9",
+  pages =        "75--112",
+  year =         "1985",
+  note =         "Reprinted in \cite[chapter 5]{Rumelhart86a}",
+}
+
+@Book{Rumelhart86a,
+  author =       "D. E. Rumelhart and J. L. McClelland and the PDP
+                 Research Group",
+  title =        "Parallel Distributed Processing: Explorations in the
+                 Microstructure of Cognition",
+  volume =       "1",
+  publisher =    "MIT Press",
+  address =      "Cambridge",
+  year =         "1986",
+}
+
+@Article{Rumelhart86b,
+  author =       "David E. Rumelhart and Geoffrey E. Hinton and Ronald J. Williams",
+  title =        "Learning Representations by Back-Propagating Errors",
+  journal =      "Nature",
+  volume =       "323",
+  pages =        "533--536",
+  year =         "1986",
+}
+
+@InCollection{Rumelhart86c,
+  author =       "D. E. Rumelhart and G. E. Hinton and R. J. Williams",
+  editor =       "D. E. Rumelhart and J. L. McClelland",
+  booktitle =    pdp,
+  title =        "Learning Internal Representations by Error
+                 Propagation",
+  chapter =      "8",
+  volume =       "1",
+  publisher =    "MIT Press",
+  address =      "Cambridge",
+  pages =        "318--362",
+  year =         "1986",
+}
+
+@InProceedings{Russ+Geoff-nips-2007,
+  author =       "Ruslan Salakhutdinov and Geoffrey E. Hinton",
+  editor =       NIPS20ed,
+  booktitle =    NIPS20,
+  title =        "Using Deep Belief Nets to Learn Covariance Kernels for
+                 {Gaussian} Processes",
+  publisher =    "MIT Press",
+  address =      "Cambridge, MA",
+  pages =        "1249--1256",
+  year =         "2008",
+}
+  %url =          "http://www.csri.utoronto.ca/~hinton/absps/dbngp.pdf",
+
+@InProceedings{Russ+Geoff-nips-2007-small,
+  author =       "Ruslan Salakhutdinov and Geoffrey E. Hinton",
+  booktitle =    "NIPS 20",
+  title =        "Using {D}eep {B}elief {N}ets to Learn Covariance Kernels for
+                 {G}aussian Processes",
+  year =         "2008",
+}
+
+@InProceedings{Russ+Geoff-nips-2007-short,
+  author =       "R. Salakhutdinov and G.E. Hinton",
+  booktitle =    "Adv. Neural Inf. Proc. Sys. 20",
+  title =        "Using {D}eep {B}elief {N}ets to Learn Covariance Kernels for
+                 {G}aussian Processes",
+  pages = {1249--1256},
+  year =         "2008",
+}
+
+% Page range written with "--" so it typesets as an en-dash.
+@article{rust:2005,
+    author      = {Nicole Rust and Odelia Schwartz and J. Anthony Movshon and Eero Simoncelli},
+    title       = {Spatiotemporal Elements of Macaque {V1} Receptive Fields},
+    journal     = {Neuron},
+    volume      = {46},
+    number      = {6},
+    pages       = {945--956},
+    year        = {2005}
+}
+% "MT" is braced so sentence-casing styles do not lowercase the acronym;
+% page range written with "--" so it typesets as an en-dash.
+@article{rust:2006,
+    author = {Nicole C. Rust and Valerio Mante and Eero P. Simoncelli and J.
+        Anthony Movshon},
+    year = {2006},
+    title = {How {MT} Cells Analyze the Motion of Visual Patterns},
+    journal = {Nature Neuroscience},
+    volume = {9},
+    number = {11},
+    pages = {1421--1431},
+}
+
+% Journal is IEEE Transactions on Pattern Analysis and Machine Intelligence
+% (TPAMI).
+@Article{RYsed98,
+  author =       "Eric Sven Ristad and Peter N. Yianilos",
+  title =        "Learning String Edit Distance",
+  journal =      "IEEE Transactions on Pattern Analysis and Machine
+                 Intelligence",
+  volume =       "20",
+  number =       "5",
+  pages =        "522--532",
+  month =        may,
+  year =         "1998",
+}
+
+@book{Saad-1996,
+  author    = {Y. Saad},
+  title     = {Iterative Methods for Sparse Linear Systems},
+  year      = {1996},
+  publisher = {{PWS} Publishing Company},
+  address   = {Boston, MA},
+}
+
+@TechReport{Saad90a,
+  author =       "D. Saad and E. Marom",
+  title =        "Learning by Choice of Internal Representations --- An
+                 Energy Minimization Approach",
+  type =         "Preprint",
+  institution =  "Faculty of Engineering, Tel Aviv University",
+  address =      "Ramat-Aviv, Israel",
+  year =         "1990",
+}
+
+@TechReport{Saad90b,
+  author =       "D. Saad and E. Marom",
+  title =        "Training Feed Forward Nets with Binary Weights via a
+                 Modified {CHIR} Algorithm",
+  type =         "Preprint",
+  institution =  "Faculty of Engineering, Tel Aviv University",
+  address =      "Ramat-Aviv, Israel",
+  year =         "1990",
+}
+
+@Book{SaadOnlineLearning1999,
+  editor =       "David Saad",
+  title =        "On-Line Learning in Neural Networks",
+  publisher =    "Cambridge University Press",
+  year =         "1999",
+}
+
+@Article{Sachs+Young80,
+  author =       "M. B. Sachs and E. D. Young",
+  title =        "Effects of nonlinearities on speech encoding in the
+                 auditory nerve",
+  journal =      jasa,
+  volume =       "68",
+  number =       "3",
+  pages =        "858--875",
+  year =         "1980",
+}
+
+@Article{Sakoe78,
+  author =       "H. Sakoe and C. Chiba",
+  title =        "Dynamic Programming Algorithm Optimization for Spoken
+                 Word Recognition",
+  journal =      ieeetassp,
+  volume =       "26",
+  number =       "1",
+  pages =        "43--49",
+  month =        feb,
+  year =         "1978",
+}
+
+@InProceedings{Salakhutdinov-2010,
+    author = {Ruslan Salakhutdinov},
+     title = {Learning in {M}arkov Random Fields using Tempered Transitions},
+      year = {2010},
+  crossref = {NIPS22}
+}
+
+@InProceedings{Salakhutdinov+Hinton2007,
+  author =       "Ruslan Salakhutdinov and Geoffrey E. Hinton",
+  booktitle =    "Proceedings of the 2007 Workshop on Information
+                 Retrieval and applications of Graphical Models (SIGIR
+                 2007)",
+  title =        "Semantic Hashing",
+  year =         "2007",
+  publisher  =   "Elsevier",
+  address = {Amsterdam},
+}
+
+@InProceedings{Salakhutdinov+Hinton2007-small,
+  author =       "Ruslan Salakhutdinov and Geoffrey E. Hinton",
+  booktitle =    "SIGIR",
+  title =        "Semantic Hashing",
+  year =         "2007",
+}
+
+% Full-length form of this reference; SalakhutdinovR2007-small and
+% SalakhutdinovR2007-short are abbreviated variants of the same paper.
+@InProceedings{SalakhutdinovR2007,
+  author =       "Ruslan Salakhutdinov and Geoffrey E. Hinton",
+  booktitle =    aistats07,
+  title =        "Learning a Nonlinear Embedding by Preserving Class
+                 Neighbourhood Structure",
+  publisher =    "Omnipress",
+  date =         "March 21-24, 2007",
+  address =      "San Juan, Puerto Rico",
+  year =         "2007",
+}
+
+@InProceedings{SalakhutdinovR2007-small,
+  author =       "Ruslan Salakhutdinov and Geoffrey E. Hinton",
+  booktitle =    aistats07-small,
+  title =        "Learning a Nonlinear Embedding by Preserving Class
+                 Neighbourhood Structure",
+  year =         "2007",
+}
+
+@InProceedings{SalakhutdinovR2007-short,
+  author =       "R. Salakhutdinov and G.E. Hinton",
+  booktitle =    {AI \& Stat.'2007},
+  title =        "Learning a Nonlinear Embedding by Preserving Class
+                 Neighbourhood Structure",
+  year =         "2007",
+}
+
+% Full-length form of this reference; SalakhutdinovR2007b-small and
+% SalakhutdinovR2007b-short are abbreviated variants of the same paper.
+@InProceedings{SalakhutdinovR2007b,
+  author =       "Ruslan Salakhutdinov and Andriy Mnih and Geoffrey E.
+                 Hinton",
+  booktitle =    ICML07,
+  editor =       ICML07ed,
+  publisher =    ICML07publ,
+  title =        "Restricted {Boltzmann} machines for collaborative
+                 filtering",
+  address =      "New York, NY, USA",
+  pages =        "791--798",
+  year =         "2007",
+  location =     "Corvallis, Oregon",
+}
+
+@InProceedings{SalakhutdinovR2007b-small,
+  author =       "Ruslan Salakhutdinov and Andriy Mnih and Geoffrey E. Hinton",
+  booktitle =    "ICML 2007",
+  title =        "Restricted {Boltzmann} machines for collaborative
+                 filtering",
+  year =         "2007",
+}
+
+@InProceedings{SalakhutdinovR2007b-short,
+  author =       "R. Salakhutdinov and A. Mnih and G.E. Hinton",
+  booktitle =    "Int. Conf. Mach. Learn. 2007",
+  title =        "Restricted {Boltzmann} machines for collaborative
+                 filtering",
+  pages =        "791--798",
+  year =         "2007",
+}
+
+
+@InProceedings{Salakhutdinov+Murray-2008,
+    title =     "On the Quantitative Analysis of Deep Belief Networks",
+    author =    "Ruslan Salakhutdinov and Iain Murray",
+    booktitle = ICML08,
+    editor =    ICML08ed,
+    publisher = ICML08publ,
+    pages =     "872--879",
+    year =      "2008",
+    volume =    "25",
+}
+
+@InProceedings{Salakhutdinov+Hinton-2009,
+  author =       "Ruslan Salakhutdinov and Geoffrey E. Hinton",
+  booktitle =    aistats09,
+  title =        "Deep {Boltzmann} Machines",
+  year =         "2009",
+  volume =       5,
+  location =     "Clearwater (Florida), USA",
+  date =         "April 16-18, 2009",
+  pages =        "448--455",
+}
+
+@Article{Salamon88,
+  author =       "P. Salamon and J. D. Nulton and J. Robinson and J.
+                 Petersen and G. Ruppeiner and L. Liao",
+  title =        "Simulated Annealing with Constant Thermodynamic
+                 Speed",
+  journal =      cpc,
+  volume =       "49",
+  pages =        "423--428",
+  year =         "1988",
+}
+
+@Article{Salton+Buckley88,
+  author =       "G. Salton and C. Buckley",
+  title =        "Term weighting approaches in automatic text
+                 retrieval",
+  journal =      "Information Processing and Management",
+  volume =       "24",
+  number =       "5",
+  pages =        "513--523",
+  year =         "1988",
+}
+
+@Article{Sanger89a,
+  author =       "T. D. Sanger",
+  title =        "Optimal Unsupervised Learning in a Single-Layer Linear
+                 Feedforward Neural Network",
+  journal =      nn,
+  volume =       "2",
+  pages =        "459--473",
+  year =         "1989",
+}
+
+@InProceedings{Sanger89b,
+  author =       "T. D. Sanger",
+  editor =       NIPS1ed,
+  booktitle =    NIPS1,
+  title =        "An Optimality Principle for Unsupervised Learning",
+  publisher =    "Morgan Kaufmann, San Mateo",
+  address =      "Denver, CO",
+  pages =        "11--19",
+  year =         "1989",
+}
+
+% Full journal-name form; Sanger-1994-small is the abbreviated variant.
+@article{Sanger-1994,
+ author = {Terence D. Sanger},
+ title = {Neural network learning control of robot manipulators
+      using gradually increasing task difficulty},
+ journal = {{IEEE} Transactions on Robotics and Automation},
+ volume = 10,
+ number = 3,
+ pages = {323--333},
+ year = 1994,
+}
+
+% Abbreviated journal-name variant of Sanger-1994.
+@article{Sanger-1994-small,
+ author = {Terence D. Sanger},
+ title = {Neural network learning control of robot manipulators
+      using gradually increasing task difficulty},
+ journal = {{IEEE} Trans. on Robotics and Automation},
+ volume = 10,
+ number = 3,
+ pages = {323--333},
+ year = 1994,
+}
+
+@InProceedings{sarawagi03,
+  author =       "Sunita Sarawagi and Soumen Chakrabarti and Shantanu
+                 Godbole",
+  booktitle =    "KDD '03: Proceedings of the ninth ACM SIGKDD
+                 international conference on Knowledge discovery and
+                 data mining",
+  title =        "Cross-training: learning probabilistic mappings
+                 between topics",
+  publisher =    "ACM Press",
+  address =      "New York, NY, USA",
+  pages =        "177--186",
+  year =         "2003",
+  location =     "Washington, D.C.",
+}
+
+@article{Sarkar-Moore-2005,
+ author = {P. Sarkar and A. Moore},
+ title = {Dynamic social network analysis using latent space models},
+ journal = {{SIGKDD} Explorations},
+ volume = 7,
+ number = 2,
+ pages = {31--40},
+ year = 2005,
+}
+
+@Article{Sato90,
+  author =       "M. Sato",
+  title =        "A Real Time Learning Algorithm for Recurrent Analog
+                 Neural Networks",
+  journal =      biocyb,
+  volume =       "62",
+  pages =        "237--241",
+  year =         "1990",
+}
+
+@Article{Saul+96,
+  author =       "Lawrence K. Saul and Tommi Jaakkola and Michael I. Jordan",
+  title =        "Mean field theory for sigmoid belief networks",
+  journal =      "Journal of Artificial Intelligence Research",
+  volume =       "4",
+  pages =        "61--76",
+  year =         "1996",
+}
+
+% Key kept as-is for existing citations; JMLR volume 4 was published in 2003.
+% Empty number/month fields are omitted rather than left as "".
+@Article{Saul+Roweis-2002,
+  author =       "L. Saul and S. Roweis",
+  title =        "Think globally, fit locally: unsupervised learning of
+                 low dimensional manifolds",
+  journal =      jmlr,
+  volume =       "4",
+  pages =        "119--155",
+  year =         "2003",
+}
+
+@InProceedings{Saul95,
+  author =       "Lawrence K. Saul and Michael I. Jordan",
+  editor =       NIPS7ed,
+  booktitle =    NIPS7,
+  title =        {Boltzmann Chains and Hidden Markov Models},
+  publisher =    "MIT Press, Cambridge, MA",
+  pages =        "435--442",
+  year =         "1995",
+}
+
+@InProceedings{Saul96,
+  author =       "Lawrence K. Saul and Michael I. Jordan",
+  editor =       NIPS8ed,
+  booktitle =    NIPS8,
+  title =        "Exploiting tractable substructures in intractable
+                 networks",
+  publisher =    "MIT Press, Cambridge, MA",
+  year =         "1996",
+}
+
+@InProceedings{SaulJordan97,
+  author =       "Lawrence K. Saul and Michael I. Jordan",
+  editor =       NIPS9ed,
+  booktitle =    NIPS9,
+  title =        "A variational model for model-based interpolation",
+  publisher =    "MIT Press",
+  pages =        "375",
+  year =         "1997",
+}
+
+@Article{Saund-1989,
+  author =       "Eric Saund",
+  title =        "Dimensionality-reduction using connectionist
+                 networks",
+  journal =      "{IEEE} Transactions on Pattern Analysis and Machine
+                 Intelligence",
+  volume =       "11",
+  number =       "3",
+  pages =        "304--314",
+  year =         "1989",
+}
+
+@InCollection{Scalettar88,
+  author =       "R. Scalettar and A. Zee",
+  editor =       "D. Waltz and J. A. Feldman",
+  booktitle =    "Connectionist Models and Their Implications: Readings
+                 from Cognitive Science",
+  title =        "Emergence of Grandmother Memory in Feed Forward
+                 Networks: Learning with Noise and Forgetfulness",
+  publisher =    "Ablex",
+  address =      "Norwood",
+  pages =        "309--332",
+  year =         "1988",
+}
+
+@article{schapire-90,
+  author  = {Robert E. Schapire},
+  title   = {The strength of weak learnability},
+  journal = {Machine Learning},
+  year    = {1990},
+  volume  = {5},
+  number  = {2},
+  pages   = {197--227},
+}
+
+@Article{Schapire-margin98,
+  author =       "Robert E. Schapire and Yoav Freund and Peter Bartlett
+                 and Wee Sun Lee",
+  title =        "Boosting the margin: {A} new explanation for the
+                 effectiveness of voting methods",
+  journal =      "The Annals of Statistics",
+  volume =       "26",
+  number =       "5",
+  pages =        "1651--1686",
+  year =         "1998",
+}
+
+% The volume number refers to the Springer LNCS series, so the series is
+% named explicitly.
+@InProceedings{schapire99theoretical,
+  author =       "Robert E. Schapire",
+  booktitle =    "Algorithmic Learning Theory, 10th International
+                 Conference, {ALT} '99, Tokyo, Japan, December 1999,
+                 Proceedings",
+  title =        "Theoretical Views of Boosting and Applications",
+  series =       "Lecture Notes in Computer Science",
+  volume =       "1720",
+  publisher =    "Springer",
+  pages =        "13--25",
+  year =         "1999",
+  URL =          "http://citeseer.ist.psu.edu/article/schapire99theoretical.html",
+}
+
+@InProceedings{SchapireSinger98,
+  author =       "R. E. Schapire and Y. Singer",
+  booktitle =    "Proceedings of the 11th Annual Conference on
+                 Computational Learning Theory",
+  title =        "Improved Boosting Algorithms Using Confidence Rated
+                 Predictions",
+  year =         "1998",
+}
+
+% The umlaut is written as a BibTeX special character ({\"o}) so sorting and
+% label generation treat it as a single letter.
+% NOTE(review): this looks like an edited volume; consider whether these
+% names belong in an editor field instead of author.
+@Book{SchBurSmo99,
+  author =       "B. Sch{\"o}lkopf and C. J. C. Burges and A. J. Smola",
+  title =        "Advances in Kernel Methods --- Support Vector
+                 Learning",
+  publisher =    "MIT Press",
+  address =      "Cambridge, MA",
+  year =         "1999",
+}
+
+@InProceedings{ScheinA2001,
+  author =       "Andrew I. Schein and Alexandrin Popescul and Lyle H.
+                 Ungar and David M. Pennock",
+  booktitle =    "Workshop on Recommender Systems at SIGIR",
+  title =        "Generative Models for Cold-Start Recommendations",
+  year =         "2001",
+}
+
+@InProceedings{ScheinA2002,
+  author =       "Andrew I. Schein and Alexandrin Popescul and Lyle H.
+                 Ungar and David M. Pennock",
+  booktitle =    "SIGIR '02",
+  title =        "Methods and metrics for cold-start recommendations",
+  publisher =    "ACM Press",
+  address =      "New York, NY, USA",
+  pages =        "253--260",
+  year =         "2002",
+}
+
+@InCollection{Scheines94,
+  author =       "R. Scheines",
+  editor =       "P. Cheeseman and R. W. Oldford",
+  booktitle =    "Selecting Models from Data: Artificial Intelligence
+                 and Statistics {IV}",
+  title =        "Inferring causal structure among unmeasured
+                 variables",
+  publisher =    "Springer-Verlag",
+  pages =        "197--204",
+  year =         "1994",
+}
+
+@InProceedings{Schenkel93,
+  author =       "M. Schenkel and H. Weissman and I. Guyon and C. Nohl
+                 and D. Henderson",
+  editor =       NIPS5ed,
+  booktitle =    NIPS5,
+  title =        "Recognition-Based Segmentation of On-Line Hand-Printed
+                 Words",
+  address =      "Denver, CO",
+  pages =        "723--730",
+  year =         "1993",
+}
+
+% The publisher field is not part of the standard article fields; it is kept
+% here only as an annotation.
+@Article{schenkel95,
+  author =       "M. Schenkel and I. Guyon and D. Henderson",
+  title =        "On-line Cursive Script Recognition using Time Delay
+                 Neural Networks and Hidden {Markov} Models",
+  journal =      "Machine Vision and Applications",
+  volume =       "8",
+  publisher =    "Springer Verlag",
+  pages =        "215--223",
+  year =         "1995",
+}
+
+@inproceedings{SchGra03,
+  author    = {Nicol N. Schraudolph and Thore Graepel},
+  title     = {Combining Conjugate Direction Methods with Stochastic
+               Approximation of Gradients},
+  editor    = {Christopher M. Bishop and Brendan J. Frey},
+  booktitle = {Proc.\ 9th Intl.\ Workshop Artificial Intelligence and
+               Statistics (AIstats)},
+  publisher = {Society for Artificial Intelligence and Statistics},
+  address   = {Key West, Florida},
+  pages     = {7--13},
+  year      = {2003},
+  isbn      = {0-9727358-0-1},
+  abstract  = {The method of conjugate directions provides a very effective
+               way to optimize large, deterministic systems by gradient
+               descent. In its standard form, however, it is not amenable to
+               stochastic approximation of the gradient. Here we explore
+               ideas from conjugate gradient in the stochastic (online)
+               setting, using fast Hessian-gradient products to set up
+               low-dimensional Krylov subspaces within individual
+               mini-batches. In our benchmark experiments the resulting
+               online learning algorithms converge orders of magnitude
+               faster than ordinary stochastic gradient descent.},
+}
+
+@article{Schmidhuber92,
+  author  = {J{\"u}rgen Schmidhuber},
+  title   = {Learning Complex, Extended Sequences using the Principle of
+             History Compression},
+  journal = nc,
+  volume  = {4},
+  number  = {2},
+  pages   = {234--242},
+  year    = {1992},
+}
+
+@Article{Schmidhuber96,
+  author =       "J{\"u}rgen Schmidhuber and Stefan Heil",
+  title =        "Sequential Neural Text Compression",
+  journal =      "IEEE Transactions on Neural Networks",
+  volume =       "7",
+  number =       "1",
+  pages =        "142--146",
+  year =         "1996",
+}
+
+@Misc{Schmidt-2006,
+  author =       "Volker Schmidt",
+  title =        "Markov Chains and {Monte-Carlo} Simulation",
+  howpublished = "Lecture Notes, Summer 2006, Ulm University, Department
+                 of Stochastics",
+  year =         "2006",
+  URL =          "http://www.mathematik.uni-ulm.de/stochastik/lehre/ss06/markov/skript-engl/skript-engl.htm",
+}
+
+@article{Schmitt-2002,
+  author  = {M. Schmitt},
+  title   = {Descartes' Rule of Signs for Radial Basis Function Neural
+             Networks},
+  journal = {Neural Computation},
+  volume  = {14},
+  number  = {12},
+  pages   = {2997--3011},
+  year    = {2002},
+}
+
+@article{Schneider-2001,
+  author  = {Tapio Schneider},
+  title   = {Analysis of Incomplete Climate Data: Estimation of Mean
+             Values and Covariance Matrices and Imputation of Missing
+             Values},
+  journal = {Journal of Climate},
+  volume  = {14},
+  pages   = {853--871},
+  year    = {2001},
+}
+
+@article{Schneidman+al-2003,
+    author = {Schneidman, E. and Bialek, W. and Berry, M. J.},
+    title = {Synergy, redundancy, and independence in population codes},
+    journal = {Journal of Neuroscience},
+    volume = {23},
+    number = {37},
+    pages = {11539--11553},
+    month = dec,
+    year = {2003},
+    issn = {1529-2401},
+    url = {http://www.jneurosci.org/cgi/content/abstract/23/37/11539},
+    affiliation = {Department of Molecular Biology, Princeton University, Princeton, New Jersey 08544, USA.},
+}
+    
+
+@article{schoelkopf97comparing,
+  author  = {B. Sch{\"o}lkopf and K. Sung and C. Burges and F. Girosi and
+             P. Niyogi and T. Poggio and V. Vapnik},
+  title   = {Comparing support vector machines with {G}aussian kernels to
+             radial basis function classifiers},
+  journal = {IEEE Transactions on Signal Processing},
+  volume  = {45},
+  pages   = {2758--2765},
+  year    = {1997},
+  text    = {Sch{\"o}lkopf, B., Sung, K., Burges, C., Girosi, F., Niyogi,
+             P., Poggio, T., and Vapnik, V.: Comparing support vector
+             machines with {G}aussian kernels to radial basis function
+             classifiers. IEEE Transactions on Signal Processing, 45
+             (1997) 2758-2765.},
+}
+
+@book{Scholkopf02-book,
+  author    = {B. Sch{\"o}lkopf and A. J. Smola},
+  title     = {Learning with Kernels: Support Vector Machines,
+               Regularization, Optimization and Beyond},
+  publisher = {MIT Press},
+  address   = {Cambridge, MA},
+  year      = {2002},
+}
+
+@TechReport{Scholkopf96,
+  author =       "B. Sch{\"o}lkopf and A. Smola and K.-R. M{\"u}ller",
+  title =        "Nonlinear Component Analysis as a Kernel Eigenvalue
+                 Problem",
+  number =       "44",
+  institution =  "Max Planck Institute for Biological Cybernetics,
+                 T{\"u}bingen, Germany",
+  year =         "1996",
+}
+
+@article{Scholkopf98,
+  author  = {B. Sch{\"o}lkopf and A. Smola and K.-R. M{\"u}ller},
+  title   = {Nonlinear component analysis as a kernel eigenvalue
+             problem},
+  journal = {Neural Computation},
+  volume  = {10},
+  pages   = {1299--1319},
+  year    = {1998},
+}
+
+@Book{Scholkopf98-book,
+  editor =       "B. Sch{\"o}lkopf and C. J. C. Burges and A. J. Smola",
+  title =        "Advances in Kernel Methods: Support Vector Learning",
+  publisher =    "MIT Press",
+  address =      "Cambridge, MA",
+  year =         "1998",
+}
+
+@article{Scholkopf99,
+  author  = {B. Sch{\"o}lkopf and S. Mika and C. Burges and P. Knirsch and
+             K.-R. M{\"u}ller and G. R{\"a}tsch and A. Smola},
+  title   = {Input Space Versus Feature Space in Kernel-Based Methods},
+  journal = {IEEE Trans. Neural Networks},
+  volume  = {10},
+  number  = {5},
+  pages   = {1000--1017},
+  year    = {1999},
+}
+
+@article{Schraudolph02,
+  author  = {Nicol N. Schraudolph},
+  title   = {Fast Curvature Matrix-Vector Products for Second-Order
+             Gradient Descent},
+  journal = {Neural Computation},
+  volume  = {14},
+  number  = {7},
+  pages   = {1723--1738},
+  year    = {2002},
+}
+
+@inproceedings{Schraudolph99,
+  author    = {Nicol N. Schraudolph},
+  title     = {Local gain adaptation in stochastic gradient descent},
+  booktitle = {Proceedings of the 9th International Conference on
+               Artificial Neural Networks},
+  pages     = {569--574},
+  year      = {1999},
+}
+
+@inproceedings{Schutze92,
+  author    = {Hinrich Sch{\"u}tze},
+  title     = {Dimensions of Meaning},
+  booktitle = {Supercomputing'92},
+  pages     = {787--796},
+  address   = {Minneapolis MN},
+  year      = {1992},
+}
+
+@InProceedings{Schutze93,
+  author =       "H. Sch{\"u}tze",
+  editor =       NIPS5ed,
+  booktitle =    NIPS5,
+  title =        "Word space",
+  publisher =    "Morgan Kaufmann",
+  address =      "San Mateo CA",
+  pages =        "895--902",
+  year =         "1993",
+}
+
+@Misc{Schuurmans1999,
+  author =       "Dale Schuurmans",
+  title =        "Greedy importance sampling: {A} new {Monte Carlo}
+                 inference method",
+  year =         "1999",
+  URL =          "http://citeseer.nj.nec.com/25013.html",
+}
+
+@InProceedings{Schuurmans2000,
+  author =       "Dale Schuurmans and Finnegan Southey",
+  booktitle =    "Proceedings of the 16th Conference on Uncertainty in
+                 Artificial Intelligence (UAI-2000)",
+  title =        "{Monte Carlo} inference via greedy importance sampling",
+  pages =        "523--532",
+  year =         "2000",
+  URL =          "http://citeseer.nj.nec.com/281712.html",
+}
+
+@article{Schuurmans2001,
+  author  = {D. Schuurmans and F. Southey},
+  title   = {Metric-based methods for adaptive model selection and
+             regularization},
+  journal = {Machine Learning},
+  volume  = {48},
+  number  = {1},
+  pages   = {51--84},
+  year    = {2002},
+}
+
+@inproceedings{Schuurmans97,
+  author    = {D. Schuurmans},
+  title     = {A new metric-based approach to model selection},
+  booktitle = {Proceedings of the National Conference on Artificial
+               Intelligence (AAAI-97)},
+  pages     = {552--558},
+  year      = {1997},
+}
+
+@article{Schwartz90,
+  author  = {D. B. Schwartz and V. K. Samalam and S. A. Solla and
+             J. S. Denker},
+  title   = {Exhaustive Learning},
+  journal = nc,
+  volume  = {2},
+  pages   = {371--382},
+  year    = {1990},
+}
+
+@article{Schwenk+Bengio00,
+  author  = {Holger Schwenk and Yoshua Bengio},
+  title   = {Boosting Neural Networks},
+  journal = {Neural Computation},
+  volume  = {12},
+  number  = {8},
+  pages   = {1869--1887},
+  year    = {2000},
+}
+
+@InProceedings{Schwenk+Gauvain-2005,
+  author =       "Holger Schwenk and Jean-Luc Gauvain",
+  booktitle =    "Interspeech",
+  title =        "Building continuous space language models for
+                 transcribing {European} languages",
+  pages =        "737--740",
+  year =         "2005",
+}
+
+@inproceedings{Schwenk+Gauvain2002,
+  author    = {H. Schwenk and J-L. Gauvain},
+  title     = {Connectionist Language Modeling for Large Vocabulary
+               Continuous Speech Recognition},
+  booktitle = icassp,
+  pages     = {765--768},
+  address   = {Orlando, Florida},
+  year      = {2002},
+}
+
+@inproceedings{Schwenk+Gauvain2002-short,
+  author    = {H. Schwenk and J-L. Gauvain},
+  title     = {Connectionist Language Modeling for Large Vocabulary
+               Continuous Speech Recognition},
+  booktitle = {Int. Conf. Acoust. Speech \& Sig. Proc.},
+  pages     = {765--768},
+  address   = {Orlando, Florida},
+  year      = {2002},
+}
+
+@inproceedings{Schwenk05C,
+  author    = {Holger Schwenk and Jean-Luc Gauvain},
+  title     = {Training Neural Network Language Models On Very Large
+               Corpora},
+  booktitle = {Joint Human Language Technology Conference and Conference
+               on Empirical Methods in Natural Language Processing
+               (EMNLP)},
+  pages     = {201--208},
+  address   = {Vancouver},
+  month     = oct,
+  year      = {2005},
+  url       = {ftp://tlp.limsi.fr/public/emnlp05.pdf},
+}
+
+@inproceedings{Schwenk05C-small,
+  author    = {Holger Schwenk and Jean-Luc Gauvain},
+  title     = {Training Neural Network Language Models On Very Large
+               Corpora},
+  booktitle = {EMNLP'2005},
+  pages     = {201--208},
+  year      = {2005},
+}
+
+@TechReport{Schwenk:2001:tr,
+  author =       "Holger Schwenk",
+  title =        "Language Modeling in the Continuous Domain",
+  number =       "2001-20",
+  institution =  "LIMSI-CNRS, Orsay, France",
+  month =        dec,
+  year =         "2001",
+}
+
+@inproceedings{Schwenk:2002:icassp,
+  author    = {Holger Schwenk and Jean-Luc Gauvain},
+  title     = {Connectionist Language Modeling for Large Vocabulary
+               Continuous Speech Recognition},
+  booktitle = icassp,
+  volume    = {1},
+  pages     = {765--768},
+  year      = {2002},
+}
+
+@InProceedings{Schwenk:2003:sspr,
+  author =       "Holger Schwenk and Jean-Luc Gauvain",
+  booktitle =    "ISCA \& IEEE Workshop on Spontaneous Speech Processing
+                 and Recognition",
+  title =        "Using Continuous Space Language Models for
+                 Conversational Speech Recognition",
+  address =      "Tokyo",
+  month =        apr,
+  year =         "2003",
+}
+
+@inproceedings{Schwenk:2004:icslp,
+  author    = {Holger Schwenk and Jean-Luc Gauvain},
+  title     = {Using a Continuous Space Language Model for Conversational
+               Speech Recognition},
+  booktitle = icslp,
+  year      = {2004},
+  note      = {submitted},
+}
+
+@inproceedings{Schwenk:2004:ijcnn,
+  author    = {Holger Schwenk},
+  title     = {Efficient Training of Large Neural Networks for Language
+               Modeling},
+  booktitle = ijcnn,
+  volume    = {4},
+  pages     = {3050--3064},
+  year      = {2004},
+}
+
+@inproceedings{SchYuGue07,
+  author    = {Nicol N. Schraudolph and Jin Yu and Simon G{\"u}nter},
+  title     = {A Stochastic Quasi-{Newton} Method for Online Convex
+               Optimization},
+  booktitle = {Proc.\ 11th Intl.\ Conf.\ Artificial Intelligence and
+               Statistics (AIstats)},
+  publisher = {Society for Artificial Intelligence and Statistics},
+  address   = {San Juan, Puerto Rico},
+  pages     = {433--440},
+  year      = {2007},
+  isbn      = {0-9727358-2-8},
+}
+
+@InProceedings{Scofield88,
+  author =       "C. L. Scofield",
+  booktitle =    icnn,
+  title =        "Learning Internal Representations in the {Coulomb}
+                 Energy Network",
+  volume =       "1",
+  publisher =    "IEEE, New York",
+  address =      "San Diego",
+  pages =        "271--276",
+  year =         "1988",
+}
+
+@inproceedings{Scott+al-2003,
+  author    = {Scott S. L. Piao and Paul Rayson and Dawn Archer and Andrew
+               Wilson and Tony McEnery},
+  title     = {Extracting multiword expressions with a semantic tagger},
+  booktitle = {Proceedings of the ACL 2003 workshop on Multiword
+               expressions},
+  publisher = {Association for Computational Linguistics},
+  address   = {Morristown, NJ, USA},
+  pages     = {49--56},
+  year      = {2003},
+}
+
+@book{Scott92,
+  author    = {D. W. Scott},
+  title     = {Multivariate Density Estimation: Theory, Practice, and
+               Visualization},
+  publisher = {Wiley},
+  address   = {New York},
+  year      = {1992},
+}
+
+@Article{ScST95,
+  author =       "A. Schaerf and Y. Shoham and M. Tennenholtz",
+  title =        "Adaptive load balancing: a study in multi-agent
+                 learning",
+  journal =      "Journal of Artificial Intelligence Research",
+  volume =       "2",
+  pages =        "475--500",
+  year =         "1995",
+}
+
+@Article{Scudder65,
+  author =       "Scudder, III, Henry J.",
+  title =        {Probability of Error of Some Adaptive Pattern-Recognition Machines},
+  journal =      {IEEE Transactions on Information Theory},
+  year =         1965,
+  volume =       11,
+  number =       3,
+  pages =        {363--371}
+}
+
+@techreport{Seeger-2005,
+  author      = {Matthias Seeger},
+  title       = {Low Rank Updates for the {Cholesky} Decomposition},
+  institution = {Department of EECS, University of California at
+                 Berkeley},
+  year        = {2005},
+}
+
+@inproceedings{Seeger-Williams-Lawrence-2003,
+  author    = {M. Seeger and C. Williams and N. Lawrence},
+  title     = {Fast Forward Selection to Speed Up Sparse {G}aussian Process
+               Regression},
+  booktitle = {Workshop on AI and Statistics},
+  volume    = {9},
+  year      = {2003},
+}
+
+@techreport{Seeger2001,
+  author      = {M. Seeger},
+  title       = {Learning with labeled and unlabeled data},
+  institution = {Edinburgh University},
+  year        = {2001},
+}
+
+@inproceedings{seidl91p1,
+  author    = {D. R. Seidl and D. Lorenz},
+  title     = {A structure by which a recurrent neural network can
+               approximate a nonlinear dynamic system},
+  booktitle = ijcnn,
+  volume    = {2},
+  pages     = {709--714},
+  month     = jul,
+  year      = {1991},
+}
+
+@TechReport{Sejnowski+Rosenberg86,
+  author =       "T. J. Sejnowski and C. R. Rosenberg",
+  key =          "Sejnowski",
+  title =        "{NETtalk}: A parallel network that learns to read
+                 aloud",
+  number =       "86-01",
+  institution =  "Department of Electrical Engineering and Computer
+                 Science, Johns Hopkins University",
+  address =      "Baltimore, MD",
+  year =         "1986",
+}
+
+@article{Sejnowski86,
+  author  = {T. J. Sejnowski and P. K. Kienker and G. Hinton},
+  title   = {Learning Symmetry Groups with Hidden Units: Beyond the
+             Perceptron},
+  journal = physicaD,
+  volume  = {22},
+  pages   = {260--275},
+  year    = {1986},
+}
+
+@Article{Sejnowski87,
+  author =       "T. J. Sejnowski and C. R. Rosenberg",
+  title =        "Parallel Networks that Learn to Pronounce {English}
+                 Text",
+  journal =      cs,
+  volume =       "1",
+  pages =        "145--168",
+  year =         "1987",
+}
+
+@InProceedings{Seneff84,
+  author =       "S. Seneff",
+  booktitle =    icassp,
+  title =        "Pitch and spectral estimation of speech based on an
+                 auditory synchrony model",
+  year =         "1984",
+}
+
+@TechReport{Seneff85,
+  author =       "S. Seneff",
+  title =        "Pitch and spectral estimation of speech based on an
+                 auditory synchrony model",
+  type =         "{RLE} Technical Report",
+  number =       "504",
+  institution =  "Research Laboratory of Electronics, Massachusetts
+                 Institute of Technology",
+  address =      "Cambridge, MA",
+  year =         "1985",
+}
+
+@inproceedings{Seneff86,
+  author    = {S. Seneff},
+  title     = {A computational model for the peripheral auditory system:
+               application to speech recognition research},
+  booktitle = icassp,
+  pages     = {1983--1986},
+  year      = {1986},
+}
+
+@article{Seneff88,
+  author  = {S. Seneff},
+  title   = {A joint synchrony/mean-rate model of auditory speech
+             processing},
+  journal = {Journal of Phonetics},
+  volume  = {16},
+  pages   = {55--76},
+  year    = {1988},
+}
+
+@book{Seneta-81,
+  author    = {E. Seneta},
+  title     = {Nonnegative Matrices and {Markov} Chains},
+  publisher = {Springer},
+  address   = {New York},
+  year      = {1981},
+}
+
+@Article{senseval-2000,
+  author =       "Adam Kilgarriff and Joseph Rosenzweig",
+  title =        "Framework and results for {English} {SENSEVAL}",
+  journal =      "Computers and the Humanities: special issue on
+                 {SENSEVAL}",
+  volume =       "34",
+  pages =        "15--48",
+  year =         "2000",
+}
+
+@Article{Serbedzija-1996,
+  author =       "Nikola B. {\v{S}}erbed{\v{z}}ija",
+  title =        "Simulating Artificial Neural Networks on Parallel
+                 Architectures",
+  journal =      "Computer",
+  volume =       "29",
+  number =       "3",
+  publisher =    "IEEE Computer Society Press",
+  address =      "Los Alamitos, CA, USA",
+  pages =        "56--63",
+  year =         "1996",
+  ISSN =         "0018-9162",
+  doi =          "10.1109/2.485893",
+}
+
+@article{Serre2007,
+  author  = {T. Serre and G. Kreiman and M. Kouh and C. Cadieu and U.
+             Knoblich and T. Poggio},
+  title   = {A quantitative theory of immediate visual recognition},
+  journal = {Progress in Brain Research, Computational Neuroscience:
+             Theoretical Insights into Brain Function},
+  volume  = {165},
+  pages   = {33--56},
+  year    = {2007},
+}
+
+@article{Serre2007-small,
+  author  = {T. Serre and G. Kreiman and M. Kouh and C. Cadieu and U.
+             Knoblich and T. Poggio},
+  title   = {A quantitative theory of immediate visual recognition},
+  journal = {Progress in Brain Res., Comput. Neurosc.},
+  volume  = {165},
+  pages   = {33--56},
+  year    = {2007},
+}
+
+@article{Serre-Wolf-2007,
+  author = {Thomas Serre and Lior Wolf and Stanley Bileschi and Maximilian Riesenhuber and Tomaso Poggio},
+  title = {Robust Object Recognition with Cortex-Like Mechanisms},
+  journal = {IEEE Trans. Pattern Anal. Mach. Intell.},
+  volume = {29},
+  number = {3},
+  year = {2007},
+  issn = {0162-8828},
+  pages = {411--426},
+  doi = {10.1109/TPAMI.2007.56},
+  publisher = {IEEE Computer Society},
+  address = {Washington, DC, USA},
+}
+
+
+@INPROCEEDINGS{SeungS1998,
+    author = {H. Sebastian Seung},
+    title = {Learning continuous attractors in recurrent networks},
+    editor =       NIPS10ed,
+    booktitle =    NIPS10,
+    year = {1998},
+    pages = {654--660},
+    publisher = {MIT Press}
+}
+
+@INPROCEEDINGS{Jain-Seung-08,
+    author = {Viren Jain and H. Sebastian Seung},
+    title = {Natural Image Denoising with Convolutional Networks},
+    editor =       NIPS21ed,
+    booktitle =    NIPS21,
+    year = {2009},
+}
+
+@inproceedings{Sha+Saul-2005,
+    author = {Fei Sha and Lawrence K. Saul},
+    title = {Analysis and extension of spectral methods for nonlinear dimensionality reduction},
+    booktitle = {Proceedings of the 22nd International Conference on Machine Learning},
+    year = {2005},
+    isbn = {1-59593-180-5},
+    pages = {784--791},
+    location = {Bonn, Germany},
+    doi = {10.1145/1102351.1102450},
+    publisher = {ACM},
+    address = {New York, NY},
+}
+
+@article{Shannon-1949,
+  author  = {C. E. Shannon},
+  title   = {Communication in the presence of noise},
+  journal = {{Proceedings of the Institute of Radio Engineers}},
+  volume  = {37},
+  number  = {1},
+  pages   = {10--21},
+  year    = {1949},
+}
+
+@article{shapiro00lift,
+  author    = {Gregory Piatetsky-Shapiro and Sam Steingold},
+  title     = {Measuring lift quality in database marketing},
+  journal   = {SIGKDD Explor. Newsl.},
+  volume    = {2},
+  number    = {2},
+  pages     = {76--80},
+  publisher = {ACM Press},
+  address   = {New York, NY, USA},
+  year      = {2000},
+  issn      = {1931-0145},
+}
+
+@InProceedings{shardanand95,
+  author =       "Upendra Shardanand and Pattie Maes",
+  booktitle =    "CHI '95: Proceedings of the SIGCHI conference on Human
+                 factors in computing systems",
+  title =        {Social Information Filtering: Algorithms for
+                 Automating ``Word of Mouth''},
+  publisher =    "ACM Press/Addison-Wesley Publishing Co.",
+  pages =        "210--217",
+  year =         "1995",
+  location =     "Denver, Colorado, United States",
+}
+
+@article{Sharma-2000,
+  author  = {J. Sharma and A. Angelucci and M. Sur},
+  title   = {Induction of Visual Orientation Modules in Auditory Cortex},
+  journal = {Nature},
+  volume  = {404},
+  pages   = {841--847},
+  year    = {2000},
+}
+
+@article{Sharpe-64,
+  author  = {W. F. Sharpe},
+  title   = {Capital Asset Prices: {A} Theory of Market Equilibrium under
+             Conditions of Risk},
+  journal = {Journal of Finance},
+  volume  = {19},
+  pages   = {425--442},
+  year    = {1964},
+}
+
+@article{Sharpe-66,
+  author  = {W. F. Sharpe},
+  title   = {Mutual Fund Performance},
+  journal = {Journal of Business},
+  volume  = {39},
+  number  = {1},
+  pages   = {119--138},
+  year    = {1966},
+}
+
+@InProceedings{Shaw+Jebara-2007,
+  author =       "Blake Shaw and Tony Jebara",
+  booktitle =    aistats07,
+  title =        "Minimum Volume Embedding",
+  publisher =    "Omnipress",
+  date =         "March 21-24, 2007",
+  address =      "San Juan, Puerto Rico",
+  year =         "2007",
+}
+
+@inproceedings{Shawe-Taylor+Cristianini+Kandola-2002,
+  author    = {J. Shawe-Taylor and N. Cristianini and J. Kandola},
+  title     = {On the concentration of spectral properties},
+  editor    = NIPS14ed,
+  booktitle = NIPS14,
+  publisher = {{MIT} Press},
+  year      = {2002},
+}
+
+@inproceedings{Shawe-Taylor+Williams-2003,
+  author    = {J. Shawe-Taylor and C. K. I. Williams},
+  title     = {The Stability of Kernel Principal Components Analysis and
+               its Relation to the Process Eigenspectrum},
+  editor    = NIPS15ed,
+  booktitle = NIPS15,
+  publisher = {{MIT} Press},
+  year      = {2003},
+}
+
+@article{Shawe-Taylor98,
+  author  = {John Shawe-Taylor and Peter Bartlett and Robert Williamson
+             and Martin Anthony},
+  title   = {Structural Risk Minimization over Data-Dependent
+             Hierarchies},
+  journal = {IEEE Transactions on Information Theory},
+  volume  = {44},
+  number  = {5},
+  pages   = {1926--1940},
+  year    = {1998},
+}
+
+@article{Sherrington75,
+  author  = {D. Sherrington and S. Kirkpatrick},
+  title   = {Solvable Model of a Spin Glass},
+  journal = prl,
+  volume  = {35},
+  pages   = {1792--1796},
+  year    = {1975},
+}
+
+@Article{Shi+Malik-2000,
+  author =       "Jianbo Shi and Jitendra Malik",
+  title =        "Normalized Cuts and Image Segmentation",
+  journal =      "IEEE Transactions on Pattern Analysis and Machine
+                 Intelligence (PAMI)",
+  volume =       "22",
+  number =       "8",
+  pages =        "888--905",
+  year =         "2000",
+}
+
+@inproceedings{Shi+Malik-97,
+  author    = {J. Shi and J. Malik},
+  title     = {Normalized cuts and image segmentation},
+  booktitle = cvpr97,
+  pages     = {731--737},
+  year      = {1997},
+}
+
+@InProceedings{Shimohara88,
+  author =       "K. Shimohara and T. Uchiyama and Y. Tokunaga",
+  booktitle =    icnn,
+  title =        "Back-Propagation Networks for Event-Driven Temporal
+                 Sequence Processing",
+  volume =       "1",
+  publisher =    "IEEE, New York",
+  address =      "San Diego",
+  pages =        "665--672",
+  year =         "1988",
+}
+
+@inproceedings{Shimohata+al-1997,
+  author    = {Sayori Shimohata and Toshiyuki Sugio and Junji Nagata},
+  title     = {Retrieving Collocations by Co-occurrences and Word Order
+               Constraints},
+  booktitle = {Proceedings of the 35th Conference of the Association for
+               Computational Linguistics},
+  pages     = {476--481},
+  address   = {Madrid},
+  year      = {1997},
+}
+
+@InProceedings{shin:1991,
+  author =       "Yoan Shin and Joydeep Ghosh",
+  title =        "The Pi-Sigma Network: An Efficient Higher-Order Neural
+                 Network for Pattern Classification and Function
+                 Approximation",
+  crossref =     "IJCNN:1991",
+}
+@Proceedings{IJCNN:1991,
+  title =        "International Joint Conference on Neural Networks ({IJCNN})",
+  booktitle =    ijcnn,
+  address =      "Seattle, Washington, USA",
+  year =         "1991",
+}
+
+@Article{ShmulevichI2002,
+  author =       "Ilya Shmulevich and Wei Zhang",
+  title =        "Binary analysis and optimization-based normalization of
+                 gene expression data",
+  journal =      "Bioinformatics",
+  volume =       "18",
+  number =       "4",
+  pages =        "555--565",
+  year =         "2002",
+}
+
+@article{short81optimal,
+  author  = {R. D. Short and K. Fukunaga},
+  title   = {The optimal distance measure for nearest neighbor
+             classification},
+  journal = {IEEE Transactions on Information Theory},
+  volume  = {27},
+  pages   = {622--627},
+  year    = {1981},
+}
+
+@inproceedings{ShrikiO2001,
+  author    = {Oren Shriki and Haim Sompolinsky and Daniel D. Lee},
+  title     = {An Information Maximization Approach to Overcomplete and
+               Recurrent Representations},
+  editor    = NIPS13ed,
+  booktitle = NIPS13,
+  publisher = {{MIT} Press},
+  pages     = {933--938},
+  year      = {2001},
+}
+
+@inproceedings{ShrikiO2001-small,
+  author    = {Oren Shriki and Haim Sompolinsky and Daniel D. Lee},
+  title     = {An Information Maximization Approach to Overcomplete and
+               Recurrent Representations},
+  booktitle = {NIPS 13},
+  year      = {2001},
+}
+
+@article{Shumway82,
+  author  = {R. H. Shumway and D. S. Stoffer},
+  title   = {An approach to time series smoothing and forecasting using
+             the {EM} algorithm},
+  journal = {Journal of Time Series Analysis},
+  volume  = {3},
+  number  = {4},
+  pages   = {253--264},
+  year    = {1982},
+}
+
+@article{Shumway91,
+  author  = {R. H. Shumway and D. S. Stoffer},
+  title   = {Dynamic linear models with switching},
+  journal = {J. Amer. Stat. Assoc.},
+  volume  = {86},
+  pages   = {763--769},
+  year    = {1991},
+}
+
+@Article{Sichel91,
+  author =       "D. E. Sichel",
+  title =        "Business cycle duration dependence: a parametric
+                 approach",
+  journal =      "Review of Economics and Statistics",
+  volume =       "73",
+  pages =        "254--260",
+  year =         "1991",
+}
+
+@TechReport{Siegelmann92,
+  author =       "H. T. Siegelmann and E. D. Sontag",
+  title =        "Neural Networks with Real Weights: Analog Computational
+                 Complexity",
+  number =       "SYCON-92-05",
+  institution =  "Rutgers Center for System and Control",
+  address =      "New Brunswick, NJ",
+  month =        sep,
+  year =         "1992",
+}
+
+@InProceedings{Sietsma88,
+  author =       "J. Sietsma and R. J. F. Dow",
+  booktitle =    icnn,
+  title =        "Neural Net Pruning---Why and How",
+  volume =       "1",
+  publisher =    "IEEE, New York",
+  address =      "San Diego",
+  pages =        "325--333",
+  year =         "1988",
+}
+
+@inproceedings{silver95,
+  author    = {Daniel L. Silver and Robert E. Mercer},
+  title     = {Toward a Model of Consolidation: The Retention and Transfer
+               of Neural Net Task Knowledge},
+  booktitle = {Proceedings of the INNS World Congress on Neural
+               Networks},
+  volume    = {3},
+  pages     = {164--169},
+  address   = {Washington, DC},
+  month     = jul,
+  year      = {1995},
+}
+
+@article{silver96,
+  author  = {Daniel L. Silver and Robert E. Mercer},
+  title   = {The Parallel Transfer of Task Knowledge Using Dynamic
+             Learning Rates Based on a Measure of Relatedness},
+  journal = {Connection Science, Special issue on Transfer in Inductive
+             Systems},
+  volume  = {8},
+  number  = {2},
+  pages   = {277--294},
+  year    = {1996},
+}
+
+@techreport{silver97,
+  author      = {Daniel L. Silver and Robert E. Mercer and Gilbert A.
+                 Hurwitz},
+  title       = {The Functional Transfer of Knowledge for Coronary Artery
+                 Disease Diagnosis},
+  number      = {513},
+  institution = {Department of Computer Science, University of Western
+                 Ontario},
+  month       = jan,
+  year        = {1997},
+}
+
+@InCollection{Silverman-encyc86,
+  author =       "B. W. Silverman",
+  editor =       "N. L. Johnson and S. Kotz",
+  booktitle =    "Encyclopaedia of Statistical Sciences",
+  title =        "Penalized Likelihood",
+  volume =       "6",
+  publisher =    "Wiley",
+  address =      "New York",
+  pages =        "664--667",
+  year =         "1986",
+}
+
+@book{Silverman86,
+  author    = {Bernard W. Silverman},
+  title     = {Density Estimation for Statistics and Data Analysis},
+  publisher = {Chapman and Hall},
+  address   = {London},
+  year      = {1986},
+}
+
+@inproceedings{Silverman88,
+  author    = {R. H. Silverman and A. S. Noetzel},
+  title     = {Time-Sequential Self-Organization of Hierarchical Neural
+               Networks},
+  editor    = nips87ed,
+  booktitle = nips87,
+  publisher = {American Institute of Physics, New York},
+  address   = {Denver, CO},
+  pages     = {709--714},
+  year      = {1988},
+}
+
+@InProceedings{simard-03,
+  author =       "P. Y. Simard and D. Steinkraus and J. C. Platt",
+  booktitle =    ICDAR03,
+  title =        "Best Practices for Convolutional Neural Networks
+                 Applied to Visual Document Analysis",
+  year =         "2003",
+  isbn =         {0-7695-1960-1},
+  pages =        {958},
+  publisher =    {IEEE Computer Society},
+  address =      {Washington, DC, USA},
+  doi =          "10.1109/ICDAR.2003.1227801",
+}
+
+@InProceedings{Simard89,
+  author =       "P. Y. Simard and M. B. Ottaway and D. H. Ballard",
+  editor =       "D. Touretzky and G. Hinton and T. Sejnowski",
+  booktitle =    cmss88,
+  title =        "Analysis of Recurrent Backpropagation",
+  publisher =    "Morgan Kaufmann, San Mateo",
+  address =      "Pittsburgh 1988",
+  pages =        "103--112",
+  year =         "1989",
+}
+
+@InProceedings{Simard92,
+  author =       "Patrice Simard and Bernard Victorri and Yann LeCun
+                 and John Denker",
+  editor =       NIPS4ed,
+  booktitle =    NIPS4,
+  title =        "Tangent Prop - {A} formalism for specifying selected
+                 invariances in an adaptive network",
+  publisher =    "Morgan Kaufmann",
+  address =      "San Mateo, CA",
+  pages =        "895--903",
+  year =         "1992",
+}
+
+@InProceedings{Simard93,
+  author =       "P. Y. Simard and Y. {LeCun} and J. Denker",
+  editor =       NIPS5ed,
+  booktitle =    NIPS5,
+  title =        "Efficient pattern recognition using a new
+                 transformation distance",
+  publisher =    "Morgan Kaufmann, San Mateo",
+  pages =        "50--58",
+  year =         "1993",
+}
+
+@Article{Simard98,
+  author =       "P. Y. Simard and Y. A. {LeCun} and J. S. Denker and B.
+                 Victorri",
+  title =        "Transformation Invariance in Pattern Recognition ---
+                 Tangent Distance and Tangent Propagation",
+  journal =      "Lecture Notes in Computer Science",
+  volume =       "1524",
+  year =         "1998",
+  CODEN =        "LNCSD9",
+  ISSN =         "0302-9743",
+  bibdate =      "Tue Jan 5 08:21:58 1999",
+  acknowledgement = ack-nhfb,
+  OPTpages =     "239--??",
+}
+
+@InProceedings{Simard-nips92,
+  author =       "P. Simard and Y. {LeCun}",
+  editor =       NIPS4ed,
+  booktitle =    NIPS4,
+  title =        "Reverse {TDNN}: An Architecture for Trajectory
+                 Generation",
+  publisher =    "Morgan Kaufmann, San Mateo",
+  address =      "Denver, CO",
+  pages =        "579--588",
+  year =         "1992",
+}
+
+@PhdThesis{Simard-PhD,
+  author =       "P. Y. Simard",
+  title =        "Learning State Space Dynamics in Recurrent Networks",
+  school =       "University of Rochester",
+  address =      "Rochester, NY",
+  year =         "1991",
+  note =         "Tech. Rep. 383",
+}
+
+@Article{Simic90,
+  author =       "P. D. Simic",
+  title =        "Statistical Mechanics As the Underlying Theory of
+                 ``Elastic'' and ``Neural'' Optimizations",
+  journal =      network,
+  volume =       "1",
+  pages =        "89--103",
+  year =         "1990",
+}
+
+@article{Simoncelli+al-1992,
+    author = "Eero P. Simoncelli and William T. Freeman and Edward H. Adelson and David J. Heeger",
+    title = "Shiftable Multi-scale Transforms",
+    journal = "IEEE Transactions on Information Theory",
+    volume = "38",
+    number = "2",
+    year = "1992",
+    publisher = "The IEEE Computer Society",
+}
+
+@InProceedings{Simoncelli97,
+  author =       "E. P. Simoncelli",
+  booktitle =    "Proc. 31st Asilomar Conference on Signals, Systems and
+                 Computers",
+  title =        "Statistical Models for Images: Compression,
+                 Restoration and Synthesis",
+  publisher =    "IEEE",
+  year =         "1997",
+}
+
+@InProceedings{Simoncelli99,
+  author =       "E. P. Simoncelli",
+  booktitle =    "Proc. SPIE, 44th annual meeting",
+  title =        "Modeling the Joint Statistics of Images in the Wavelet
+                 Domain",
+  volume =       "3813",
+  publisher =    "SPIE",
+  year =         "1999",
+}
+
+@Article{Sinex+Geisler83,
+  author =       "D. G. Sinex and C. D. Geisler",
+  title =        "Response of auditory nerve fibers to consonant-vowel
+                 syllables",
+  journal =      jasa,
+  volume =       "73",
+  number =       "2",
+  pages =        "602--615",
+  year =         "1983",
+}
+
+@Article{Singer,
+  author =       "A. Singer",
+  title =        "Implementations of Artificial Neural Networks on the
+                 Connection Machine",
+  journal =      "Parallel Computing",
+  volume =       "14",
+  pages =        "305--315",
+  year =         "1990",
+  OPTnote =      "",
+}
+
+@InProceedings{Singer-1990,
+  author =       "Alexander Singer",
+  booktitle =    "Proceedings of the International Neural Networks
+                 Conference",
+  title =        "Exploiting the Inherent Parallelism of Artificial
+                 Neural Networks to Achieve 1300 Million Interconnects
+                 per Second",
+  pages =        "656--660",
+  year =         "1990",
+}
+
+@InProceedings{singer00leveraged,
+  author =       "Y. Singer",
+  editor =       NIPS12ed,
+  booktitle =    NIPS12,
+  title =        "Leveraged vector machines",
+  publisher =    "MIT Press",
+  address =      "Cambridge, MA",
+  pages =        "610--616",
+  year =         "2000",
+}
+
+@InProceedings{Singer96,
+  author =       "Y. Singer",
+  editor =       NIPS8ed,
+  booktitle =    NIPS8,
+  title =        "Adaptive Mixtures of Probabilistic Transducers",
+  publisher =    "MIT Press, Cambridge, MA",
+  year =         "1996",
+}
+
+@Article{Singer97,
+  author =       "Y. Singer",
+  title =        "Adaptive Mixtures of Probabilistic Transducers",
+  journal =      "Neural Computation",
+  volume =       "9",
+  number =       "8",
+  year =         "1997",
+}
+
+@InProceedings{singer:1996:nips,
+  author =       "Y. Singer",
+  editor =       NIPS8ed,
+  booktitle =    NIPS8,
+  title =        "Adaptive Mixtures of Probabilistic Transducers",
+  publisher =    "MIT Press, Cambridge, MA",
+  year =         "1996",
+}
+
+@InProceedings{Singh92,
+  author =       "S. P. Singh",
+  booktitle =    "Proceedings of the 10th National Conference on
+                 Artificial Intelligence",
+  title =        "Reinforcement learning with a hierarchy of abstract
+                 models",
+  publisher =    "MIT/AAAI Press",
+  pages =        "202--207",
+  year =         "1992",
+}
+
+@InProceedings{SinkkonenJ2002,
+  author =       "Janne Sinkkonen and Samuel Kaski and Janne
+                 Nikkil{\"{a}}",
+  booktitle =    ECML02,
+  title =        "Discriminative Clustering: Optimal Contingency Tables
+                 by Learning Metrics",
+  publisher =    "Springer-Verlag",
+  address =      "London, UK",
+  pages =        "418--430",
+  year =         "2002",
+  ISBN =         "3-540-44036-4",
+}
+
+@TechReport{Sirat90,
+  author =       "J.-A. Sirat and J.-P. Nadal",
+  title =        "Neural Trees: {A} New Tool for Classification",
+  type =         "Preprint",
+  institution =  "Laboratoires d'Electronique Philips",
+  address =      "Limeil-Br{\'e}vannes, France",
+  year =         "1990",
+}
+
+@InProceedings{SiroshJ1994,
+  author =       "Joseph Sirosh and Risto Miikkulainen",
+  editor =       NIPS6ed,
+  booktitle =    NIPS6,
+  title =        "Ocular Dominance and Patterned Lateral Connections in
+                 a Self-Organizing Model of the Primary Visual Cortex",
+  publisher =    "Morgan Kaufmann",
+  pages =        "109--116",
+  year =         "1994",
+}
+
+@InProceedings{SiroshJ1994-small,
+  author =       "J. Sirosh and R. Miikkulainen",
+  booktitle =    "NIPS 6",
+  title =        "Ocular Dominance and Patterned Lateral Connections in
+                 a Self-Organizing Model of the Primary Visual Cortex",
+  year =         "1994",
+}
+
+@InProceedings{Sivilotti87,
+  author =       "M. A. Sivilotti and M. A. Mahowald and C. A. Mead",
+  editor =       "P. Losleben",
+  booktitle =    "Advanced Research in VLSI: Proceedings of the 1987
+                 Stanford Conference",
+  title =        "Real-Time Visual Computations Using Analog {CMOS}
+                 Processing Arrays",
+  publisher =    "MIT Press, Cambridge",
+  pages =        "295--312",
+  year =         "1987",
+}
+
+@TechReport{Sjoberg92,
+  author =       "Jonas Sj{\"o}berg and Lennart Ljung",
+  title =        "Overtraining, Regularization, and Searching for
+                 Minimum in Neural Networks",
+  institution =  "Link{\"o}ping University",
+  address =      "S-581 83 Link{\"o}ping, Sweden",
+  year =         "1992",
+}
+
+@article{Sjoberg95,
+  title={{Overtraining, regularization and searching for a minimum, with application to neural networks}},
+  author={Sj{\"o}berg, J. and Ljung, L.},
+  journal={International Journal of Control},
+  volume={62},
+  number={6},
+  pages={1391--1407},
+  year={1995},
+  publisher={Taylor \& Francis}
+}
+
+@Article{Skinner1958,
+  author =       "Burrhus F. Skinner",
+  title =        "Reinforcement Today",
+  journal =      "American Psychologist",
+  volume =       "13",
+  pages =        "94--99",
+  year =         "1958",
+}
+
+@PhdThesis{Small1980,
+  author =       "Steven L. Small",
+  title =        "Word Expert Parsing: {A} Theory of Distributed
+                 Word-Based Natural Language Understanding",
+  school =       "University of Maryland",
+  year =         "1980",
+}
+
+@Article{smilde97,
+  author =       "A. K. Smilde",
+  title =        "Comments on multilinear {PLS}",
+  journal =      "Journal of Chemometrics",
+  volume =       "11",
+  pages =        "367--377",
+  year =         "1997",
+}
+
+@Article{Smith+Waterman81,
+  author =       "T. F. Smith and M. S. Waterman",
+  title =        "Identification of common molecular subsequences",
+  journal =      "Journal of Molecular Biology",
+  volume =       "147",
+  pages =        "195--197",
+  year =         "1981",
+}
+
+@Article{Smith95,
+  author =       "S. P. Smith",
+  title =        "Differentiation of the Cholesky algorithm",
+  journal =      "Journal of Computational and Graphical Statistics",
+  volume =       "4",
+  pages =        "134--147",
+  year =         "1995",
+}
+
+@InProceedings{smola00sparsegreedy,
+  author =       "A. J. Smola and B. Sch{\"o}lkopf",
+  editor =       "P. Langley",
+  booktitle =    "International Conference on Machine Learning",
+  title =        "Sparse greedy matrix approximation for machine
+                 learning",
+  publisher =    "Morgan Kaufmann",
+  address =      "San Francisco",
+  pages =        "911--918",
+  year =         "2000",
+}
+
+@InProceedings{Smola2000sparsegreedy,
+  author =       "A. J. Smola and P. Bartlett",
+  editor =       NIPS13ed,
+  booktitle =    NIPS13,
+  title =        "Sparse Greedy {G}aussian Process Regression",
+  year =         "2001",
+}
+
+@InProceedings{Smola99semiparametricSVM,
+  author =       "A. J. Smola and T. Friess and B. Sch{\"o}lkopf",
+  editor =       NIPS11ed,
+  booktitle =    NIPS11,
+  title =        "Semiparametric Support Vector and Linear Programming
+                 Machines",
+  publisher =    "MIT Press",
+  pages =        "585--591",
+  year =         "1999",
+  OPTaddress =   "Cambridge, MA",
+  OPTannote =    "",
+  OPTcrossref =  "",
+  OPTkey =       "",
+  OPTmonth =     "",
+  OPTnote =      "",
+  OPTnumber =    "",
+  OPTorganization = "",
+  OPTseries =    "",
+}
+
+@InCollection{Smolensky86,
+  author =       "Paul Smolensky",
+  editor =       "D. E. Rumelhart and J. L. McClelland",
+  booktitle =    pdp,
+  title =        "Information Processing in Dynamical Systems:
+                 Foundations of Harmony Theory",
+  chapter =      "6",
+  volume =       "1",
+  publisher =    "MIT Press",
+  address =      "Cambridge",
+  pages =        "194--281",
+  year =         "1986",
+}
+
+@Article{Smyth94,
+  author =       "P. Smyth",
+  title =        {Hidden {Markov} models for fault detection in dynamic
+                 systems},
+  journal =      "Pattern Recognition",
+  volume =       "27",
+  number =       "1",
+  pages =        "149--164",
+  year =         "1994",
+}
+
+@Article{Smyth97,
+  author =       "P. Smyth and D. Heckerman and M. I. Jordan",
+  title =        {Probabilistic independence networks for hidden {Markov}
+                 probability models},
+  journal =      "Neural Computation",
+  volume =       "9",
+  number =       "2",
+  pages =        "227--269",
+  year =         "1997",
+}
+
+@InProceedings{Smyth97-nips,
+  author =       "P. Smyth",
+  editor =       NIPS9ed,
+  booktitle =    NIPS9,
+  title =        {Clustering sequences with hidden {Markov} models},
+  publisher =    "MIT Press",
+  year =         "1997",
+}
+
+@Article{Smyth98,
+  author =       "P. Smyth",
+  title =        {Belief Networks, Hidden {Markov} Models, and {Markov}
+                 Random Fields: a Unifying View},
+  journal =      "Pattern Recognition Letters",
+  year =         "1998",
+}
+
+@TechReport{Snapp+Venkatesh-1998,
+  author =       "Robert R. Snapp and Santosh S. Venkatesh",
+  title =        "Asymptotic derivation of the finite-sample risk of the
+                 k nearest neighbor classifier",
+  number =       "UVM-CS-1998-0101",
+  institution =  "Department of Computer Science, University of
+                 Vermont",
+  year =         "1998",
+}
+
+@InCollection{SNE-nips15,
+  author =       "G. E. Hinton and S. Roweis",
+  editor =       NIPS15ed,
+  booktitle =    NIPS15,
+  title =        "Stochastic Neighbor Embedding",
+  publisher =    "MIT Press",
+  address =      "Cambridge, MA",
+  year =         "2003",
+}
+
+@InProceedings{Snow+al-2006,
+  author =       "Rion Snow and Daniel Jurafsky and Andrew Y. Ng",
+  booktitle =    "Proceedings of COLING/ACL 2006",
+  title =        "Semantic taxonomy induction from heterogenous
+                 evidence",
+  year =         "2006",
+}
+
+@book{SocietyNeuro-2006,
+    author = "{Society for Neuroscience}",
+    title = "Brain Facts: A Primer on the Brain and Nervous System",
+    year = 2006,
+    edition = "Fifth",
+    note = "{http://sfn.org}",
+}
+
+@Article{Soffer86,
+  author =       "B. H. Soffer and G. J. Dunning and Y. Owechko and E.
+                 Marom",
+  title =        "Associative Holographic Memory with Feedback Using
+                 Phase-Conjugate Mirrors",
+  journal =      optlett,
+  volume =       "11",
+  pages =        "118--120",
+  year =         "1986",
+}
+
+@Article{Sola94,
+  author =       "M. Sola and J. Driffill",
+  title =        "Testing the term structure of interest rates using a
+                 stationary vector autoregression with regime
+                 switching",
+  journal =      "Journal of Economic Dynamics and Control",
+  volume =       "18",
+  pages =        "601--628",
+  year =         "1994",
+}
+
+@Article{Solla88,
+  author =       "S. A. Solla and E. Levin and M. Fleisher",
+  title =        "Accelerated Learning in Layered Neural Networks",
+  journal =      cs,
+  volume =       "2",
+  pages =        "625--639",
+  year =         "1988",
+}
+
+@InProceedings{Solla89,
+  author =       "S. A. Solla",
+  editor =       "L. Personnaz and G. Dreyfus",
+  booktitle =    "Neural Networks from Models to Applications",
+  title =        "Learning and Generalization in Layered Neural
+                 Networks: The Contiguity Problem",
+  publisher =    "I.D.S.E.T., Paris",
+  address =      "Paris 1988",
+  pages =        "168--177",
+  year =         "1989",
+}
+
+@Article{Solomonoff64,
+  author =       "Ray J. Solomonoff",
+  title =        "A formal theory of inductive inference",
+  journal =      "Information and Control",
+  volume =       "7",
+  pages =        "1--22, 224--254",
+  year =         "1964",
+}
+
+@Article{Sompolinsky86,
+  author =       "H. Sompolinsky and I. Kanter",
+  title =        "Temporal Association in Asymmetric Neural Networks",
+  journal =      prl,
+  volume =       "57",
+  pages =        "2861--2864",
+  year =         "1986",
+}
+
+@InProceedings{Sompolinsky87,
+  author =       "H. Sompolinsky",
+  editor =       "J. L. van Hemmen and I. Morgenstern",
+  booktitle =    "Heidelberg Colloquium on Glassy Dynamics",
+  title =        "The Theory of Neural Networks: The Hebb Rules and
+                 Beyond",
+  publisher =    "Springer-Verlag, Berlin",
+  address =      "Heidelberg 1986",
+  pages =        "485--527",
+  year =         "1987",
+}
+
+@Article{Sompolinsky88,
+  author =       "H. Sompolinsky and A. Crisanti and H. J. Sommers",
+  title =        "Chaos in Random Neural Networks",
+  journal =      prl,
+  volume =       "61",
+  pages =        "259--262",
+  year =         "1988",
+}
+
+@Article{Sondik73,
+  author =       "R. D. Smallwood and E. J. Sondik",
+  title =        "The optimal control of partially observable {Markov}
+                 processes over a finite horizon",
+  journal =      "Operations Research",
+  volume =       "21",
+  pages =        "1071--1088",
+  year =         "1973",
+}
+
+@Article{Sondik78,
+  author =       "E. J. Sondik",
+  title =        "The optimal control of partially observable {Markov}
+                 processes over the infinite horizon: discounted costs",
+  journal =      "Operations Research",
+  volume =       "26",
+  pages =        "282--304",
+  year =         "1978",
+}
+
+@article{Song+al-2008a,
+    author = {Yangqiu Song and Feiping Nie and Changshui Zhang},
+    title = {Semi-Supervised Sub-Manifold Discriminant Analysis},
+    journal = {Pattern Recognition Letters},
+    year = 2008,
+}
+
+@article{Song+al-2008b,
+    author = {Yangqiu Song and Feiping Nie and Changshui Zhang and Shiming Xiang},
+    title = {A Unified Framework for Semi-Supervised Dimensionality Reduction},
+    journal = {Pattern Recognition},
+    volume = 41,
+    number = 9,
+    pages = {2789--2799},
+    year = 2008,
+}
+
+@incollection{Song+al-2008c,
+    title = {Colored Maximum Variance Unfolding},
+    author = {Le Song and Alex Smola and Karsten Borgwardt and Arthur Gretton},
+    editor =       NIPS20ed,
+    booktitle =    NIPS20,
+    publisher = {MIT Press},
+    address = {Cambridge, MA},
+    pages = {1385--1392},
+    year = {2008}
+}
+
+@Article{Sontag-cs89,
+  author =       "E. D. Sontag and H. J. Sussmann",
+  title =        "Backpropagation Can Give Rise to Spurious Local Minima
+                 Even for Networks without Hidden Layers",
+  journal =      "Complex Systems",
+  volume =       "3",
+  pages =        "91--106",
+  year =         "1989",
+}
+
+@InProceedings{Sontag-ijcnn89,
+  author =       "E. D. Sontag and H. J. Sussmann",
+  booktitle =    ijcnn,
+  title =        "Backpropagation Separates when Perceptrons Do",
+  publisher =    "IEEE Press",
+  address =      "Washington DC",
+  year =         "1989",
+  OPTpages =     "639--642",
+}
+
+@TechReport{sontag92t1,
+  author =       "E. D. Sontag",
+  title =        "Systems Combining Linearity and Saturations and
+                 Relations to Neural Networks",
+  number =       "SYCON--92--01",
+  institution =  "Rutgers Center for Systems and Control",
+  year =         "1992",
+}
+
+@Article{Soukoulis83,
+  author =       "C. M. Soukoulis and K. Levin and G. S. Grest",
+  title =        "Irreversibility and Metastability in Spin-Glasses.
+                 {I}. Ising Model",
+  journal =      prB,
+  volume =       "28",
+  pages =        "1495--1509",
+  year =         "1983",
+}
+
+@Article{Specht90,
+  author =       "D. F. Specht",
+  title =        "Probabilistic Neural Networks",
+  journal =      nn,
+  volume =       "3",
+  pages =        "109--118",
+  year =         "1990",
+}
+
+@Article{Specht91,
+  author =       "D. F. Specht",
+  title =        "A General Regression Neural Network",
+  journal =      "IEEE Trans. Neural Networks",
+  volume =       "2",
+  number =       "6",
+  pages =        "568--576",
+  month =        nov,
+  year =         "1991",
+}
+
+@Article{Spiegelhalter93,
+  author =       "D. J. Spiegelhalter and A. P. Dawid and S. L.
+                 Lauritzen and R. G. Cowell",
+  title =        "Bayesian Analysis in Expert Systems",
+  journal =      "Statistical Science",
+  volume =       "8",
+  pages =        "219--283",
+  year =         "1993",
+}
+
+@InProceedings{Spielman-96,
+  author =       "D. Spielman and S. Teng",
+  booktitle =    "Proceedings of the 37th Annual Symposium on
+                 Foundations of Computer Science",
+  title =        "Spectral partitioning works: planar graphs and finite
+                 element meshes",
+  year =         "1996",
+}
+
+@TechReport{Spielman-96b,
+  author =       "Daniel A. Spielman and Shang-Hua Teng",
+  title =        "Spectral Partitioning Works: Planar Graphs and Finite
+                 Element Meshes",
+  number =       "UCB CSD-96-898",
+  institution =  "U.C. Berkeley",
+  year =         "1996",
+}
+
+@InProceedings{spirkovska:1990,
+    author={Spirkovska, L. and Reid, M. B.},
+    title={Connectivity Strategies for Higher-Order Neural Networks Applied to
+        Pattern Recognition},
+    booktitle=ijcnn,
+    year={1990},
+    month=jun,
+    volume={1},
+    number={},
+    pages={21--26},
+    keywords={computerised pattern recognition, neural nets, connection
+        strategies, higher-order neural networks, interconnections, pattern
+            recognition, pattern-recognition, regional connectivity},
+    doi={10.1109/IJCNN.1990.137538},
+    ISSN={},
+}
+
+
+@Book{Spirtes-book93,
+  author =       "P. Spirtes and C. Glymour and R. Scheines",
+  title =        "Causation, Prediction, and Search",
+  publisher =    "Springer-Verlag, New York",
+  year =         "1993",
+}
+
+@Article{Spirtes-Glymour91,
+  author =       "P. Spirtes and C. Glymour",
+  title =        "An algorithm for fast recovery of sparse causal
+                 graphs",
+  journal =      "Social Science Computer Review",
+  volume =       "9",
+  number =       "1",
+  pages =        "62--72",
+  year =         "1991",
+}
+
+@InProceedings{Srebro-Jaakkola,
+  author =       "N. Srebro and T. Jaakkola",
+  booktitle =    ICML03,
+  editor =       ICML03ed,
+  publisher =    ICML03publ,
+  title =        "Weighted Low-Rank Approximations",
+  address =      "Washington, D.C.",
+  pages =        "720--727",
+  year =         "2003",
+}
+
+@Book{SSL-Book-2006,
+  author =       "Olivier Chapelle and Bernhard Sch{\"{o}}lkopf and Alexander Zien",
+  title =        "Semi-Supervised Learning",
+  publisher =    "MIT Press",
+  address =      "Cambridge, MA",
+  year =         "2006",
+}
+
+@Article{Steels2003,
+  author =       "L. Steels",
+  title =        "Evolving grounded communication for robots",
+  journal =      "Trends in Cognitive Sciences",
+  volume =       "7",
+  number =       "7",
+  pages =        "308--312",
+  month =        jul,
+  year =         "2003",
+  URL =          "http://www.csl.sony.fr/downloads/papers/2003/steels-03c.pdf",
+}
+
+@Article{Steinbuch61,
+  author =       "K. Steinbuch",
+  title =        "Die Lernmatrix",
+  journal =      kyb,
+  volume =       "1",
+  pages =        "36--45",
+  year =         "1961",
+}
+
+@Article{SteinhausH1956,
+  author =       {Hugo Steinhaus},
+  title =        {Sur la division des corps mat{\'e}riels en parties},
+  journal =      {Bulletin de l'Acad{\'e}mie Polonaise des Sciences},
+  year =         {1956},
+  volume =       {4},
+  pages =        {801--804},
+}
+
+@InCollection{Stevens+Blumstein81,
+  author =       "K. N. Stevens and S. E. Blumstein",
+  editor =       "P. D. Eimas and J. L. Miller",
+  booktitle =    "Perspectives on the study of speech",
+  title =        "The search for invariant acoustic correlates of
+                 phonetic features",
+  publisher =    "Lawrence Erlbaum Associates",
+  pages =        "1--38",
+  year =         "1981",
+}
+
+@InCollection{Stevens75,
+  author =       "K. N. Stevens",
+  editor =       "G. Fant and M. A. Tatham",
+  booktitle =    "Auditory analysis and perception of speech",
+  title =        "The potential role of properties detectors in the
+                 perception of consonants",
+  publisher =    "Academic Press, London",
+  pages =        "303--330",
+  year =         "1975",
+}
+
+@Article{Stevenson90,
+  author =       "M. Stevenson and R. Winter and B. Widrow",
+  title =        "Sensitivity of Feedforward Neural Networks to Weight
+                 Errors",
+  journal =      "IEEE Trans. on Neural Networks",
+  volume =       "1",
+  number =       "1",
+  pages =        "71--80",
+  month =        mar,
+  year =         "1990",
+  keywords =     "neural network fault tolerance robustness reliability
+                 adaline weight errors",
+}
+
+@Book{Stewart-1998,
+  author =       "G. W. Stewart",
+  title =        "Matrix Algorithms, Volume {I}: Basic Decompositions",
+  publisher =    "SIAM",
+  address =      "Philadelphia",
+  year =         "1998",
+}
+
+@Book{Stewart73,
+  author =       "G. W. Stewart",
+  title =        "Introduction to matrix computations",
+  publisher =    "Academic Press",
+  year =         "1973",
+}
+
+@InProceedings{Stinchcombe+White89,
+  author =       "M. Stinchcombe and H. White",
+  booktitle =    ijcnn,
+  title =        "Universal approximation using feedforward networks
+                 with non-sigmoid hidden layer activation function",
+  publisher =    "IEEE",
+  address =      "Washington DC",
+  pages =        "613--617",
+  year =         "1989",
+}
+
+@TechReport{Stokbro90,
+  author =       "K. Stokbro and D. K. Umberger and J. A. Hertz",
+  title =        "Exploiting Neurons with Localized Receptive Fields to
+                 Learn Chaos",
+  type =         "Preprint",
+  number =       "90/28 S",
+  institution =  "Nordita",
+  address =      "Copenhagen, Denmark",
+  year =         "1990",
+}
+
+@InProceedings{Stolcke-ICSLP02,
+  author =       "A. Stolcke",
+  booktitle =    "Proceedings of the International Conference on
+                 Spoken Language Processing",
+  title =        "{SRILM} --- An extensible language modeling toolkit",
+  address =      "Denver, Colorado",
+  year =         "2002",
+}
+
+@InProceedings{Stolcke93,
+  author =       "A. Stolcke and S. Omohundro",
+  editor =       NIPS5ed,
+  booktitle =    NIPS5,
+  title =        "Hidden {Markov} model induction by {Bayesian} model
+                 merging",
+  publisher =    "Morgan Kaufmann",
+  address =      "San Mateo, CA",
+  pages =        "11--18",
+  year =         "1993",
+}
+
+@TechReport{Stolcke94a,
+  author =       "A. Stolcke and S. M. Omohundro",
+  title =        "Best-first Model Merging for Hidden {Markov} Model
+                 Induction",
+  number =       "TR-94-003",
+  institution =  "International Computer Science Institute",
+  address =      "Berkeley, CA",
+  month =        jan,
+  year =         "1994",
+}
+
+@TechReport{Stolcke94b,
+  author =       "A. Stolcke and J. Segal",
+  title =        "Precise n-gram Probabilities from Stochastic
+                 Context-free Grammars",
+  number =       "TR-94-007",
+  institution =  "International Computer Science Institute",
+  address =      "Berkeley, CA",
+  month =        jan,
+  year =         "1994",
+}
+
+@Article{Stone-80,
+  author =       "C. J. Stone",
+  title =        "Optimal rates of convergence for nonparametric
+                 estimators",
+  journal =      "Annals of Statistics",
+  volume =       "8",
+  number =       "6",
+  pages =        "1348--1360",
+  year =         "1980",
+}
+
+@Article{Stormo82,
+  author =       "G. D. Stormo and T. D. Schneider and L. Gold and A.
+                 Ehrenfeucht",
+  title =        "Use of the perceptron algorithm to distinguish
+                 translational initiation sites in {{\it E. coli}}",
+  journal =      "Nucleic Acids Research",
+  volume =       "10",
+  pages =        "2997--3010",
+  year =         "1982",
+}
+
+@InProceedings{Stornetta88,
+  author =       "W. S. Stornetta and T. Hogg and B. A. Huberman",
+  editor =       nips87ed,
+  booktitle =    nips87,
+  title =        "A Dynamical Approach to Temporal Pattern Processing",
+  publisher =    "American Institute of Physics, New York",
+  address =      "Denver, CO",
+  pages =        "750--759",
+  year =         "1988",
+}
+
+@Book{Strang80,
+  author =       "G. Strang",
+  title =        "Linear Algebra and Its Applications",
+  publisher =    "Academic Press",
+  address =      "New York",
+  year =         "1980",
+}
+
+@PhdThesis{Suaudeau94,
+  author =       "N. Suaudeau",
+  title =        "Un mod\`ele probabiliste pour int\'egrer la dimension
+                 temporelle dans un syst\`eme de reconnaissance
+                 automatique de la parole",
+  school =       "Universit\'e de Rennes I",
+  address =      "France",
+  year =         "1994",
+}
+
+@Article{suddarth91,
+  author =       "Steven C. Suddarth and Alistair D. C. Holden",
+  title =        "Symbolic-neural systems and the use of hints for
+                 developing complex systems",
+  journal =      "Int. J. Man-Mach. Stud.",
+  volume =       "35",
+  number =       "3",
+  publisher =    "Academic Press Ltd.",
+  address =      "London, UK",
+  pages =        "291--311",
+  year =         "1991",
+}
+
+@article{Sudderth-2007,
+ author = {Erik B. Sudderth and Antonio Torralba and William T. Freeman and Alan S. Willsky},
+ title = {Describing visual scenes using transformed objects and parts},
+ journal = {Int. Journal of Computer Vision},
+ volume = 77,
+ publisher = {Springer},
+ pages = "291--330",
+ year = "2007",
+}
+
+@article{Sugiyama-2007,
+    author = {Masashi Sugiyama},
+    title = {Dimensionality reduction of multimodal labeled data by local {F}isher discriminant analysis},
+    journal = jmlr,
+    year = {2007},
+    volume = {8},
+    pages = {1027--1061}
+}
+
+@InProceedings{Sun-ijcnn90,
+  author =       "G. Z. Sun and H. H. Chen and Y. C. Lee and C. L.
+                 Giles",
+  booktitle =    ijcnn,
+  title =        "Recurrent Neural Networks, Hidden {Markov} Models and
+                 Stochastic Grammars",
+  volume =       "I",
+  address =      "San Diego CA",
+  pages =        "729--734",
+  year =         "1990",
+}
+
+@Book{Sundararajan+Saratchandran-1998,
+  author =       "N. Sundararajan and P. Saratchandran",
+  title =        "Parallel Architectures for Artificial Neural Networks:
+                 Paradigms and Implementations",
+  publisher =    "IEEE Computer Society Press",
+  address =      "Los Alamitos, CA",
+  year =         "1998",
+  ISBN =         "0-8186-8399-6",
+}
+
+@InProceedings{Sutskever+Hinton-2007,
+  author =       "Ilya Sutskever and Geoffrey E. Hinton",
+  booktitle =    aistats07,
+  title =        "Learning Multilevel Distributed Representations for
+                 High-Dimensional Sequences",
+  publisher =    "Omnipress",
+  date =         "March 21-24, 2007",
+  address =      "San Juan, Puerto Rico",
+  year =         "2007",
+}
+
+@Article{Sutskever+Hinton-2008,
+  author =       "Ilya Sutskever and Geoffrey E. Hinton",
+  title =        "Deep Narrow Sigmoid Belief Networks are Universal Approximators",
+  journal =      "Neural Computation",
+  volume =       "20",
+  pages =        "2629--2636",
+  year =         "2008",
+}
+
+@Book{Sutton+Barto-98,
+  author =       "Richard Sutton and Andrew Barto",
+  title =        "Reinforcement Learning: An Introduction",
+  publisher =    "MIT Press",
+  year =         "1998",
+}
+
+@InCollection{sutton06introduction,
+  author =       "Charles Sutton and Andrew McCallum",
+  editor =       "Lise Getoor and Ben Taskar",
+  booktitle =    "Introduction to Statistical Relational Learning",
+  title =        "An Introduction to Conditional Random Fields for
+                 Relational Learning",
+  publisher =    "MIT Press",
+  year =         "2006",
+  note =         "",
+  URL =          "publications/crf-tutorial.pdf",
+  tags =         "recent",
+}
+
+@PhdThesis{Sutton84,
+  author =       "R. S. Sutton",
+  title =        "Temporal Credit Assignment in Reinforcement Learning",
+  school =       "University of Massachusetts",
+  address =      "Amherst",
+  year =         "1984",
+}
+
+@article{Sutton88,
+  author  = {R. S. Sutton},
+  title   = {Learning to Predict by the Methods of Temporal Differences},
+  journal = mlearn,
+  volume  = {3},
+  pages   = {9--44},
+  year    = {1988},
+}
+
+@incollection{Sutton91,
+  author    = {R. S. Sutton and A. G. Barto},
+  editor    = {M. Gabriel and J. W. Moore},
+  title     = {Time Derivative Models of Pavlovian Reinforcement},
+  booktitle = {Learning and Computational Neuroscience},
+  publisher = {MIT Press},
+  address   = {Cambridge},
+  year      = {1991},
+}
+
+@inproceedings{Sutton95,
+  author    = {R. S. Sutton},
+  title     = {{TD} models: modeling the world at a mixture of time scales},
+  booktitle = {Proceedings of the 12th International Conference on Machine Learning},
+  publisher = {Morgan Kaufmann},
+  year      = {1995},
+}
+
+@inproceedings{Szu86,
+  author    = {H. Szu},
+  editor    = {J. S. Denker},
+  title     = {Fast Simulated Annealing},
+  booktitle = snowbird,
+  pages     = {420--425},
+  publisher = {American Institute of Physics, New York},
+  address   = {Snowbird 1986},
+  year      = {1986},
+}
+
+@inproceedings{Szummer+Jaakkola-2002,
+  author    = {M. Szummer and T. Jaakkola},
+  editor    = NIPS14ed,
+  title     = {Partially labeled classification with Markov random walks},
+  booktitle = NIPS14,
+  publisher = {{MIT} Press},
+  address   = {Cambridge, MA},
+  year      = {2002},
+}
+
+
+@Article{Takabatake+al-2007,
+  author =       "Hiroki Takabatake and Manabu Kotani and Seiichi Ozawa",
+  title =        "Feature extraction by supervised independent component
+                 analysis based on category information",
+  journal =      "Electrical Engineering in Japan",
+  volume =       "161",
+  number =       "2",
+  pages =        "25--32",
+  year =         "2007",
+}
+
+@InProceedings{TakahashiN2001,
+  author =       "Naoto Takahashi and Minoru Motoki and Yoshio Shimazu
+                 and Yoichi Tomiura and Toru Hitaka",
+  booktitle =    "Proceedings of the Second Workshop on Natural Language
+                 Processing and Neural Networks",
+  title =        "{PP}-attachment Ambiguity Resolution Using a Neural
+                 Network with Modified {FGREP} Method",
+  address =      "Tokyo",
+  year =         "2001",
+}
+
+@InProceedings{Takens81,
+  author =       "F. Takens",
+  editor =       "D. A. Rand and L.-S. Young",
+  booktitle =    "Dynamical Systems and Turbulence",
+  title =        "Detecting Strange Attractors In Turbulence",
+  volume =       "898",
+  publisher =    "Springer-Verlag, Berlin",
+  address =      "Warwick 1980",
+  pages =        "366--381",
+  year =         "1981",
+  series =       "Lecture Notes in Mathematics",
+}
+
+@article{Takeuchi79,
+  author  = {A. Takeuchi and S. Amari},
+  title   = {Formation of Topographic Maps and Columnar Microstructures in Nerve Fields},
+  journal = biocyb,
+  volume  = {35},
+  pages   = {63--72},
+  year    = {1979},
+}
+
+@InCollection{Tam+Perkel89,
+  author =       "D. C. Tam and D. H. Perkel",
+  editor =       "R. D. Hawkins and G. H. Bower",
+  booktitle =    "Computational Models of Learning in Simple Neural
+                 Systems",
+  title =        "Quantitative modeling of synaptic plasticity",
+  publisher =    "Academic Press",
+  pages =        "1--30",
+  year =         "1989",
+}
+
+@article{Tank86,
+  author  = {D. W. Tank and J. J. Hopfield},
+  title   = {Simple ``Neural'' Optimization Networks: An {A}/{D} Converter, Signal Decision Circuit, and a Linear Programming Circuit},
+  journal = ieeetcas,
+  volume  = {33},
+  pages   = {533--541},
+  year    = {1986},
+}
+
+@article{Tank87a,
+  author  = {D. W. Tank and J. J. Hopfield},
+  title   = {Neural Computation by Time Compression},
+  journal = PNAS,
+  volume  = {84},
+  pages   = {1896--1900},
+  year    = {1987},
+}
+
+@inproceedings{Tank87b,
+  author    = {D. W. Tank and J. J. Hopfield},
+  editor    = {M. Caudill and C. Butler},
+  title     = {Concentrating Information in Time: Analog Neural Networks with Applications to Speech Recognition Problems},
+  booktitle = icnn,
+  volume    = {2},
+  pages     = {455--468},
+  publisher = {IEEE, New York},
+  address   = {San Diego 1987},
+  year      = {1987},
+}
+
+@book{Tanner1993,
+  author    = {M. Tanner},
+  title     = {Tools for statistical inference: Methods for exploration of posterior distributions and likelihood functions},
+  publisher = {Springer},
+  address   = {New York},
+  year      = {1993},
+}
+
+@Article{Tappert90,
+  author =       "C. Tappert and C. Suen and T. Wakahara",
+  title =        "The state of the art in on-line handwriting
+                 recognition",
+  journal =      ieeetpami,
+  volume =       "12",
+  number =       "8",
+  pages =        "787--808",
+  year =         "1990",
+}
+
+@incollection{Taylor+2007,
+  author    = {Graham Taylor and Geoffrey E. Hinton and Sam Roweis},
+  editor    = NIPS19ed,
+  title     = {Modeling Human Motion Using Binary Latent Variables},
+  booktitle = NIPS19,
+  pages     = {1345--1352},
+  publisher = {MIT Press},
+  address   = {Cambridge, MA},
+  year      = {2007},
+}
+
+%%FRED: I deprecate this one because the year in the key is that of the conference, not of the publication!
+@inproceedings{Taylor2006,
+  author    = {Graham Taylor and Geoffrey E. Hinton and Sam Roweis},
+  editor    = NIPS19ed,
+  title     = {Modeling Human Motion Using Binary Latent Variables},
+  booktitle = NIPS19,
+  pages     = {1345--1352},
+  publisher = {MIT Press},
+  address   = {Cambridge, MA},
+  year      = {2007},
+}
+
+% Abbreviated variant of Taylor+2007; the paper appeared in the NIPS 19
+% proceedings (conference held in 2006).
+@InProceedings{Taylor2006-small,
+  author =       "Graham Taylor and Geoffrey E. Hinton and Sam Roweis",
+  booktitle =    "NIPS 19",
+  title =        "Modeling Human Motion Using Binary Latent Variables",
+  year =         "2006",
+}
+
+@InProceedings{TaylorHintonICML2009,
+  author =    {Graham Taylor and Geoffrey Hinton},
+  title =     {Factored Conditional Restricted {Boltzmann} Machines for Modeling Motion Style},
+  booktitle = {Proceedings of the 26th International Conference on Machine Learning (ICML'09)},
+  pages =     {1025--1032},
+  year =      2009,
+  editor =    {L{\'e}on Bottou and Michael Littman},
+  address =   {Montreal},
+  month =     jun,
+  publisher = {Omnipress}
+}
+
+@InProceedings{Taylor56,
+  author =       "W. K. Taylor",
+  editor =       "C. Cherry",
+  booktitle =    "Information Theory",
+  title =        "Electrical Simulation of Some Nervous System
+                 Functional Activities",
+  publisher =    "Butterworths, London",
+  address =      "London 1955",
+  pages =        "314--328",
+  year =         "1956",
+}
+
+@inproceedings{Tebelskis91,
+  author    = {J. Tebelskis and A. Waibel and B. Petek and O. Schmidbauer},
+  editor    = NIPS3ed,
+  title     = {Continuous Speech Recognition Using Linked Predictive Networks},
+  booktitle = NIPS3,
+  pages     = {199--205},
+  publisher = {Morgan Kaufmann, San Mateo},
+  address   = {Denver, CO},
+  year      = {1991},
+}
+
+@Article{Teh-2003,
+  author =       "{Yee Whye} Teh and Max Welling and Simon Osindero and
+                 Geoffrey E. Hinton",
+  title =        "Energy-Based Models for Sparse Overcomplete
+                 Representations",
+  journal =      jmlr,
+  volume =       "4",
+  pages =        "1235--1260",
+  year =         "2003",
+}
+
+@inproceedings{Teh-Roweis-2003,
+  author    = {Y. Whye Teh and S. Roweis},
+  editor    = NIPS15ed,
+  title     = {Automatic Alignment of Local Representations},
+  booktitle = NIPS15,
+  publisher = {{MIT} Press},
+  year      = {2003},
+}
+
+@article{TehY2006,
+title=          "Hierarchical {D}irichlet Processes",
+author=         "Y. W. Teh and M. I. Jordan and M. J. Beal and D. M. Blei",
+journal=        "Journal of the American Statistical Association",
+volume=         "101",
+number=         "476",
+pages=          "1566--1581",
+year=           "2006"
+}
+
+@article{tenenbaum00separating,
+  author  = {Joshua B. Tenenbaum and William T. Freeman},
+  title   = {Separating Style and Content with Bilinear Models},
+  journal = {Neural Computation},
+  volume  = {12},
+  number  = {6},
+  pages   = {1247--1283},
+  year    = {2000},
+}
+
+@article{Tenenbaum2000-isomap,
+  author  = {Joshua Tenenbaum and Vin {de Silva} and John C. Langford},
+  title   = {A Global Geometric Framework for Nonlinear Dimensionality Reduction},
+  journal = {Science},
+  volume  = {290},
+  number  = {5500},
+  pages   = {2319--2323},
+  month   = dec,
+  year    = {2000},
+}
+
+@article{Terrell+Scott-1992,
+  author  = {G. R. Terrell and D. W. Scott},
+  title   = {Variable Kernel Density Estimation},
+  journal = {Annals of Statistics},
+  volume  = {20},
+  pages   = {1236--1265},
+  year    = {1992},
+}
+
+@article{Tesauro86,
+  author  = {G. Tesauro},
+  title   = {Simple Neural Models of Classical Conditioning},
+  journal = biocyb,
+  volume  = {55},
+  pages   = {187--200},
+  year    = {1986},
+}
+
+@article{Tesauro88a,
+  author  = {G. Tesauro and B. Janssens},
+  title   = {Scaling Relationships in Back-Propagation Learning},
+  journal = cs,
+  volume  = {2},
+  pages   = {39--44},
+  year    = {1988},
+}
+
+@inproceedings{Tesauro88b,
+  author    = {G. Tesauro and T. J. Sejnowski},
+  editor    = nips87ed,
+  title     = {A ``Neural'' Network That Learns to Play Backgammon},
+  booktitle = nips87,
+  pages     = {442--456},
+  publisher = {American Institute of Physics, New York},
+  address   = {Denver, CO},
+  year      = {1988},
+}
+
+% The key keeps its historical "90" suffix; volume 1 of Neural Computation
+% was published in 1989.
+@Article{Tesauro90,
+  author =       "G. Tesauro",
+  title =        "Neurogammon Wins Computer Olympiad",
+  journal =      nc,
+  volume =       "1",
+  pages =        "321--323",
+  year =         "1989",
+}
+
+@article{Tesauro92,
+  author  = {G. Tesauro},
+  title   = {Practical issues in temporal difference learning},
+  journal = {Machine Learning},
+  volume  = {8},
+  pages   = {257--277},
+  year    = {1992},
+}
+
+@article{tesauro:1994:nc,
+  author  = {G. Tesauro},
+  title   = {{TD-Gammon}, a Self-Teaching Backgammon Program, Achieves Master-Level Play},
+  journal = nc,
+  volume  = {6},
+  number  = {2},
+  pages   = {215--219},
+  year    = {1994},
+}
+
+@article{Thakoor87,
+  author  = {A. P. Thakoor and A. Moopenn and J. Lambe and S. K. Khanna},
+  title   = {Electronic Hardware Implementations of Neural Networks},
+  journal = applopt,
+  volume  = {26},
+  pages   = {5085--5092},
+  year    = {1987},
+}
+
+@InProceedings{THastie95,
+  author =       "Trevor Hastie and Patrice Simard and Eduard
+                 S{\"a}ckinger",
+  editor =       NIPS7ed,
+  booktitle =    NIPS7,
+  title =        "Learning Prototype Models for Tangent Distance",
+  publisher =    "MIT Press",
+  pages =        "999--1006",
+  year =         "1995",
+}
+
+@article{THastie98,
+  author  = {T. Hastie and P. Simard},
+  title   = {Metrics and Models for Handwritten Character Recognition},
+  journal = {Statistical Science},
+  volume  = {13},
+  number  = {1},
+  pages   = {54--65},
+  month   = jan,
+  year    = {1998},
+  URL     = {citeseer.ist.psu.edu/hastie97metrics.html},
+}
+
+@book{thrun+pratt-book-1998,
+  editor    = {Sebastian Thrun and Lorien Y. Pratt},
+  title     = {Learning to Learn},
+  publisher = {Kluwer Academic},
+  year      = {1998},
+}
+
+@InProceedings{Thrun1995,
+  author =       "S. Thrun and T. Mitchell",
+  booktitle =    "Proceedings of the 14th International Joint Conference
+                 on Artificial Intelligence (IJCAI)",
+  title =        "Learning One More Thing",
+  publisher =    "Morgan Kaufmann",
+  address =      "San Mateo, CA",
+  month =        aug,
+  year =         "1995",
+}
+
+@TechReport{thrun95,
+  author =       "S. Thrun and J. O'Sullivan",
+  title =        "Clustering learning tasks and the selective cross-task
+                 transfer of knowledge",
+  number =       "CMU-CS-95-209",
+  institution =  "School of Computer Science, Carnegie Mellon
+                 University",
+  year =         "1995",
+}
+
+@techreport{thrun95a,
+  author      = {Sebastian Thrun},
+  title       = {Lifelong Learning: {A} Case Study},
+  institution = {School of Computer Science, Carnegie Mellon University},
+  number      = {CMU-CS-95-208},
+  address     = {Pittsburgh, PA 15213},
+  month       = nov,
+  year        = {1995},
+}
+
+@inproceedings{thrun95b,
+  author       = {Sebastian Thrun and Tom M. Mitchell},
+  title        = {Learning One More Thing},
+  booktitle    = {Proceedings of IJCAI-95},
+  organization = {IJCAI},
+  address      = {Montreal, Canada},
+  year         = {1995},
+}
+
+@inproceedings{Thrun96a,
+  author    = {S. Thrun},
+  editor    = NIPS8ed,
+  title     = {Is Learning the $n$-th Thing Any Easier Than Learning the First?},
+  booktitle = NIPS8,
+  pages     = {640--646},
+  publisher = {MIT Press},
+  address   = {Cambridge, MA},
+  year      = {1996},
+}
+
+@Book{Thrun96b,
+  author =       "S. Thrun",
+  title =        "Explanation-Based Neural Network Learning: A Lifelong
+                 Learning Approach",
+  publisher =    "Kluwer Academic Publishers",
+  address =      "Boston, MA",
+  year =         "1996"
+}
+
+% The key keeps its historical "95" suffix; the article appeared in
+% JRSS-B volume 58, issue 1, in 1996.
+@Article{Tibshirani95,
+  author =       "Robert J. Tibshirani",
+  title =        "Regression shrinkage and selection via the lasso",
+  journal =      "Journal of the Royal Statistical Society B",
+  volume =       "58",
+  number =       "1",
+  pages =        "267--288",
+  year =         "1996",
+}
+
+@article{Ticknor87,
+  author  = {A. J. Ticknor and H. Barrett},
+  title   = {Optical Implementations of {Boltzmann} Machines},
+  journal = opteng,
+  volume  = {26},
+  pages   = {16--21},
+  year    = {1987},
+}
+
+@book{Tikhonov+Arsenin77,
+  author    = {A. N. Tikhonov and V. Y. Arsenin},
+  title     = {Solutions of Ill-posed Problems},
+  publisher = {W. H. Winston},
+  address   = {Washington D.C.},
+  year      = {1977},
+}
+
+@inproceedings{tipping00relevance,
+  author     = {M. E. Tipping},
+  editor     = NIPS12ed,
+  title      = {The Relevance Vector Machine},
+  booktitle  = NIPS12,
+  pages      = {652--658},
+  publisher  = {MIT Press},
+  year       = {2000},
+  OPTaddress = {Cambridge, MA},
+}
+
+@article{tipping99mixtures,
+  author  = {M. E. Tipping and C. M. Bishop},
+  title   = {Mixtures of Probabilistic Principal Component Analysers},
+  journal = {Neural Computation},
+  volume  = {11},
+  number  = {2},
+  pages   = {443--482},
+  year    = {1999},
+  URL     = {citeseer.nj.nec.com/tipping98mixtures.html},
+}
+
+@inproceedings{Tishby89,
+  author    = {N. Tishby and E. Levin and S. A. Solla},
+  title     = {Consistent Inference of Probabilities in Layered Networks: Predictions and Generalization},
+  booktitle = ijcnn,
+  volume    = {2},
+  pages     = {403--410},
+  publisher = {IEEE, New York},
+  address   = {Washington 1989},
+  year      = {1989},
+}
+
+@inproceedings{Titov+Henderson-2007,
+  author    = {Ivan Titov and James Henderson},
+  title     = {Constituent Parsing with Incremental Sigmoid Belief Networks},
+  booktitle = {Proc. 45th Meeting of Association for Computational Linguistics (ACL'07)},
+  pages     = {632--639},
+  address   = {Prague, Czech Republic},
+  year      = {2007},
+  URL       = {http://aclweb.org/anthology-new/P/P07/P07-1080.pdf},
+}
+
+@inproceedings{ToMa00,
+  author    = {Kristina Toutanova and Christopher D. Manning},
+  title     = {Enriching the Knowledge Sources Used in a Maximum Entropy Part-of-Speech Tagger},
+  booktitle = {EMNLP/VLC 2000},
+  pages     = {63--70},
+  year      = {2000},
+}
+
+@inproceedings{Tomita82,
+  author    = {M. Tomita},
+  title     = {Dynamic Construction of Finite-state Automata from Examples Using Hill-Climbing},
+  booktitle = {Proceedings of the Fourth Annual Cognitive Science Conference},
+  pages     = {105--108},
+  address   = {Ann Arbor, MI},
+  year      = {1982},
+}
+
+@book{Tong83,
+  author    = {H. Tong},
+  title     = {Threshold Models in Nonlinear Time Series Analysis},
+  publisher = {Springer-Verlag},
+  address   = {Berlin},
+  year      = {1983},
+}
+
+@inproceedings{TongKoller2000,
+  author    = {S. Tong and D. Koller},
+  title     = {Restricted Bayes Optimal Classifiers},
+  booktitle = {Proceedings of the 17th National Conference on Artificial Intelligence (AAAI)},
+  pages     = {658--664},
+  address   = {Austin, Texas},
+  year      = {2000},
+}
+
+@article{Torgerson52,
+  author  = {W. Torgerson},
+  title   = {Multidimensional scaling, 1: Theory and method},
+  journal = {Psychometrika},
+  volume  = {17},
+  pages   = {401--419},
+  year    = {1952},
+}
+
+@inproceedings{Torralba+Fergus+Weiss-2008,
+ author = {Antonio Torralba and Robert Fergus and Yair Weiss},
+ title = {Small codes and large databases for recognition},
+ booktitle = cvpr08,
+ pages = "1--8",
+ year = 2008,
+}
+
+@InCollection{Torresani+Lee-2007,
+  author =       "Lorenzo Torresani and Kuang-Chih Lee",
+  editor =       NIPS19ed,
+  booktitle =    NIPS19,
+  title =        "Large Margin Component Analysis",
+  publisher =    "MIT Press",
+  address =      "Cambridge, MA",
+  pages =        "1385--1392",
+  year =         "2007"
+}
+
+@inproceedings{Torresen+al-1995,
+  author    = {J. Torresen and S. Mori and H. Nakashima and S. Tomita and O. Landsverk},
+  title     = {Exploiting multiple degrees of {BP} parallelism on the highly parallel computer {AP1000}},
+  booktitle = {Proceedings of the Fourth International Conference on Artificial Neural Networks},
+  pages     = {483--488},
+  address   = {Cambridge, UK},
+  year      = {1995},
+}
+
+@inproceedings{Torresen+al-1995b,
+  author    = {J. Torresen and S. Tomita and O. Landsverk},
+  title     = {The relation of Weight Update Frequency to Convergence of {BP}},
+  booktitle = {World Congress on Neural Networks},
+  address   = {Washington D.C., USA},
+  year      = {1995},
+}
+
+@article{Torresen-1997,
+  author  = {Jim Torresen},
+  title   = {The Convergence of Backpropagation Trained Neural Networks for Various Weight Update Frequencies},
+  journal = {International Journal of Neural Systems},
+  volume  = {8},
+  number  = {3},
+  year    = {1997},
+}
+
+@article{Toulouse86,
+  author  = {G. Toulouse and S. Dehaene and J.-P. Changeux},
+  title   = {Spin Glass Model of Learning by Selection},
+  journal = PNAS,
+  volume  = {83},
+  pages   = {1695--1698},
+  year    = {1986},
+}
+
+@article{Touretzky89,
+  author  = {D. S. Touretzky and D. A. Pomerleau},
+  title   = {What's Hidden in the Hidden Layers?},
+  journal = BYTE,
+  pages   = {227--233},
+  month   = aug,
+  year    = {1989},
+}
+
+@InProceedings{ToutanovaKMS03,
+  author =       "Kristina Toutanova and Dan Klein and Christopher D.
+                 Manning and Yoram Singer",
+  booktitle =    "HLT-NAACL",
+  title =        "Feature-Rich Part-of-Speech Tagging with a Cyclic
+                 Dependency Network",
+  year =         "2003",
+  bibsource =    "DBLP, http://dblp.uni-trier.de",
+  ee =           "http://acl.ldc.upenn.edu/N/N03/N03-1033.pdf",
+}
+
+@InProceedings{Towell-nips92,
+  author =       "G. G. Towell and J. W. Shavlik",
+  editor =       NIPS4ed,
+  booktitle =    NIPS4,
+  title =        "Interpretation of Artificial Neural Networks: Mapping
+                 Knowledge-Based Neural Networks into Rules",
+  publisher =    "Morgan Kaufmann",
+  address =      "San Mateo, CA",
+  year =         "1992",
+}
+
+@InProceedings{towell93,
+  author =       "G. G. Towell and J. W. Shavlik",
+  editor =       NIPS4ed,
+  booktitle =    NIPS4,
+  title =        "Interpretation of Artificial Neural Networks: Mapping
+                 Knowledge-Based Neural Networks into rules",
+  publisher =    "Morgan Kaufmann",
+  address =      "San Mateo, CA",
+  pages =        "977--984",
+  year =         "1992",
+}
+
+@InProceedings{Towell-aaai90,
+  author =       "G. G. Towell and J. W. Shavlik and M. O. Noordewier",
+  booktitle =    "Proceedings of the Eighth National Conference on
+                 Artificial Intelligence (AAAI-90)",
+  title =        "Refinement of Approximate Domain Theories by
+                 Knowledge-Based Neural Networks",
+  pages =        "861--866",
+  year =         "1990",
+  OPTnote =      "",
+}
+
+@techreport{TR:Breiman.arcing,
+  author      = {Leo Breiman},
+  title       = {Bias, variance, and Arcing classifiers},
+  institution = {Statistics Department, University of California at Berkeley},
+  number      = {460},
+  year        = {1996},
+}
+
+@techreport{TR:Breiman:edge,
+  author      = {Leo Breiman},
+  title       = {Arcing the edge},
+  institution = {Statistics Department, University of California at Berkeley},
+  number      = {486},
+  year        = {1997},
+}
+
+@techreport{TR:Breiman:gametheorie,
+  author      = {Leo Breiman},
+  title       = {Prediction games and arcing classifiers},
+  institution = {Statistics Department, University of California at Berkeley},
+  number      = {504},
+  year        = {1997},
+}
+
+@TechReport{TR:Friedman+Hastie+Tibshirani:AdaBoost-theory,
+  author =       "J. Friedman and T. Hastie and R. Tibshirani",
+  title =        "Additive Logistic Regression: a Statistical View of
+                 Boosting",
+  institution =  "Department of Statistics, Stanford University",
+  month =        aug,
+  year =         "1998",
+}
+
+@TechReport{TR:Tibshirani:bias+var,
+  author =       "R. Tibshirani",
+  title =        "Bias, Variance and Prediction Error for Classification
+                 Rules",
+  institution =  "Department of Statistics, University of Toronto",
+  year =         "1996",
+}
+
+@article{Traven91,
+  author  = {H. G. C. Traven},
+  title   = {A neural network approach to statistical pattern classification by semiparametric estimation of probability density functions},
+  journal = ieeetrnn,
+  volume  = {2},
+  number  = {3},
+  pages   = {366--377},
+  year    = {1991},
+}
+
+@InCollection{TreHolAhm93,
+  author =       "V. Tresp and J. Hollatz and S. Ahmad",
+  editor =       NIPS5ed,
+  booktitle =    NIPS5,
+  title =        "Network structuring and training using rule-based
+                 knowledge",
+  publisher =    "Morgan Kaufmann",
+  address =      "San Mateo, CA",
+  year =         "1993",
+}
+
+@inproceedings{Tresp-nips93,
+  author    = {V. Tresp and J. Hollatz and S. Ahmad},
+  editor    = NIPS5ed,
+  title     = {Network Structuring and Training Using Rule-based Knowledge},
+  booktitle = NIPS5,
+  publisher = {Morgan Kaufmann},
+  address   = {San Mateo, CA},
+  year      = {1993},
+}
+
+@article{tresp2001,
+  author  = {V. Tresp},
+  title   = {Scaling Kernel-Based Systems to Large Data Sets},
+  journal = {Data Mining and Knowledge Discovery},
+  volume  = {5},
+  number  = {3},
+  pages   = {197--211},
+  year    = {2001},
+}
+
+@InCollection{Tresp94,
+  author =       "V. Tresp and S. Ahmad and R. Neuneier",
+  editor =       NIPS6ed,
+  booktitle =    NIPS6,
+  title =        "Training neural networks with deficient data",
+  publisher =    "Morgan Kaufmann",
+  address =      "San Mateo, CA",
+  pages =        "128--135",
+  year =         "1994",
+}
+
+@article{TRNN:Tsoi94,
+  author  = {A. C. Tsoi and A. Back},
+  title   = {Locally Recurrent Globally Feedforward Networks, {A} Critical Review of Architectures},
+  journal = {IEEE Transactions on Neural Networks},
+  volume  = {5},
+  number  = {2},
+  pages   = {229--239},
+  year    = {1994},
+}
+
+@inproceedings{Tseng-1998,
+  author    = {Yuen-Hsien Tseng},
+  title     = {Multilingual Keyword Extraction for Term Suggestion},
+  booktitle = {SIGIR '98: Proceedings of the 21st Annual International ACM SIGIR Conference on Research and Development in Information Retrieval, August 24-28 1998, Melbourne, Australia},
+  pages     = {377--378},
+  publisher = {ACM},
+  year      = {1998},
+}
+
+@Article{TsochantaridisI2005,
+  author =       "Ioannis Tsochantaridis and Thorsten Joachims and
+                 Thomas Hofmann and Yasemin Altun",
+  title =        "Large Margin Methods for Structured and Interdependent
+                 Output Variables",
+  journal =      jmlr,
+  volume =       "6",
+  publisher =    "MIT Press",
+  address =      "Cambridge, MA, USA",
+  pages =        "1453--1484",
+  year =         "2005",
+  ISSN =         "1533-7928",
+}
+
+@article{Tsodyks88,
+  author  = {M. V. Tsodyks and M. V. Feigel'man},
+  title   = {The Enhanced Storage Capacity in Neural Networks with Low Activity Level},
+  journal = eul,
+  volume  = {6},
+  pages   = {101--105},
+  year    = {1988},
+}
+
+@InProceedings{Tsoi+Pearson91,
+  author =       "A. C. Tsoi and R. A. Pearson",
+  editor =       NIPS3ed,
+  booktitle =    NIPS3,
+  title =        "Comparison of three classification techniques: {CART},
+                 {C4}.5, and multi-layer perceptron",
+  publisher =    "Morgan Kaufmann",
+  address =      "Denver, CO",
+  year =         "1991",
+}
+
+@book{TSP93,
+  editor    = {A. Weigend and N. Gershenfeld},
+  title     = {Time Series Prediction: Forecasting the future and understanding the past},
+  publisher = {Addison-Wesley},
+  year      = {1993},
+}
+
+@inproceedings{Tsuda99,
+  author    = {K. Tsuda},
+  title     = {Optimal Hyperplane Classifier based on Entropy Number Bound},
+  booktitle = {ICANN'99},
+  pages     = {419--424},
+  year      = {1999},
+}
+
+@phdthesis{Turian07thesis,
+  author = {Joseph Turian},
+  title  = {Constituent Parsing by Classification},
+  school = {New York University},
+  year   = {2007},
+}
+
+@article{tzanetakis+cook:2002,
+  author  = {George Tzanetakis and Perry Cook},
+  title   = {Musical Genre Classification of Audio Signals},
+  journal = {IEEE Transactions on Speech and Audio Processing},
+  volume  = {10},
+  number  = {5},
+  pages   = {293--302},
+  month   = jul,
+  year    = {2002},
+}
+
+@Article{Uberbacher91,
+  author =       "E. C. Uberbacher and R. J. Mural",
+  title =        "Locating protein-coding regions in human {DNA}
+                 sequences by a multiple sensor-neural network
+                 approach",
+  journal =      PNAS,
+  volume =       "88",
+  pages =        "11261--11265",
+  year =         "1991",
+}
+
+@article{Uhrig91,
+  author  = {R. E. Uhrig},
+  title   = {Potential Applications of Neural Networks to the Operation of a Nuclear Power Plant},
+  journal = {Nuclear Safety},
+  volume  = {32},
+  number  = {1},
+  year    = {1991},
+}
+
+@article{Uhrig94,
+  author  = {R. E. Uhrig},
+  title   = {Artificial Neural Networks in Nuclear Power Plants},
+  journal = {Nuclear News},
+  volume  = {37},
+  number  = {9},
+  pages   = {38},
+  year    = {1994},
+}
+
+@article{Utgoff-2002,
+  author  = {Paul E. Utgoff and David J. Stracuzzi},
+  title   = {Many-Layered Learning},
+  journal = {Neural Computation},
+  volume  = {14},
+  pages   = {2497--2539},
+  year    = {2002},
+}
+
+@article{Valiant84,
+  author  = {L. G. Valiant},
+  title   = {A Theory of the Learnable},
+  journal = {Communications of the ACM},
+  volume  = {27},
+  number  = {11},
+  pages   = {1134--1142},
+  year    = {1984},
+}
+
+@InProceedings{VandenBout88,
+  author =       "D. E. {Van den Bout} and T. K. Miller",
+  booktitle =    icnn,
+  title =        "A Travelling Salesman Objective Function That Works",
+  volume =       "2",
+  publisher =    "IEEE, New York",
+  address =      "San Diego 1988",
+  pages =        "299--303",
+  year =         "1988",
+}
+
+@Article{VandenBout89,
+  author =       "D. E. {Van den Bout} and T. K. Miller",
+  title =        "Improving the Performance of the Hopfield-Tank Neural
+                 Network Through Normalization and Annealing",
+  journal =      biocyb,
+  volume =       "62",
+  pages =        "129--139",
+  year =         "1989",
+}
+
+@Article{VanDerMaaten08,
+  author =       "Laurens {van der Maaten} and Geoffrey E. Hinton",
+  title =        {Visualizing Data using t-SNE},
+  journal =      jmlr,
+  year =         "2008",
+  keywords =     {dimension-reduction, locality, nearest-neighbors, spectral, visualization},
+  month =        nov,
+  pages =        {2579--2605},
+  url =          {http://www.jmlr.org/papers/volume9/vandermaaten08a/vandermaaten08a.pdf},
+  volume =       {9},
+}
+
+@book{VanDerVaart+Wellner-1996,
+  author    = {A. W. {van der Vaart} and J. Wellner},
+  title     = {Weak Convergence and Empirical Processes with applications to Statistics},
+  publisher = {Springer},
+  address   = {New York},
+  year      = {1996},
+}
+
+@article{vanHemmen79,
+  author  = {J. L. van Hemmen and R. G. Palmer},
+  title   = {The Replica Method and a Solvable Spin Glass Model},
+  journal = jpa,
+  volume  = {12},
+  pages   = {563--580},
+  year    = {1979},
+}
+
+@article{vanHemmen86,
+  author  = {J. L. van Hemmen and R. K{\"u}hn},
+  title   = {Nonlinear Neural Networks},
+  journal = prl,
+  volume  = {57},
+  pages   = {913--916},
+  year    = {1986},
+}
+
+@article{vanHemmen90,
+  author  = {J. L. van Hemmen and L. B. Ioffe and R. K{\"u}hn and M. Vaas},
+  title   = {Increasing the Efficiency of a Neural Network through Unlearning},
+  journal = physicaA,
+  volume  = {163},
+  pages   = {386--392},
+  year    = {1990},
+}
+
+% HUGO: Haven't found what A. stands for...
+@article{VapnikV63,
+  author  = {Vladimir Vapnik and A. Lerner},
+  title   = {Pattern Recognition using Generalized Portrait Method},
+  journal = {Automation and Remote Control},
+  volume  = {24},
+  year    = {1963},
+}
+
+@article{Vapnik71,
+  author  = {V. N. Vapnik and A. Y. Chervonenkis},
+  title   = {On the Uniform Convergence of Relative Frequencies of Events to Their Probabilities},
+  journal = tprobapp,
+  volume  = {16},
+  pages   = {264--280},
+  year    = {1971},
+}
+
+@Book{Vapnik82,
+  author =       "V. N. Vapnik",
+  title =        "Estimation of Dependences Based on Empirical Data",
+  publisher =    "Springer-Verlag",
+  address =      "Berlin",
+  year =         "1982",
+}
+
+@Article{Vapnik93,
+  author =       "V. Vapnik and L. Bottou",
+  title =        "Local algorithms for pattern recognition and
+                 dependencies estimation",
+  journal =      nc,
+  volume =       "5",
+  number =       "6",
+  pages =        "893--909",
+  year =         "1993",
+}
+
+@Book{Vapnik95,
+  author =       "V. N. Vapnik",
+  title =        "The Nature of Statistical Learning Theory",
+  publisher =    "Springer",
+  address =      "New York",
+  year =         "1995",
+}
+
+% NOTE(review): the series text used to be embedded in the publisher field.
+% It is parked in an ignored OPTseries field because it looks like the name
+% of a Springer series rather than a Wiley one -- confirm before enabling it.
+@Book{Vapnik98,
+  author =       "Vladimir Vapnik",
+  title =        "Statistical Learning Theory",
+  publisher =    "Wiley",
+  address =      "New York",
+  year =         "1998",
+  OPTseries =    "Lecture Notes in Economics and Mathematical Systems,
+                 volume 454",
+}
+
+@InCollection{variational99,
+  author =       "M. I. Jordan and Z. Ghahramani and T. Jaakkola and L.
+                 Saul",
+  editor =       "M. I. Jordan",
+  booktitle =    "Learning in Graphical Models",
+  title =        "An introduction to variational methods in graphical
+                 models",
+  publisher =    "MIT Press",
+  address =      "Cambridge, MA",
+  year =         "1999",
+}
+
+@InProceedings{Venka+PC-2004,
+  author =       "Shailaja Venkatsubramanyan and Jose Perez-Carballo",
+  booktitle =    "Second ACL Workshop on Multiword Expressions",
+  title =        "Multiword Expression Filtering for Building Knowledge
+                 Maps",
+  pages =        "40--47",
+  year =         "2004",
+}
+
+@InProceedings{Verbeek-2004,
+  author =       "Jakob J. Verbeek and Sam T. Roweis and Nikos Vlassis",
+  editor =       NIPS16ed,
+  booktitle =    NIPS16,
+  title =        "Non-linear {CCA} and {PCA} by Alignment of Local
+                 Models",
+  publisher =    "MIT Press",
+  address =      "Cambridge, MA",
+  year =         "2004",
+  keywords =     "dimensionality reduction, spectral methods, mixture
+                 density, CCA, PCA",
+}
+
+@InProceedings{Veronis1990,
+  author =       "Jean Veronis and Nancy Ide",
+  booktitle =    "COLING'90",
+  title =        "Word Sense Disambiguation with Very Large Neural
+                 Networks Extracted from Machine Readable Dictionaries",
+  year =         "1990",
+}
+
+@Misc{Veronis98,
+  author =       "Jean Veronis",
+  title =        "A study of polysemy judgements and inter-annotator
+                 agreement",
+  year =         "1998",
+  URL =          "citeseer.nj.nec.com/veronis98study.html",
+  text =         "Veronis, J., 1998. A study of polysemy judgements and
+                 inter-annotator agreement. In Programme and advanced
+                 papers of the Senseval workshop. Herstmonceux Castle,
+                 England.",
+}
+
+@InProceedings{Vilalta+al-1997,
+  author =       "Ricardo Vilalta and Gunnar Blix and Larry Rendell",
+  booktitle =    ECML97,
+  title =        "Global Data Analysis and the Fragmentation Problem in
+                 Decision Tree Induction",
+  publisher =    "Springer-Verlag",
+  pages =        "312--327",
+  year =         "1997",
+}
+
+@InProceedings{Vincent-Bengio-2003-short,
+  author =       "Pascal Vincent and Yoshua Bengio",
+  booktitle =    NIPS15,
+  title =        "Manifold Parzen Windows",
+  publisher =    "MIT Press",
+  year =         "2003",
+}
+
+@TechReport{Vincent-TR1316-small,
+  author =       "P. Vincent and H. Larochelle and Y. Bengio and P.-A.
+                 Manzagol",
+  title =        "Extracting and Composing Robust Features with
+                 Denoising Autoencoders",
+  number =       "1316",
+  institution =  "Universit\'e de Montr\'eal, dept. IRO",
+  year =         "2008",
+}
+
+% The key says 2001 but the article appeared in 2002; the key is kept so
+% existing citations continue to resolve.
+@Article{Vincent2001,
+  author =       "P. Vincent and Y. Bengio",
+  title =        "Kernel Matching Pursuit",
+  journal =      "Machine Learning",
+  volume =       "48",
+  number =       "1--3",
+  pages =        "165--187",
+  year =         "2002",
+}
+
+@InProceedings{Vincent2002,
+  author =       "P. Vincent and Y. Bengio",
+  editor =       NIPS14ed,
+  booktitle =    NIPS14,
+  title =        "{K}-Local Hyperplane and Convex Distance Nearest
+                 Neighbor Algorithms",
+  publisher =    "{MIT} Press",
+  address =      "Cambridge, MA",
+  pages =        "985--992",
+  year =         "2002",
+}
+
+@InProceedings{VincentPLarochelleH2008-small,
+  author =       "Pascal Vincent and Hugo Larochelle and Yoshua Bengio
+                 and Pierre-Antoine Manzagol",
+  booktitle =    "ICML 2008",
+  title =        "Extracting and Composing Robust Features with
+                 Denoising Autoencoders",
+  year =         "2008",
+}
+
+@InProceedings{VincentPLarochelleH2008-short,
+  author =       "Pascal Vincent and Hugo Larochelle and Yoshua Bengio
+                 and Pierre-Antoine Manzagol",
+  booktitle =    "Int. Conf. Mach. Learn.",
+  title =        "Extracting and Composing Robust Features with
+                 Denoising Autoencoders",
+  year =         "2008",
+  pages =        "1096--1103"
+}
+
+
+@InProceedings{vincent:icml08,
+   author =     "Pascal Vincent and Hugo Larochelle and Yoshua Bengio and Pierre-Antoine Manzagol",
+   title =      "Extracting and composing robust features with denoising autoencoders",
+   booktitle =  "Proceedings of the 25th Annual International Conference on Machine Learning (ICML 2008)",
+   location =   "Helsinki, Finland",
+   editor =     "Andrew McCallum and Sam Roweis",
+   publisher =  "Omnipress",
+   year =       "2008",
+   pages =      "1096--1103",
+}
+   %url =        "http://icml2008.cs.helsinki.fi/papers/592.pdf",
+
+@InProceedings{VincentPLarochelleH2008-very-small,
+  author =       "P. Vincent and H. Larochelle and Y. Bengio and P.-A.
+                 Manzagol",
+  booktitle =    "ICML 2008",
+  title =        "Extracting and Composing Robust Features with
+                 Denoising Autoencoders",
+  year =         "2008",
+}
+
+@Article{Viterbi67,
+  author =       "A. Viterbi",
+  title =        "Error bounds for convolutional codes and an
+                 asymptotically optimum decoding algorithm",
+  journal =      ieeeit,
+  volume =       "13",
+  number =       "2",
+  pages =        "260--269",
+  month =        apr,
+  year =         "1967",
+}
+
+@InProceedings{Vlachos-2002,
+  author =       "Michail Vlachos and Carlotta Domeniconi and Dimitrios
+                 Gunopulos and George Kollios and Nick Koudas",
+  booktitle =    "Proc. of 8th SIGKDD",
+  title =        "Non-Linear Dimensionality Reduction Techniques for
+                 Classification and Visualization",
+  address =      "Edmonton, Canada",
+  year =         "2002",
+  URL =          "citeseer.ist.psu.edu/573153.html",
+}
+
+% Same article as the Vogl88 entry; kept under this key so that documents
+% citing vogl-88 still resolve. Fields are aligned with Vogl88.
+@Article{vogl-88,
+  author =       "T. P. Vogl and J. K. Mangis and A. K. Rigler and W. T.
+                 Zink and D. L. Alkon",
+  title =        "Accelerating the Convergence of the Back-Propagation
+                 Method",
+  journal =      biocyb,
+  volume =       "59",
+  pages =        "257--263",
+  year =         "1988",
+}
+
+@Article{Vogl88,
+  author =       "T. P. Vogl and J. K. Mangis and A. K. Rigler and W. T.
+                 Zink and D. L. Alkon",
+  title =        "Accelerating the Convergence of the Back-Propagation
+                 Method",
+  journal =      biocyb,
+  volume =       "59",
+  pages =        "257--263",
+  year =         "1988",
+}
+
+@Book{Volterra,
+  author =       "V. Volterra",
+  title =        "Theory of Functionals and of Integrals and
+                 Integro-Differential Equations",
+  publisher =    "Dover",
+  address =      "New York",
+  year =         "1959",
+}
+
+@Article{vonderMalsburg73,
+  author =       "Ch. von der Malsburg",
+  title =        "Self-Organization of Orientation Sensitive Cells in
+                 the Striate Cortex",
+  journal =      kyb,
+  volume =       "14",
+  pages =        "85--100",
+  year =         "1973",
+}
+
+@Article{vonderMalsburg82,
+  author =       "Ch. von der Malsburg and J. D. Cowan",
+  title =        "Outline of a Theory for the Ontogenesis of
+                 Iso-Orientation Domains in Visual Cortex",
+  journal =      biocyb,
+  volume =       "45",
+  pages =        "49--56",
+  year =         "1982",
+}
+
+% address holds the conference city; the year is given only in the year
+% field so styles do not print it twice.
+@InProceedings{vonLehman88,
+  author =       "A. von Lehman and E. G. Paek and P. F. Liao and A.
+                 Marrakchi and J. S. Patel",
+  booktitle =    icnn,
+  title =        "Factors Influencing Learning by Back-Propagation",
+  volume =       "1",
+  publisher =    "IEEE, New York",
+  address =      "San Diego",
+  pages =        "335--341",
+  year =         "1988",
+}
+
+% NOTE: the key misspells "Luxburg"; it is left unchanged so that existing
+% citations keep working.
+@Article{vonLuxurg07,
+  author =       "U. von Luxburg",
+  title =        "A Tutorial on Spectral Clustering",
+  journal =      "Statistics and Computing",
+  volume =       "17",
+  number =       "4",
+  pages =        "395--416",
+  year =         "2007",
+}
+
+@InCollection{vonNeumann56,
+  author =       "J. von Neumann",
+  editor =       "C. E. Shannon and J. McCarthy",
+  booktitle =    "Automata Studies",
+  title =        "Probabilistic Logics and the Synthesis of Reliable
+                 Organisms from Unreliable Components",
+  publisher =    "Princeton University Press",
+  address =      "Princeton",
+  pages =        "43--98",
+  year =         "1956",
+}
+
+@Article{Wagner87,
+  author =       "K. Wagner and D. Psaltis",
+  title =        "Multilayer Optical Learning Networks",
+  journal =      applopt,
+  volume =       "26",
+  pages =        "5061--5076",
+  year =         "1987",
+}
+
+@InCollection{Wahba82,
+  author =       "G. Wahba",
+  editor =       "S. S. Gupta and J. O. Berger",
+  booktitle =    "Statistical Decision Theory and Related Topics III",
+  title =        "Constrained regularization for ill-posed linear
+                 operator equations, with applications in meteorology
+                 and medicine",
+  publisher =    "Academic Press",
+  year =         "1982",
+}
+
+% This is a monograph published in a numbered book series, so it is typed
+% as a Book with series/volume rather than as a paper in proceedings.
+@Book{Wahba90,
+  author =       "G. Wahba",
+  title =        "Spline models for observational data",
+  series =       "CBMS-NSF Regional Conference Series in Applied
+                 Mathematics",
+  volume =       "59",
+  publisher =    "Society for Industrial and Applied Mathematics
+                 (SIAM)",
+  address =      "Philadelphia, PA",
+  year =         "1990",
+}
+
+@Article{Waibel89a,
+  author =       "A. Waibel",
+  title =        "Modular Construction of Time-Delay Neural Networks for
+                 Speech Recognition",
+  journal =      nc,
+  volume =       "1",
+  pages =        "39--46",
+  year =         "1989",
+}
+
+@Article{Waibel89b,
+  author =       "A. Waibel and T. Hanazawa and G. E. Hinton and K.
+                 Shikano and K. Lang",
+  title =        "Phoneme Recognition Using Time-Delay Neural Networks",
+  journal =      ieeetassp,
+  volume =       "37",
+  pages =        "328--339",
+  year =         "1989",
+}
+
+@Article{Waibel89c,
+  author =       "A. Waibel and H. Sawai and K. Shikano",
+  title =        "Modularity and Scaling in Large Phonemic Neural
+                 Networks",
+  journal =      ieeetassp,
+  volume =       "37",
+  pages =        "1888--1898",
+  year =         "1989",
+}
+
+@Article{Wallace+Boulton-1968,
+  author =       "C. S. Wallace and D. M. Boulton",
+  title =        "An information measure for classification",
+  journal =      "Computer Journal",
+  volume =       "11",
+  number =       "2",
+  pages =        "185--194",
+  year =         "1968",
+}
+
+% Author is written "E. A. Wan" so that BibTeX parses "Wan" as the surname
+% (the former "Wan E. A." made "A." the surname).
+@InCollection{Wan93,
+  author =       "E. A. Wan",
+  editor =       "A. S. Weigend and N. A. Gershenfeld",
+  booktitle =    "Time Series Prediction: Forecasting the Future and
+                 Understanding the Past",
+  title =        "Time series prediction by using a connectionist
+                 network with internal delay lines",
+  publisher =    "Addison-Wesley",
+  pages =        "195--217",
+  year =         "1993",
+}
+
+% Same chapter as the Wan93 entry; the book title is given in full and the
+% page range is aligned with Wan93.
+% NOTE(review): the earlier value here was 175--193 -- confirm against the book.
+@InCollection{Wan93a,
+  author =       "E. A. Wan",
+  editor =       "A. Weigend and N. Gershenfeld",
+  booktitle =    "Time Series Prediction: Forecasting the Future and
+                 Understanding the Past",
+  title =        "Time Series Prediction by Using a Connectionist
+                 Network with Internal Delay Lines",
+  publisher =    "Addison-Wesley",
+  address =      "Redwood City, CA",
+  pages =        "195--217",
+  year =         "1993",
+}
+
+@InProceedings{Wang-ijcnn91,
+  author =       "S. D. Wang and C. H. Hsu",
+  booktitle =    ijcnn,
+  title =        "Terminal Attractor Learning Algorithms for
+                 Backpropagation Neural Networks",
+  publisher =    "IEEE Press",
+  address =      "Singapore",
+  pages =        "183--189",
+  month =        nov,
+  year =         "1991",
+}
+
+@INPROCEEDINGS{WangC1994,
+    author = {Changfeng Wang and Santosh S. Venkatesh and J. Stephen Judd},
+    title = {Optimal stopping and effective machine complexity in learning},
+    editor = NIPS6ed,
+    booktitle = NIPS6,
+    year = {1994},
+    pages = {303--310},
+    publisher = {Morgan Kaufmann}
+}
+
+% The CV-style remark is kept in an ignored OPTnote field so that it is not
+% printed in bibliographies.
+@inproceedings{wangetal08,
+author = "Wang, Q. and Lin, D. and Schuurmans, D.",
+title = "Semi-supervised convex training for dependency parsing",
+booktitle = "Proceedings of the Forty-sixth Annual Conference of the
+Association for Computational Linguistics: Human Language Technologies (ACL)",
+year = 2008,
+OPTnote = "Acceptance rate 25\%; Wang a trainee"
+}
+
+% The CV-style remark is kept in an ignored OPTnote field so that it is not
+% printed in bibliographies.
+@inproceedings{wangetal07,
+author = "Wang, T. and Lizotte, D. and Bowling, M. and Schuurmans, D.",
+title = "Stable dual dynamic programming",
+editor =       NIPS20ed,
+booktitle =    NIPS20,
+year = 2007,
+OPTnote = "Acceptance rate 22\%; Wang and Lizotte trainees"
+}
+
+
+@Misc{Wang02,
+  author =       "L. Wang and K. Luk Chan",
+  howpublished =    "6th kernel machines workshop, in conjunction with Neural Information Processing Systems (NIPS)",
+  title =        "Learning Kernel Parameters by using Class Separability
+                 Measure",
+  year =         "2002",
+  url =          "http://users.rsise.anu.edu.au/~wanglei/#Publication",
+}
+
+@Article{Wang89,
+  author =       "H. Wang and J. Wu and P. Tang",
+  title =        "Superfamily expands",
+  journal =      "Nature",
+  volume =       "337",
+  pages =        "514",
+  year =         "1989",
+}
+
+@InProceedings{WangHarper2002,
+  author =       "Wen Wang and Mary P. Harper",
+  booktitle =    "EMNLP '02: Proceedings of the ACL-02 conference on
+                 Empirical methods in natural language processing",
+  title =        "The Super{ARV} language model: investigating the
+                 effectiveness of tightly integrating multiple knowledge
+                 sources",
+  publisher =    "Association for Computational Linguistics",
+  address =      "Morristown, NJ, USA",
+  pages =        "238--247",
+  year =         "2002",
+}
+
+@Article{Warmuth95,
+  author =       "Sally Floyd and Manfred Warmuth",
+  title =        "Sample Compression, Learnability, and the
+                 Vapnik-Chervonenkis Dimension",
+  journal =      "Machine Learning",
+  volume =       "21",
+  number =       "3",
+  pages =        "269--304",
+  year =         "1995",
+}
+
+@Book{Wasserman-2004,
+  author =       "Larry Wasserman",
+  title =        "All of Statistics: {A} Concise Course in Statistical Inference",
+  publisher =    "Springer",
+  year =         "2004",
+}
+
+@PhdThesis{Watkins-PhD,
+  author =       "C. J. C. H. Watkins",
+  title =        "Learning from Delayed Rewards",
+  school =       "Cambridge University",
+  address =      "Cambridge, England",
+  year =         "1989",
+}
+
+% address holds the conference city; the year is given only in the year
+% field so styles do not print it twice.
+@InProceedings{Watrous87,
+  author =       "R. L. Watrous",
+  editor =       "M. Caudill and C. Butler",
+  booktitle =    icnn,
+  title =        "Learning Algorithms for Connectionist Networks:
+                 Applied Gradient Methods of Nonlinear Optimization",
+  volume =       "2",
+  publisher =    "IEEE, New York",
+  address =      "San Diego",
+  pages =        "619--627",
+  year =         "1987",
+}
+
+@TechReport{Watrous89,
+  author =       "R. L. Watrous",
+  title =        "Context-modulated discrimination of similar vowels
+                 using second-order connectionist networks",
+  number =       "{CRG-TR}-89-5",
+  institution =  "University of Toronto",
+  year =         "1989",
+}
+
+@Article{Watrous-nc92,
+  author =       "R. L. Watrous and G. M. Kuhn",
+  title =        "Induction of Finite-State Languages Using Second-Order
+                 Recurrent Networks",
+  journal =      nc,
+  volume =       "4",
+  number =       "3",
+  pages =        "406--414",
+  year =         "1992",
+}
+
+@Article{Watson64,
+  author =       "G. S. Watson",
+  title =        "Smooth regression analysis",
+  journal =      "Sankhya - The Indian Journal of Statistics",
+  volume =       "26",
+  pages =        "359--372",
+  year =         "1964",
+}
+
+@inproceedings{Weber-2000,
+ author = {Markus Weber and Max Welling and Pietro Perona},
+ title = {Unsupervised Learning of Models for Recognition},
+ booktitle = {Proc. 6th Europ. Conf. Comp. Vis., ECCV2000},
+ address = {Dublin},
+ year = 2000,
+ pages     = {18--32},
+ url       = {http://link.springer.de/link/service/series/0558/bibs/1842/18420018.htm},
+}
+
+@Book{Webster88,
+  editor =       "Webster",
+  title =        "Webster's Ninth New Collegiate Dictionary",
+  publisher =    "Merriam-Webster",
+  address =      "Springfield",
+  year =         "1988",
+}
+
+@Book{Wegener87,
+  author =       "Ingo Wegener",
+  title =        "The Complexity of Boolean Functions",
+  publisher =    "John Wiley \& Sons",
+  year =         "1987",
+}
+
+@InCollection{Weigend93,
+  author =       "N. A. Gershenfeld and A. S. Weigend",
+  editor =       "A. Weigend and N. Gershenfeld",
+  booktitle =    "Predicting the future and understanding the past",
+  title =        "The Future of Time Series: Learning and
+                 Understanding",
+  publisher =    "Addison-Wesley",
+  address =      "Redwood City, CA",
+  pages =        "1--70",
+  year =         "1993",
+}
+
+@Article{Weigend95,
+  author =       "A. S. Weigend and A. N. Srivastava",
+  title =        "Predicting Conditional Probability Distributions: {A}
+                 Connectionist Approach",
+  journal =      "International Journal of Neural Systems",
+  volume =       "6",
+  year =         "1995",
+}
+
+@InProceedings{Weinberger+Saul-06,
+  author =       "K. Q. Weinberger and L. K. Saul",
+  booktitle =    "Proceedings of the National Conference on Artificial
+                 Intelligence (AAAI)",
+  title =        "An Introduction to Nonlinear Dimensionality Reduction
+                 by Maximum Variance Unfolding",
+  address =      "Boston, MA",
+  year =         "2006",
+}
+
+@InProceedings{weinberger-learningkernel-04,
+  author =       "Kilian Q. Weinberger and Fei Sha and Lawrence K. Saul",
+  booktitle =    ICML04,
+  editor =       ICML04ed,
+  publisher =    ICML04publ,
+  title =        "Learning a kernel matrix for nonlinear dimensionality
+                 reduction",
+  address =      "Banff, Canada",
+  pages =        "839--846",
+  year =         "2004",
+}
+
+@InProceedings{Weinberger04a,
+  author =       "K. Q. Weinberger and L. K. Saul",
+  booktitle =    cvpr04,
+  title =        "Unsupervised Learning of Image Manifolds by
+                 Semidefinite Programming",
+  volume =       "2",
+  address =      "Washington D.C.",
+  pages =        "988--995",
+  year =         "2004",
+}
+
+% NOTE(review): the earlier year (1983) and pages (656--664) contradicted
+% the key; volume/number/pages below should be confirmed against the journal.
+@Article{weinberger95,
+  author =       "M. J. Weinberger and J. Rissanen and M. Feder",
+  title =        "A universal finite memory source",
+  journal =      "IEEE Transactions on Information Theory",
+  volume =       "41",
+  number =       "3",
+  pages =        "643--652",
+  year =         "1995",
+}
+
+@InCollection{WeinbergerK2006,
+  author =       "Kilian Q. Weinberger and John Blitzer and Lawrence K. Saul",
+  editor =       NIPS18ed,
+  booktitle =    NIPS18,
+  title =        "Distance Metric Learning for Large Margin Nearest
+                 Neighbor Classification",
+  publisher =    "MIT Press",
+  address =      "Cambridge, MA",
+  pages =        "1473--1480",
+  year =         "2006",
+}
+
+@InProceedings{WeinbergerK2007,
+  author = {Kilian Q. Weinberger and Gerald Tesauro},
+  title = {Metric Learning for Kernel Regression},
+  booktitle = {Proc. of the 11th International Conference on Artificial Intelligence and Statistics},
+  year = {2007},
+}
+  %url = {http://www.stat.umn.edu/~aistat/proceedings/data/papers/077.pdf}
+
+@Article{Weingartner,
+  author =       "H. M. Weingartner and D. N. Ness",
+  title =        "Methods for the Solution of the Multi-Dimensional 0/1
+                 Knapsack Problem",
+  journal =      "Operations Research",
+  volume =       "15",
+  pages =        "83--103",
+  year =         "1967",
+}
+
+@Article{Weisbuch85,
+  author =       "G. Weisbuch and F. Fogelman-Souli\'e",
+  title =        "Scaling Laws for the Attractors of Hopfield Networks",
+  journal =      jppl,
+  volume =       "46",
+  pages =        "623--630",
+  year =         "1985",
+}
+
+@InProceedings{Weiss-99,
+  author =       "Yair Weiss",
+  booktitle =    ICCV99,
+  title =        "Segmentation using eigenvectors: a unifying view",
+  pages =        "975--982",
+  year =         "1999",
+}
+
+@Article{Weiss2000,
+  author =       "Yair Weiss",
+  title =        "Correctness of local probability propagation in
+                 graphical models with loops",
+  journal =      "Neural Computation",
+  volume =       "12",
+  pages =        "1--41",
+  year =         "2000",
+}
+
+@Book{Weiss90,
+  author =       "S. M. Weiss and C. A. Kulikowski",
+  title =        "Computer Systems That Learn",
+  publisher =    "Morgan Kaufmann",
+  address =      "San Mateo, CA",
+  year =         "1990",
+}
+
+% Same paper as the WellingNIPS17 entry; fields are aligned with it (page
+% range added, no separate volume field).
+% NOTE(review): assumes the NIPS17 macro already names the volume -- confirm.
+@InProceedings{Welling05,
+  author =       "Max Welling and Michal Rosen-Zvi and Geoffrey E. Hinton",
+  editor =       NIPS17ed,
+  booktitle =    NIPS17,
+  title =        "Exponential Family Harmoniums with an Application to
+                 Information Retrieval",
+  publisher =    "MIT Press",
+  address =      "Cambridge, MA",
+  pages =        "1481--1488",
+  year =         "2005",
+}
+
+@InProceedings{Welling05-small,
+  author =       "M. Welling and M. Rosen-Zvi and G. E. Hinton",
+  booktitle =    "NIPS 17",
+  title =        "Exponential Family Harmoniums with an Application to
+                 Information Retrieval",
+  publisher =    "MIT Press",
+  address =      "Cambridge, MA",
+  year =         "2005",
+}
+
+@InProceedings{Welling2003,
+  author =       "Max Welling and Richard Zemel and Geoffrey E. Hinton",
+  editor =       NIPS15ed,
+  booktitle =    NIPS15,
+  title =        "Self-Supervised Boosting",
+  publisher =    "{MIT} Press",
+  pages =        "665--672",
+  year =         "2003",
+}
+
+@InProceedings{WellingM2002,
+  author =       "Max Welling and Geoffrey E. Hinton",
+  booktitle =    "ICANN '02: Proceedings of the International Conference
+                 on Artificial Neural Networks",
+  title =        "A New Learning Algorithm for Mean Field {Boltzmann}
+                 Machines",
+  publisher =    "Springer-Verlag",
+  address =      "London, UK",
+  pages =        "351--357",
+  year =         "2002",
+  ISBN =         "3-540-44074-7",
+}
+
+@InProceedings{WellingNIPS17,
+  author =       "Max Welling and Michal Rosen-Zvi and Geoffrey E. Hinton",
+  editor =       NIPS17ed,
+  booktitle =    NIPS17,
+  title =        "Exponential Family Harmoniums with an Application to
+                 Information Retrieval",
+  publisher =    "{MIT} Press",
+  address =      {Cambridge, MA},
+  pages =        "1481--1488",
+  year =         "2005",
+}
+
+@InProceedings{WellingNIPS17-small,
+  author =       "M. Welling and M. Rosen-Zvi and G. E. Hinton",
+  booktitle =    "NIPS 17",
+  title =        "Exponential Family Harmoniums with an Application to
+                 Information Retrieval",
+  publisher =    "{MIT} Press",
+  year =         "2005",
+}
+
+
+@InProceedings{WellingUAI2009,
+  author =       "Max Welling",
+  booktitle =    UAI09,
+  title =        "Herding Dynamic Weights for Partially Observed Random Field Models",
+  publisher =    "Morgan Kaufmann",
+  year =         "2009",
+}
+
+@InProceedings{WellingICML2009,
+  author =       "Max Welling",
+  booktitle =    ICML09,
+  editor =       ICML09ed,
+  publisher =    ICML09publ,
+  title =        {Herding Dynamic Weights to Learn},
+  year =         "2009",
+}
+
+@InProceedings{Werbos-icnn88,
+  author =       "P. J. Werbos",
+  booktitle =    icnn,
+  title =        "Back-Propagation: Past and Future",
+  publisher =    "IEEE Press",
+  address =      "New York, NY",
+  year =         "1988",
+  OPTpages =     "343--353",
+}
+
+@PhdThesis{Werbos74,
+  author =       "P. Werbos",
+  title =        "Beyond Regression: New Tools for Prediction and
+                 Analysis in the Behavioral Sciences",
+  school =       "Harvard University",
+  year =         "1974",
+}
+
+@Article{Werbos87,
+  author =       "P. J. Werbos",
+  title =        "Building and Understanding Adaptive Systems: {A}
+                 Statistical/Numerical Approach to Factory Automation
+                 and Brain Research",
+  journal =      ieeesmc,
+  volume =       "17",
+  pages =        "7--20",
+  year =         "1987",
+}
+
+@Article{Werbos88,
+  author =       "P. J. Werbos",
+  title =        "Generalization of Backpropagation with Application to
+                 a Recurrent Gas Market Model",
+  journal =      nn,
+  volume =       "1",
+  pages =        "339--356",
+  year =         "1988",
+}
+
+@InProceedings{wermuth+cox92,
+  author =       "N. Wermuth and D. R. Cox",
+  booktitle =    "Proceedings of the 10th Symposium on Computational
+                 Statistics",
+  title =        "Graphical models for dependencies and associations",
+  volume =       "1",
+  publisher =    "Physica-Verlag",
+  address =      "Heidelberg",
+  pages =        "235--249",
+  year =         "1992",
+}
+
+@Article{wermuth+lauritzen90,
+  author =       "N. Wermuth and S. L. Lauritzen",
+  title =        "On substantive research hypotheses, conditional
+                 independence graphs and graphical chain models",
+  journal =      "J. Roy. Statist. Soc. Ser. B",
+  volume =       "52",
+  pages =        "21--72",
+  year =         "1990",
+}
+
+@Article{Wessels-trnn92,
+  author =       "L. F. A. Wessels and E. Barnard",
+  title =        "Avoiding False Local Minima by Proper Initialization
+                 of Connections",
+  journal =      ieeetrnn,
+  volume =       "3",
+  number =       "6",
+  pages =        "899--905",
+  year =         "1992",
+}
+
+@Article{weston03zeronorm,
+  author =       "Jason Weston and Andr\'e Elisseeff and Bernhard
+                 Sch{\"o}lkopf and Mike Tipping",
+  title =        "Use of the zero norm with linear models and kernel
+                 methods",
+  journal =      jmlr,
+  volume =       "3",
+  publisher =    "MIT Press",
+  pages =        "1439--1461",
+  year =         "2003",
+  ISSN =         "1533-7928",
+}
+
+@InProceedings{weston99density,
+  author =       "J. Weston and A. Gammerman and M. Stitson and V.
+                 Vapnik and V. Vovk and C. Watkins",
+  editor =       "B. {Sch\"olkopf} and C. J. C. Burges and A. J. Smola",
+  booktitle =    "Advances in Kernel Methods --- Support Vector
+                 Learning",
+  title =        "Density estimation using support vector machines",
+  publisher =    "MIT Press",
+  address =      "Cambridge, MA",
+  pages =        "293--306",
+  year =         "1999",
+}
+
+% doi holds the bare identifier (no resolver prefix); styles add the link.
+@InProceedings{WestonJ2008,
+  author =       "Jason Weston and Fr{\'e}d{\'e}ric Ratle and Ronan
+                 Collobert",
+  booktitle =    ICML08,
+  editor =       ICML08ed,
+  publisher =    ICML08publ,
+  title =        "Deep Learning via Semi-Supervised Embedding",
+  year =         "2008",
+  isbn =         {978-1-60558-205-4},
+  pages =        {1168--1175},
+  location =     {Helsinki, Finland},
+  doi =          {10.1145/1390156.1390303},
+  address =      {New York, NY, USA},
+}
+  %url =          "http://www.kyb.tuebingen.mpg.de/bs/people/weston/papers/deep-embed.pdf",
+
+@InProceedings{WestonJ2008-small,
+  author =       "J. Weston and F. Ratle and R. Collobert",
+  booktitle =    "ICML 2008",
+  title =        "Deep Learning via Semi-Supervised Embedding",
+  year =         "2008",
+}
+
+@InProceedings{WestonJ2008-short,
+  author =       "J. Weston and F. Ratle and R. Collobert",
+  booktitle =    "Int. Conf. Mach. Learn. 2008",
+  title =        "Deep Learning via Semi-Supervised Embedding",
+  year =         "2008",
+  pages = {1168--1175},
+}
+
+@InProceedings{MobahiCollobertWestonICML2009,
+  author =    {Hossein Mobahi and Ronan Collobert and Jason Weston},
+  title =     {Deep Learning from Temporal Coherence in Video},
+  booktitle = {Proceedings of the 26th International Conference on Machine Learning},
+  pages =     {737--744},
+  year =      2009,
+  editor =    {L{\'e}on Bottou and Michael Littman},
+  address =   {Montreal},
+  month =     jun,
+  publisher = {Omnipress}
+}
+
+@Article{White89,
+  author =       "H. White",
+  title =        "Learning in Artificial Neural Networks: {A}
+                 Statistical Perspective",
+  journal =      "Neural Computation",
+  volume =       "1",
+  type =         "Review",
+  number =       "4",
+  pages =        "425--464",
+  year =         "1989",
+}
+
+@Article{White90,
+  author =       "H. White",
+  title =        "Connectionist nonparametric regression: {Multilayer}
+                 feedforward networks can learn arbitrary mappings",
+  journal =      "Neural Networks",
+  volume =       "3",
+  number =       "5",
+  publisher =    "Pergamon Press Ltd., Inc.",
+  pages =        "535--549",
+  year =         "1990",
+}
+
+% TODO: incomplete entry -- booktitle is a "?" placeholder and pages is
+% empty; the venue still has to be identified before this can be cited.
+@InProceedings{White91,
+  author =       "H. White",
+  booktitle =    "?",
+  title =        "An overview of representation and convergence results
+                 for multilayer feedforward networks",
+  pages =        "",
+  year =         "1991",
+}
+
+% address holds the conference city; the year is given only in the year
+% field so styles do not print it twice.
+@InProceedings{Whitley89,
+  author =       "D. Whitley and T. Hanson",
+  editor =       "J. D. Schaffer",
+  booktitle =    "Proceedings of the Third International Conference on
+                 Genetic Algorithms",
+  title =        "Optimizing Neural Networks Using Faster, More Accurate
+                 Genetic Search",
+  publisher =    "Morgan Kaufmann, San Mateo",
+  address =      "Arlington",
+  pages =        "391--396",
+  year =         "1989",
+}
+
+@Book{whittaker90,
+  author =       "J. Whittaker",
+  title =        "Graphical Models in Applied Multivariate Statistics",
+  publisher =    "Wiley",
+  address =      "Chichester",
+  year =         "1990",
+}
+
+@InCollection{Widrow60,
+  author =       "B. Widrow and M. E. Hoff",
+  booktitle =    "1960 IRE WESCON Convention Record",
+  title =        "Adaptive Switching Circuits",
+  volume =       "4",
+  publisher =    "IRE",
+  address =      "New York",
+  pages =        "96--104",
+  year =         "1960",
+}
+
+% address holds the conference city; the year is given only in the year
+% field so styles do not print it twice.
+@InProceedings{Widrow62,
+  author =       "B. Widrow",
+  editor =       "M. C. Yovits and G. T. Jacobi and G. D. Goldstein",
+  booktitle =    "Self-Organizing Systems 1962",
+  title =        "Generalization and Information Storage in Networks of
+                 Adaline ``Neurons''",
+  publisher =    "Spartan, Washington",
+  address =      "Chicago",
+  pages =        "435--461",
+  year =         "1962",
+}
+
+@article{Widrow73,
+  author  = {B. Widrow and N. K. Gupta and S. Maitra},
+  title   = {Punish/Reward: Learning with a Critic in Adaptive
+             Threshold Systems},
+  journal = ieeesmc,
+  volume  = {3},
+  pages   = {455--465},
+  year    = {1973},
+}
+
+@book{Wiener48,
+  author    = {N. Wiener},
+  title     = {Cybernetics, or Control and Communication in the
+               Animal and the Machine},
+  publisher = {Wiley},
+  address   = {New York},
+  year      = {1948},
+}
+
+@book{Wiener49,
+  author    = {N. Wiener},
+  title     = {The Extrapolation, Interpolation and Smoothing of
+               Stationary Time Series with Engineering Applications},
+  publisher = {Wiley},
+  address   = {New York},
+  year      = {1949},
+}
+
+@article{Wilbur+Lipman83,
+  author  = {W. J. Wilbur and D. J. Lipman},
+  title   = {Rapid similarity searches of nucleic acids and protein
+             data banks},
+  journal = {Proc. Natl. Acad. Sci. USA},
+  volume  = {80},
+  pages   = {726--730},
+  year    = {1983},
+}
+
+@techreport{Wilks1996,
+  author      = {Yorick Wilks and Mark Stevenson},
+  title       = {The grammar of sense: Is word sense tagging much more
+                 than part-of-speech tagging?},
+  institution = {University of Sheffield},
+  year        = {1996},
+}
+
+@article{Williams+Barclay88,
+  author  = {A. F. Williams and A. N. Barclay},
+  title   = {The immunoglobulin superfamily domains for cell
+             surface recognition},
+  journal = {Annual Review of Immunology},
+  volume  = {6},
+  pages   = {381--405},
+  year    = {1988},
+}
+
+@InProceedings{Williams+Rasmussen-nips8,
+  author =       "C. K. I. Williams and C. E. Rasmussen",
+  editor =       NIPS8ed,
+  booktitle =    NIPS8,
+  title =        "{Gaussian} Processes for Regression",
+  publisher =    "MIT Press, Cambridge, MA",
+  pages =        "514--520",
+  year =         "1996",
+}
+
+@inproceedings{Williams+Seeger-2000,
+  author    = {C. K. I. Williams and M. Seeger},
+  title     = {The Effect of the Input Density Distribution on
+               Kernel-based Classifiers},
+  booktitle = {Proceedings of the Seventeenth International
+               Conference on Machine Learning},
+  publisher = {Morgan Kaufmann},
+  year      = {2000},
+}
+
+@inproceedings{Williams+Seeger-2001,
+  author    = {Christopher K. I. Williams and Matthias Seeger},
+  title     = {Using the {Nystr{\"o}m} Method to Speed Up Kernel
+               Machines},
+  editor    = NIPS13ed,
+  booktitle = NIPS13,
+  publisher = {MIT Press},
+  address   = {Cambridge, MA},
+  pages     = {682--688},
+  year      = {2001},
+}
+
+@inproceedings{Williams2001,
+  author    = {C. K. I. Williams},
+  title     = {On a Connection between Kernel {PCA} and Metric
+               Multidimensional Scaling},
+  editor    = NIPS13ed,
+  booktitle = NIPS13,
+  publisher = {{MIT} Press},
+  pages     = {675--681},
+  year      = {2001},
+}
+
+@inproceedings{Williams87,
+  author    = {R. J. Williams},
+  title     = {A Class of Gradient-Estimating Algorithms for
+               Reinforcement Learning in Neural Networks},
+  editor    = {M. Caudill and C. Butler},
+  booktitle = icnn,
+  volume    = {2},
+  publisher = {IEEE, New York},
+  address   = {San Diego 1987},
+  pages     = {601--608},
+  year      = {1987},
+}
+
+@inproceedings{Williams88a,
+  author    = {R. J. Williams},
+  title     = {On the Use of Back-Propagation in Associative
+               Reinforcement Learning},
+  booktitle = icnn,
+  volume    = {1},
+  publisher = {IEEE, New York},
+  address   = {San Diego 1988},
+  pages     = {263--270},
+  year      = {1988},
+}
+
+@techreport{Williams88b,
+  author      = {R. J. Williams},
+  title       = {Towards a Theory of Reinforcement-Learning
+                 Connectionist Systems},
+  institution = {College of Computer Science, Northeastern University},
+  number      = {NU--CCS--88--3},
+  address     = {Boston, MA},
+  year        = {1988},
+}
+
+@inproceedings{Williams89a,
+  author    = {R. J. Williams and J. Peng},
+  title     = {Reinforcement Learning Algorithms As Function
+               Optimizers},
+  booktitle = ijcnn,
+  volume    = {2},
+  publisher = {IEEE, New York},
+  address   = {Washington 1989},
+  pages     = {89--95},
+  year      = {1989},
+}
+
+@article{Williams89b,
+  author  = {R. J. Williams and D. Zipser},
+  title   = {A Learning Algorithm for Continually Running Fully
+             Recurrent Neural Networks},
+  journal = nc,
+  volume  = {1},
+  pages   = {270--280},
+  year    = {1989},
+}
+
+@article{Williams89c,
+  author  = {R. J. Williams and D. Zipser},
+  title   = {Experimental Analysis of the Real-Time Recurrent
+             Learning Algorithm},
+  journal = connsci,
+  volume  = {1},
+  pages   = {87--111},
+  year    = {1989},
+}
+
+@inproceedings{Williams93,
+  author    = {William Evans and Sridhar Rajagopalan and Umesh
+               Vazirani},
+  title     = {Choosing a Reliable Hypothesis},
+  booktitle = {Proceedings of the 6th Annual Conference on
+               Computational Learning Theory},
+  publisher = {ACM Press},
+  address   = {Santa Cruz, CA, USA},
+  pages     = {269--276},
+  month     = jul,
+  year      = {1993},
+  isbn      = {0-89791-611-5},
+}
+
+@InProceedings{williams95gaussian,
+  author =       "Christopher K. I. Williams and Carl Edward Rasmussen",
+  editor =       NIPS8ed,
+  booktitle =    NIPS8,
+  title =        "{Gaussian} Processes for Regression",
+  volume =       "8",
+  publisher =    "{MIT} Press",
+  year =         "1996",
+  ISBN =         "0-262-20107-0",
+}
+
+@inproceedings{Williams96-nips,
+  author    = {C. K. I. Williams},
+  title     = {Computing with infinite networks},
+  editor    = NIPS9ed,
+  booktitle = NIPS9,
+  publisher = {MIT Press},
+  year      = {1997},
+}
+
+@inproceedings{WilliamsC1990,
+  author    = {Christopher K. I. Williams and Geoffrey E. Hinton},
+  title     = {Mean field networks that learn to discriminate temporally distorted strings},
+  booktitle = {Connectionist Models: Proceedings of the 1990 Connectionist Summer School},
+  address   = {San Mateo, CA},
+  year      = {1990},
+}
+
+@article{Willshaw69,
+  author  = {D. J. Willshaw and O. P. Buneman and H. C.
+             Longuet-Higgins},
+  title   = {Non-Holographic Associative Memory},
+  journal = nature,
+  volume  = {222},
+  year    = {1969},
+}
+
+@article{Willshaw76,
+  author  = {D. J. Willshaw and C. von der Malsburg},
+  title   = {How Patterned Neural Connections Can Be Set Up by
+             Self-Organization},
+  journal = PRSLB,
+  volume  = {194},
+  pages   = {431--445},
+  year    = {1976},
+}
+
+@article{Wilson-2003,
+  author    = {D. Randall Wilson and Tony R. Martinez},
+  title     = {The general inefficiency of batch training for
+               gradient descent learning},
+  journal   = {Neural Networks},
+  volume    = {16},
+  number    = {10},
+  pages     = {1429--1451},
+  year      = {2003},
+  publisher = {Elsevier Science Ltd.},
+  address   = {Oxford, UK},
+  issn      = {0893-6080},
+}
+
+@inproceedings{Wilson2007,
+  author    = {D. Keith Wilson},
+  title     = {Weather effects and outdoor noise exposure: Where,
+               when, and how often to measure?},
+  booktitle = {Proceedings of NOISE-CON 2007},
+  address   = {Reno, Nevada},
+  year      = {2007},
+}
+
+@article{Wilson73,
+  author  = {H. R. Wilson and J. D. Cowan},
+  title   = {A Mathematical Theory of the Functional Dynamics of
+             Cortical and Thalamic Nervous Tissue},
+  journal = kyb,
+  volume  = {13},
+  pages   = {55--80},
+  year    = {1973},
+}
+
+@Article{Wilson88,
+  author =       "G. V. Wilson and G. S. Pawley",
+  title =        "On the Stability of the Travelling Salesman Problem
+                 Algorithm of {Hopfield} and {Tank}",
+  journal =      biocyb,
+  volume =       "58",
+  pages =        "63--70",
+  year =         "1988",
+}
+
+@inproceedings{wilson97instance,
+  author    = {D. Randall Wilson and Tony R. Martinez},
+  title     = {Instance pruning techniques},
+  booktitle = {Proc. 14th International Conference on Machine
+               Learning},
+  publisher = {Morgan Kaufmann},
+  pages     = {403--411},
+  year      = {1997},
+  url       = {citeseer.nj.nec.com/wilson97instance.html},
+}
+
+@book{Winograd63,
+  author    = {S. Winograd and J. D. Cowan},
+  title     = {Reliable Computation in the Presence of Noise},
+  publisher = {MIT Press},
+  address   = {Cambridge},
+  year      = {1963},
+}
+
+@article{Winters89,
+  author  = {J. H. Winters and C. Rose},
+  title   = {Minimum Distance Automata in Parallel Networks for
+             Optimum Classification},
+  journal = nn,
+  volume  = {2},
+  pages   = {127--132},
+  year    = {1989},
+}
+
+@Article{WisSej2002,
+  author =       "L. Wiskott and T. J. Sejnowski",
+  title =        "Slow Feature Analysis: Unsupervised Learning of
+                 Invariances",
+  journal =      "Neural Computation",
+  volume =       "14",
+  number =       "4",
+  pages =        "715--770",
+  year =         "2002",
+  urlabstract =  "{http://itb.biologie.hu-berlin.de/~wiskott/Abstracts/WisSej2002.html}",
+  urlpaper =     "{http://itb.biologie.hu-berlin.de/~wiskott/Publications/WisSej2002-LearningInvariances-NC.ps.gz}",
+}
+
+@techreport{Witbrock+Zagha-1989,
+  author      = {Michael Witbrock and Marco Zagha},
+  title       = {An Implementation of Back-Propagation Learning on
+                 {GF11}, a Large {SIMD} Parallel Computer},
+  institution = {Carnegie Mellon University},
+  number      = {CMU-CS-89-208},
+  year        = {1989},
+}
+
+@book{Wittgenstein58,
+  author    = {L. Wittgenstein},
+  title     = {Philosophical Investigations},
+  publisher = {Blackwell},
+  address   = {Oxford},
+  year      = {1958},
+}
+
+@inproceedings{Wittner88,
+  author    = {B. S. Wittner and J. S. Denker},
+  title     = {Strategies for Teaching Layered Networks
+               Classification Tasks},
+  editor    = nips87ed,
+  booktitle = nips87,
+  publisher = {American Institute of Physics, New York},
+  address   = {Denver, CO},
+  pages     = {850--859},
+  year      = {1988},
+}
+
+@book{WL90,
+  author    = {A. Waibel and K. F. Lee},
+  title     = {Readings in Speech Recognition},
+  publisher = {Morgan Kaufmann},
+  year      = {1990},
+}
+
+@article{Wolpert-1996,
+  author  = {D. H. Wolpert},
+  title   = {The lack of a priori distinction between learning
+             algorithms},
+  journal = {Neural Computation},
+  volume  = {8},
+  number  = {7},
+  pages   = {1341--1390},
+  year    = {1996},
+}
+
+@article{Wolpert92,
+  author  = {D. H. Wolpert},
+  title   = {Stacked Generalization},
+  journal = {Neural Networks},
+  volume  = {5},
+  pages   = {241--249},
+  year    = {1992},
+}
+
+@techreport{wolpert95,
+  author      = {D. Wolpert and W. Macready},
+  title       = {No free lunch theorems for search},
+  institution = {The Santa Fe Institute},
+  number      = {SFI-TR-95-02-010},
+  year        = {1995},
+}
+
+@article{wolpert96no,
+  author =       "D. Wolpert and W. Macready",
+  title =        "No free lunch theorems for optimization",
+  year =         "1997",
+  journal =      "IEEE Transactions on Evolutionary Computation",
+  volume =       1,
+  pages =       {67--82},
+}
+
+@book{wordnet-book98,
+  author    = {Christiane Fellbaum},
+  title     = {{WordNet}: An Electronic Lexical Database},
+  publisher = {MIT Press},
+  year      = {1998},
+}
+
+@techreport{wrong-delve-citation,
+  author      = {G. Hinton and R. Neal and R. Tibshirani},
+  title       = {Assessing learning procedures using {DELVE}},
+  institution = {University of Toronto, Department of Computer Science,
+                 http://www.cs.utoronto.ca/neuron/delve/delve.html.},
+  year        = {1995},
+}
+
+@Article{Wu-97,
+  author =       "Zhijun Wu",
+  title =        "Global continuation for distance geometry problems",
+  journal =      "{SIAM} Journal on Optimization",
+  volume =       "7",
+  pages =        "814--836",
+  year =         "1997",
+}
+
+@article{Wu-97-short,
+  author  = {Z. Wu},
+  title   = {Global continuation for distance geometry problems},
+  journal = {{SIAM} J. Optimization},
+  volume  = {7},
+  pages   = {814--836},
+  year    = {1997},
+}
+
+@article{Wu97,
+  author  = {C. H. Wu},
+  title   = {Artificial neural networks for molecular sequence
+             analysis},
+  journal = {Comp. Chem.},
+  volume  = {21},
+  pages   = {237--256},
+  year    = {1997},
+}
+
+@InProceedings{XingE2005,
+  author =       "Eric P. Xing and Rong Yan and Alexander G. Hauptmann",
+  booktitle =    UAI05,
+  title =        "Mining Associated Text and Images with Dual-Wing
+                 Harmoniums",
+  publisher =    "AUAI Press",
+  pages =        "633--641",
+  year =         "2005",
+  ISBN =         "0-9749039-1-4",
+  OPTdate =      "2007-07-26",
+  OPTcrossref =  "conf/uai/2005",
+  OPTdescription = "dblp",
+  OPTee =        "http://uai.sis.pitt.edu/displayArticleDetails.jsp?mmnu=1&smnu=2&article-id=1184&proceeding-id=21",
+  OPTkeywords =  "dblp",
+}
+  %url =       "http://dblp.uni-trier.de/db/conf/uai/uai2005.html#XingYH05",
+
+@InProceedings{Xu+Rudnicky-2000,
+  author =       "Wei Xu and Alex Rudnicky",
+  booktitle =    "International Conference on Spoken Language
+                 Processing",
+  title =        "Can Artificial Neural Networks Learn Language Models?",
+  address =      "Beijing, China",
+  pages =        "M1--13",
+  year =         "2000",
+}
+
+@inproceedings{Xu-Emami-Jelinek-2003,
+  author    = {P. Xu and A. Emami and F. Jelinek},
+  title     = {Training Connectionist Models for the Structured
+               Language Model},
+  booktitle = {Proceedings of the 2003 Conference on Empirical
+               Methods in Natural Language Processing (EMNLP'2003)},
+  volume    = {10},
+  pages     = {160--167},
+  year      = {2003},
+}
+
+@misc{xu-jordan-94,
+  author  = {L. Xu and M. I. Jordan},
+  title   = {Theoretical and experimental studies of convergence
+             properties of the {EM} algorithm for unsupervised
+             learning based on finite mixtures},
+  note    = {Presented at the Neural Networks for Computing
+             Conference},
+  address = {Snowbird, UTAH},
+  pages   = {},
+  year    = {1994},
+}
+
+@inproceedings{xuetal04,
+author = "Xu, L. and Neufeld, J. and Larson, B. and Schuurmans, D.",
+title = "Maximum margin clustering",
+editor =       NIPS17ed,
+booktitle =    NIPS17,
+year = 2005,
+}
+
+@inproceedings{Xu-ICML-2006,
+  author    = {Xu, L. and Wilkinson, D. and Southey, F. and Schuurmans, D.},
+  title     = {Discriminative unsupervised learning of structured predictors},
+  editor    = ICML06ed,
+  booktitle = ICML06,
+  publisher = ICML06publ,
+  year      = {2006},
+}
+
+@inproceedings{Xu-AAAI-2006,
+  author    = {L. Xu and K. Crammer and D. Schuurmans},
+  title     = {Robust support vector machine training via convex
+               outlier ablation},
+  booktitle = {Twenty-first National Conference on Artificial
+               Intelligence (AAAI-06)},
+  year      = {2006},
+}
+
+
+
+@misc{YA97a,
+  author = {Howard Hua Yang and {Shun-ichi} Amari},
+  title  = {Natural Gradient Descent for Training Multi-Layer
+            Perceptrons},
+  year   = {1997},
+  url    = {citeseer.ist.psu.edu/hua96natural.html},
+}
+
+@article{yang98complexity,
+  author  = {Howard Hua Yang and {Shun-ichi} Amari},
+  title   = {Complexity Issues in Natural Gradient Descent Method
+             for Training Multi-Layer Perceptrons},
+  journal = {Neural Computation},
+  volume  = {10},
+  number  = {8},
+  pages   = {2137--2157},
+  year    = {1998},
+  url     = {citeseer.ist.psu.edu/91462.html},
+}
+
+@inproceedings{Yang+al-2006,
+    author = {Xin Yang and Haoying Fu and Hongyuan Zha and Jesse Barlow},
+    title = {Semi-supervised nonlinear dimensionality reduction},
+    booktitle = {Proceedings of the 23rd International Conference on Machine Learning},
+    year = {2006},
+    isbn = {1-59593-383-2},
+    pages = {1065--1072},
+    location = {Pittsburgh, Pennsylvania},
+    doi = {10.1145/1143844.1143978},
+    publisher = {ACM},
+    address = {New York, NY, USA},
+}
+
+@misc{Yang+Jin-2006,
+    author = {Liu Yang and Rong Jin},
+    title = {Distance Metric Learning: A Comprehensive Survey},
+    year = 2006,
+    note = {\url{http://www.cse.msu.edu/~yangliu1/frame\_survey\_v2.pdf}},
+}
+
+@misc{Yang-2007,
+    author = {Liu Yang},
+    title = {An Overview of Distance Metric Learning},
+    year = 2007,
+    note = {\url{http://www.cse.msu.edu/~yangliu1/dist\_overview.pdf}},
+}
+
+@inproceedings{YangL2007,
+  author    = {Liu Yang and Rong Jin and Caroline Pantofaru and Rahul
+               Sukthankar},
+  title     = {Discriminative Cluster Refinement: Improving Object
+               Category Recognition Given Limited Training Data},
+  booktitle = cvpr07,
+  month     = jun,
+  year      = {2007},
+}
+
+@inproceedings{Yao85,
+  author    = {Andrew Yao},
+  title     = {Separating the polynomial-time hierarchy by oracles},
+  booktitle = {Proceedings of the 26th Annual {IEEE} Symposium on
+               Foundations of Computer Science},
+  pages     = {1--10},
+  year      = {1985},
+}
+
+@inproceedings{Yarowsky-92,
+  author    = {David Yarowsky},
+  title     = {Word-sense disambiguation using statistical models of
+               {Roget}'s categories trained on large corpora},
+  booktitle = {Proceedings of the 14th International Conference on
+               Computational Linguistics (COLING-92)},
+  address   = {Nantes, France},
+  pages     = {454--460},
+  year      = {1992},
+}
+
+@inproceedings{Yarowsky-93,
+  author    = {David Yarowsky},
+  title     = {One sense per collocation},
+  booktitle = {{ARPA} Workshop on Human Language Technology},
+  address   = {Princeton, {NJ}},
+  year      = {1993},
+}
+
+@inproceedings{Yarowsky-95,
+  author    = {David Yarowsky},
+  title     = {Unsupervised word sense disambiguation rivaling
+               supervised methods},
+  booktitle = {33rd Annual Meeting of the {ACL}},
+  address   = {Cambridge, {MA}},
+  pages     = {189--196},
+  year      = {1995},
+}
+
+@InProceedings{Yarowsky1994,
+  author =       "David Yarowsky",
+  booktitle =    "Meeting of the Association for Computational
+                 Linguistics",
+  title =        "Decision Lists for Lexical Ambiguity Resolution:
+                 Application to Accent Restoration in {Spanish} and
+                 {French}",
+  pages =        "88--95",
+  year =         "1994",
+  URL =          "citeseer.nj.nec.com/yarowsky94decision.html",
+}
+
+@inproceedings{Yarowsky1995,
+  author    = {David Yarowsky},
+  title     = {Unsupervised Word Sense Disambiguation Rivaling
+               Supervised Methods},
+  booktitle = {Meeting of the Association for Computational
+               Linguistics},
+  pages     = {189--196},
+  year      = {1995},
+  url       = {citeseer.nj.nec.com/yarowsky95unsupervised.html},
+}
+
+@techreport{Yianilos95,
+  author      = {Peter N. Yianilos},
+  title       = {Metric Learning via Normal Mixtures},
+  institution = {NEC Research Institute},
+  address     = {Princeton, NJ},
+  month       = oct,
+  year        = {1995},
+}
+
+@InProceedings{Younes98onthe,
+    author = {Laurent Younes},
+    title = {On The Convergence Of {Markovian} Stochastic Algorithms With Rapidly Decreasing Ergodicity Rates},
+    booktitle = {Stochastics and Stochastics Models},
+    year = {1998},
+    pages = {177--228}
+}
+
+@article{Young+Sachs79,
+  author  = {E. D. Young and M. B. Sachs},
+  title   = {Representation of steady-state vowels in the temporal
+             aspects of the discharge pattern of population of
+             auditory nerve fibers},
+  journal = jasa,
+  volume  = {66},
+  number  = {5},
+  pages   = {1381--1403},
+  year    = {1979},
+}
+
+@inproceedings{Yu+Simmons90,
+  author    = {Y. H. Yu and R. F. Simmons},
+  title     = {Extra output biased learning},
+  booktitle = ijcnn,
+  publisher = {Lawrence Erlbaum, Hillsdale},
+  address   = {Washington 1990},
+  year      = {1990},
+}
+
+@article{Yu-trnn92,
+  author  = {X. H. Yu},
+  title   = {Can Backpropagation Error Surface Not Have Local
+             Minima?},
+  journal = ieeetrnn,
+  volume  = {3},
+  number  = {6},
+  pages   = {1019--1020},
+  year    = {1992},
+}
+
+@article{Yu92,
+  author  = {X. H. Yu},
+  title   = {Can Backpropagation Error Surface Not Have Local
+             Minima?},
+  journal = ieeetrnn,
+  volume  = {3},
+  number  = {6},
+  pages   = {1019--1020},
+  year    = {1992},
+}
+
+@inproceedings{Yuille2005,
+  author    = {Alan L. Yuille},
+  title     = {The Convergence of Contrastive Divergences},
+  editor    = NIPS17ed,
+  booktitle = NIPS17,
+  publisher = {{MIT} Press},
+  pages     = {1593--1600},
+  year      = {2005},
+}
+
+@Article{Yuille89,
+  author =       "Alan L. Yuille and D. M. Kammen and D. S. Cohen",
+  title =        "Quadrature and the Development of Orientation
+                 Selective Cortical Cells by {Hebb} Rules",
+  journal =      biocyb,
+  volume =       "61",
+  pages =        "183--194",
+  year =         "1989",
+}
+
+@article{Yuille90,
+  author  = {Alan L. Yuille},
+  title   = {Generalized Deformable Models, Statistical Physics,
+             and Matching Problems},
+  journal = {Neural Computation},
+  volume  = {2},
+  number  = {1},
+  pages   = {1--24},
+  year    = {1990},
+}
+
+@article{Zak-nn92,
+  author  = {M. Zak},
+  title   = {Terminal Attractors in Neural Networks},
+  journal = nn,
+  volume  = {2},
+  pages   = {259--274},
+  year    = {1989},
+}
+
+@article{Zak88,
+  author  = {M. Zak},
+  title   = {Terminal Attractors for Addressable Memory in Neural
+             Networks},
+  journal = plettA,
+  volume  = {133},
+  pages   = {18--22},
+  year    = {1988},
+}
+
+@article{Zak89,
+  author  = {M. Zak},
+  title   = {Terminal Attractors in Neural Networks},
+  journal = nn,
+  volume  = {2},
+  pages   = {259--274},
+  year    = {1989},
+}
+
+@article{Zavaliagkos93,
+  author  = {G. Zavaliagkos and S. Austin and J. Makhoul and R.
+             Schwartz},
+  title   = {A Hybrid Continuous Speech Recognition System Using
+             Segmental Neural Nets with Hidden {Markov} Models},
+  journal = {Int. Journal of Pattern Recognition and Artificial
+             Intelligence},
+  pages   = {305--319},
+  year    = {1993},
+  note    = {Special Issue on Applications of Neural Networks to
+             Pattern Recognition (I. Guyon Ed.)},
+}
+
+@inproceedings{Zell+al-1993,
+  author    = {Andreas Zell and Niels Mache and Michael Vogt and
+               Markus H{\"u}ttel},
+  title     = {Problems of Massive Parallelism in Neural Network
+               Simulation},
+  booktitle = {Proceedings of the IEEE International Conference on
+               Neural Networks},
+  volume    = {3},
+  address   = {San Francisco, CA},
+  pages     = {1890--1895},
+  year      = {1993},
+}
+
+@inproceedings{Zemel90,
+  author    = {R. S. Zemel and M. C. Mozer and G. E. Hinton},
+  title     = {Recognizing objects using hierarchical reference frame
+               transformations},
+  editor    = NIPS2ed,
+  booktitle = NIPS2,
+  address   = {San Mateo, CA},
+  year      = {1990},
+}
+
+@phdthesis{Zemel93-thesis,
+  author = {Richard S. Zemel},
+  title  = {A Minimum Description Length Framework for
+            Unsupervised Learning},
+  school = {University of Toronto},
+  year   = {1993},
+}
+
+@inproceedings{Zha2002,
+  author    = {H. Zha and C. Ding and M. Gu and X. He and H. Simon},
+  title     = {Spectral relaxation for {K}-means clustering},
+  editor    = NIPS14ed,
+  booktitle = NIPS14,
+  publisher = {{MIT} Press},
+  year      = {2002},
+}
+
+@InProceedings{Zhang-nips90,
+  author =       "X. Zhang and others",
+  editor =       NIPS2ed,
+  booktitle =    NIPS2,
+  title =        "An Efficient Implementation of the Backpropagation
+                 Algorithm on the Connection Machine {CM}-2",
+  publisher =    "Morgan Kaufmann",
+  address =      "San Mateo, CA",
+  pages =        "801--809",
+  year =         "1990",
+}
+
+@misc{zhang-workshop-2005,
+  author       = {Jian Zhang},
+  title        = {Sparsity Models for Multi-task Learning},
+  howpublished = {'Inductive Transfer: 10 Years Later' NIPS Workshop},
+  year         = {2005},
+  OPTkey       = {},
+}
+
+@TechReport{Zhang2001,
+  author =       "Bin Zhang",
+  title =        "Is the Maximal Margin Hyperplane Special in a Feature
+                 Space?",
+  number =       "HPL-2001-89",
+  institution =  "Hewlett-Packard Labs",
+  year =         "2001",
+}
+
+@article{Zhang+Zha-2005,
+  author    = {Zhang, Zhenyue   and Zha, Hongyuan  },
+  title     = {Principal Manifolds and Nonlinear Dimensionality Reduction via Tangent Space Alignment},
+  journal   = {SIAM Journal on Scientific Computing},
+  volume    = {26},
+  number    = {1},
+  pages     = {313--338},
+  year      = {2005},
+  publisher = {Society for Industrial and Applied Mathematics},
+  address   = {Philadelphia, PA},
+  doi       = {10.1137/S1064827502419154},
+  issn      = {1064-8275},
+  url       = {http://portal.acm.org/citation.cfm?id=1024004.1039898}
+}
+
+@inproceedings{Zhang+al-2007,
+  author    = {D. Zhang and Z. H. Zhou and S. Chen},
+  title     = {Semi-supervised dimensionality reduction},
+  booktitle = {Proceedings of the 7th SIAM International Conference on Data Mining},
+  address   = {Minneapolis, MN},
+  year      = {2007},
+}
+
+@article{Zhao+al-2006,
+    author = {Haitao Zhao and Shaoyuan Sun and Zhongliang Jing and Jingyu Yang},
+    title = {Local structure based supervised feature extraction},
+    journal = {Pattern Recognition},
+    volume = {39},
+    number = {8},
+    year = {2006},
+    issn = {0031-3203},
+    pages = {1546--1550},
+    doi = {10.1016/j.patcog.2006.02.023},
+    publisher = {Elsevier Science Inc.},
+    address = {New York, NY, USA},
+}
+
+@inproceedings{Zhou+al-2004,
+  author    = {D. Zhou and O. Bousquet and T. {Navin Lal} and J.
+               Weston and B. Sch{\"o}lkopf},
+  title     = {Learning with local and global consistency},
+  editor    = NIPS16ed,
+  booktitle = NIPS16,
+  publisher = {MIT Press},
+  address   = {Cambridge, MA},
+  pages     = {321--328},
+  year      = {2004},
+  keywords  = {semi-supervised learning, manifold, kernel methods},
+}
+
+@inproceedings{Zhou+Dapkus-1995,
+  author    = {J. Zhou and P. Dapkus},
+  title     = {Automatic Suggestion of Significant Terms for a
+               Predefined Topic},
+  booktitle = {Proceedings of the Third Workshop on Very Large
+               Corpora},
+  address   = {Cambridge},
+  pages     = {131--147},
+  year      = {1995},
+}
+
+@inproceedings{Zhou+Tanner-1997,
+  author    = {Joe Zhou and Troy Tanner},
+  title     = {Construction and visualization of key term
+               hierarchies},
+  booktitle = {Proceedings of the fifth conference on Applied natural
+               language processing},
+  publisher = {Morgan Kaufmann Publishers Inc.},
+  address   = {San Francisco, CA, USA},
+  pages     = {307--311},
+  year      = {1997},
+  location  = {Washington, DC},
+}
+
+@InProceedings{zhou2002,
+  author =       "Z.-H. Zhou and M.-L. Zhang",
+  booktitle =    "Proceedings of the International Conference on
+                 Intelligent Information Technology",
+  title =        "Neural Networks for Multi-Instance Learning",
+  address =      "Beijing, China",
+  year =         "2002",
+  pages =        "455--459",
+}
+
+@inproceedings{ZhouX2007,
+  author      = {Xiaojin Zhu and Timothy J. Rogers and Ruichen Qian and
+                 Chuck Kalish},
+  title       = {Humans Perform Semi-Supervised Classification Too.},
+  booktitle   = {AAAI},
+  publisher   = {AAAI Press},
+  pages       = {864},
+  year        = {2007},
+  isbn        = {978-1-57735-323-2},
+  url         = {http://dblp.uni-trier.de/db/conf/aaai/aaai2007.html#ZhuRQK07},
+  date        = {2007-09-05},
+  description = {dblp},
+  keywords    = {dblp},
+}
+
+@article{Zhu2009,
+ author = {Long Zhu and Yuanhao Chen and Alan Yuille},
+ title = {Unsupervised Learning of Probabilistic Grammar-{Markov} Models for Object Categories},
+ journal = {{IEEE} Transactions on Pattern Analysis and Machine Intelligence},
+ volume = 31,
+ number = 1,
+ pages = {114--128},
+ year = 2009,
+}
+
+@inproceedings{Zhu+al-2003,
+  author    = {Xiaojin Zhu and Zoubin Ghahramani and John Lafferty},
+  title     = {Semi-supervised learning using {Gaussian} fields and
+               harmonic functions},
+  editor    = ICML03ed,
+  booktitle = ICML03,
+  publisher = ICML03publ,
+  pages     = {912--919},
+  year      = {2003},
+}
+
+@techreport{Zhu+al-TR2003,
+  author      = {Xiaojin Zhu and John Lafferty and Zoubin Ghahramani},
+  title       = {Semi-Supervised Learning: From {G}aussian Fields to
+                 {G}aussian Processes},
+  institution = {CMU},
+  number      = {CMU-CS-03-175},
+  year        = {2003},
+}
+
+@article{Zhu-2006,
+  author  = {M. Zhu and W. Su and H. A. Chipman},
+  title   = {{LAGO}: {A} computationally efficient approach for
+             statistical detection},
+  journal = {Technometrics},
+  volume  = {48},
+  number  = {2},
+  pages   = {193--205},
+  year    = {2006},
+}
+
+@inproceedings{Zhu-ijcai-2005,
+  author    = {Tingshao Zhu and Russ Greiner and Gerald Haeubl and
+               Kevin Jewell and Bob Price},
+  title     = {Using Learned Browsing Behavior Models to Recommend
+               Relevant Web Pages},
+  booktitle = {Nineteenth International Joint Conference on
+               Artificial Intelligence (IJCAI-05)},
+  address   = {Edinburgh, U.K.},
+  pages     = {1589--1591},
+  year      = {2005},
+}
+
+@techreport{Zhu-Lafferty-Ghahramani-2003,
+  author      = {Xiaojin Zhu and John Lafferty and Zoubin Ghahramani},
+  title       = {Semi-supervised learning: from {G}aussian fields to
+                 {G}aussian processes},
+  institution = {School of Computer Science, Carnegie Mellon
+                 University},
+  number      = {CMU-CS-03-175},
+  year        = {2003},
+}
+
+@article{zhu-rohwer96,
+  author  = {H. Zhu and R. Rohwer},
+  title   = {No free lunch for cross validation},
+  journal = {Neural Computation},
+  volume  = {8},
+  number  = {7},
+  pages   = {1421--1426},
+  year    = {1996},
+}
+
+% The URL lives in note so that classic styles still print it; the file name
+% contains an underscore, escaped for LaTeX as \_.
+@techreport{zhu05survey,
+  author      = {Xiaojin Zhu},
+  title       = {Semi-Supervised Learning Literature Survey},
+  institution = {Computer Science, University of Wisconsin-Madison},
+  number      = {1530},
+  year        = {2005},
+  note        = {http://www.cs.wisc.edu/$\sim$jerryzhu/pub/ssl\_survey.pdf},
+}
+
+@techreport{ZhuX2002,
+  author      = {Xiaojin Zhu and Zoubin Ghahramani},
+  title       = {Towards semisupervised classification with {Markov} random fields},
+  institution = {Carnegie Mellon University},
+  year        = {2002},
+}
+
+% Venue, editor and publisher come from the ICML03* string macros.
+@InProceedings{Zinkevich-2003,
+  author    = "Martin Zinkevich",
+  title     = "Online convex programming and generalized infinitesimal gradient ascent",
+  editor    = ICML03ed,
+  booktitle = ICML03,
+  publisher = ICML03publ,
+  year      = 2003,
+  pages     = "928--936",
+}
+
+@inproceedings{Zoubin-nips8,
+  author    = {Z. Ghahramani and M. I. Jordan},
+  title     = {Factorial Hidden {Markov} Models},
+  booktitle = NIPS8,
+  editor    = NIPS8ed,
+  publisher = {MIT Press},
+  address   = {Cambridge, MA},
+  year      = {1996},
+}
+
+@inproceedings{Zoubin-nips94,
+  author    = {Z. Ghahramani and M. I. Jordan},
+  title     = {Supervised learning from incomplete data via an {EM} approach},
+  booktitle = NIPS6,
+  editor    = NIPS6ed,
+  publisher = {Morgan Kaufmann},
+  address   = {San Mateo, CA},
+  year      = {1994},
+}
+
+@techreport{Zoubin-tr93,
+  author      = {Z. Ghahramani and M. I. Jordan},
+  title       = {Function approximation via density estimation using the {EM} approach},
+  type        = {Computational Cognitive Science},
+  number      = {TR 9304},
+  institution = {MIT},
+  year        = {1993},
+}
+
+% The number field holds only the report identifier; the style supplies the
+% words "Technical Report".
+% NOTE(review): the original read CRG-TR-91-1, which contradicts the 1996 date;
+% the report is catalogued as CRG-TR-96-2 -- confirm against the CRG list.
+@techreport{Zoubin96,
+  author      = {Z. Ghahramani and G. E. Hinton},
+  title       = {Parameter estimation for linear dynamical systems},
+  institution = {University of Toronto},
+  number      = {CRG-TR-96-2},
+  year        = {1996},
+}
+
+% The number field holds only the report identifier; the style supplies the
+% words "Technical Report".
+% NOTE(review): the original read CRG-TR-91-3, which contradicts the 1996 date;
+% the report is catalogued as CRG-TR-96-3 -- confirm against the CRG list.
+@techreport{Zoubin96b,
+  author      = {Z. Ghahramani and G. E. Hinton},
+  title       = {Switching state-space models},
+  institution = {University of Toronto},
+  number      = {CRG-TR-96-3},
+  year        = {1996},
+}
+
+% Journal name comes from the spcomm string macro.
+@article{Zue90a,
+  author  = {V. Zue and S. Seneff and J. Glass},
+  title   = {Speech database development: {TIMIT} and beyond},
+  journal = spcomm,
+  volume  = {9},
+  number  = {4},
+  pages   = {351--356},
+  month   = aug,
+  year    = {1990},
+}
+
+@inproceedings{Zue90b,
+  author    = {V. Zue and J. Glass and D. Goddeau and D. Goodine and H. Leung and M. McCandless and M. Phillips and J. Polifroni and S. Seneff and D. Whitney},
+  title     = {Recent progress on the {MIT} {VOYAGER} spoken language system},
+  booktitle = {Proc. Int. Conf. Spoken Language Processing},
+  address   = {Kobe, Japan},
+  pages     = {29.6.1},
+  year      = {1990},
+}
+
+% Proceedings title comes from the colt04 string macro.
+@inproceedings{Zwald+al-2004,
+  author    = {Laurent Zwald and Olivier Bousquet and Gilles Blanchard},
+  title     = {Statistical Properties of Kernel Principal Component Analysis},
+  booktitle = colt04,
+  editor    = {John Shawe-Taylor and Yoram Singer},
+  series    = {Lecture Notes in Computer Science},
+  volume    = {3120},
+  publisher = {Springer-Verlag},
+  pages     = {594--608},
+  year      = {2004},
+}
+
+% The citation key keeps its historical spelling; only the author field is corrected.
+@inproceedings{Zweig+Russel-AAAI98,
+  author    = {G. Zweig and S. Russell},
+  title     = {Speech Recognition with Dynamic {Bayesian} Networks},
+  booktitle = {Proceedings of the AAAI Conference},
+  publisher = {AAAI Press},
+  address   = {Madison, Wisconsin},
+  year      = {1998},
+}
+
+% The citation key keeps its historical spelling; only field values are corrected.
+@inproceedings{Zweig+Russel-ICSLP98,
+  author    = {G. Zweig and S. Russell},
+  title     = {Probabilistic Modeling with {Bayesian} Networks for {ASR}},
+  booktitle = {Proceedings of the International Conference on Spoken Language Processing},
+  address   = {Sydney, Australia},
+  year      = {1998},
+}
+
+% Journal name comes from the jasa string macro.
+@article{Zwicker+Terhardt80,
+  author  = {E. Zwicker and E. Terhardt},
+  title   = {Analytical expressions for critical band rate and critical bandwidths as a function of frequency},
+  journal = jasa,
+  volume  = {68},
+  number  = {5},
+  pages   = {1523--1525},
+  year    = {1980},
+}
+
+% Whole-proceedings entry; booktitle is the colt03 string macro.
+@proceedings{colt03,
+  editor    = {Bernhard Sch{\"o}lkopf and Manfred K. Warmuth},
+  title     = {Computational Learning Theory and Kernel Machines, 16th Annual Conference on Computational Learning Theory and 7th Kernel Workshop, {COLT}/Kernel 2003, Washington, {DC}, {USA}, August 24-27, 2003, Proceedings},
+  booktitle = colt03,
+  series    = {Lecture Notes in Computer Science},
+  volume    = {2777},
+  publisher = {Springer},
+  year      = {2003},
+}
+
+% Whole-proceedings entry; title and booktitle carry the same text.
+@proceedings{FOCS3,
+  title        = {Proceedings of the Third Annual Symposium on Switching Circuit Theory and Logical Design},
+  booktitle    = {Proceedings of the Third Annual Symposium on Switching Circuit Theory and Logical Design},
+  organization = {American Institute of Electrical Engineers},
+  address      = {Chicago, Illinois},
+  month        = {7--12} # oct,
+  year         = {1962},
+  crossrefonly = {1},
+  url          = {http://theory.lcs.mit.edu/~dmjones/FOCS/focs.bib},
+}
+
+% Edited volume. acknowledgement refers to the ack-nhfb string macro, which
+% must be defined elsewhere in the file.
+@book{TricksOfTheTrade,
+  editor          = {Genevieve Orr and Klaus-Robert M{\"u}ller},
+  title           = {Neural networks: tricks of the trade},
+  booktitle       = {Neural networks: tricks of the trade},
+  series          = {Lecture Notes in Computer Science},
+  volume          = {1524},
+  publisher       = {Springer-Verlag Inc.},
+  address         = {New York, NY, USA},
+  pages           = {vi + 432},
+  year            = {1998},
+  ISBN            = {3-540-65311-2 (paperback)},
+  ISSN            = {0302-9743},
+  LCCN            = {QA76.87.N4913 1998},
+  bibdate         = {Sat Jan 9 14:35:31 1999},
+  acknowledgement = ack-nhfb,
+  keywords        = {Neural networks (Computer science)},
+}
+
+@article{Besag75pseudolikelihood,
+  author  = {Julian Besag},
+  title   = {Statistical analysis of non-lattice data},
+  journal = {The Statistician},
+  volume  = {24},
+  number  = {3},
+  pages   = {179--195},
+  year    = {1975},
+}
+
+% booktitle carries no leading "In": bibliography styles add that word themselves.
+@inproceedings{Marlin05unsupervisedlearning,
+  author    = {Benjamin Marlin and Richard S. Zemel and Sam T. Roweis},
+  title     = {Unsupervised learning with non-ignorable missing data},
+  booktitle = {Proceedings of the Tenth International Workshop on Artificial Intelligence and Statistics (AISTATS 2005)},
+  pages     = {222--229},
+  year      = {2005},
+}
+
+@phdthesis{MarlinThesis08,
+  author = {Benjamin M. Marlin},
+  title  = {Missing Data Problems in Machine Learning},
+  school = {Dept. of Computer Science, University of Toronto},
+  year   = {2008},
+}
+
+@inproceedings{odonnellservedio08,
+  author    = {{O'Donnell}, R. and Servedio, R.},
+  title     = {The {Chow} parameters problem},
+  booktitle = {Proceedings of the Fortieth Annual Symposium on Theory of Computing (STOC)},
+  pages     = {517--526},
+  year      = {2008},
+}
+
+@article{bendaviddichterman98,
+  author  = {{Ben-David}, S. and Dichterman, E.},
+  title   = {Learning with restricted focus of attention},
+  journal = {Journal of Computer and System Sciences},
+  volume  = {56},
+  number  = {3},
+  pages   = {277--298},
+  year    = {1998},
+}
+
+% Corporate author: the extra braces make BibTeX treat the name as a single
+% surname. institution is required for techreport entries.
+% NOTE(review): institution is assumed to be the authoring body -- confirm.
+@techreport{cma07,
+  author      = {{Canadian Medical Association}},
+  title       = {Information technology and health care in {Canada}: 2007 status report},
+  institution = {Canadian Medical Association},
+  year        = {2007},
+}
+
+@article{hanetal05,
+  author  = {Y. Han and J. Carcillo and S. Venkataraman and R. Clark and R. Watson and T. Nguyen and H. Bayir and R. Orr},
+  title   = {Unexpected increased mortality after implementation of a commercially sold computerized physician order entry system},
+  journal = {Pediatrics},
+  volume  = {116},
+  number  = {6},
+  pages   = {1506--1512},
+  year    = {2005},
+}
+
+% Full-detail variant; an abbreviated variant is keyed with the -small suffix.
+@inproceedings{conf/uai/McCallum03,
+  author    = {Andrew McCallum},
+  title     = {Efficiently Inducing Features of Conditional Random Fields},
+  booktitle = UAI03,
+  editor    = {Christopher Meek and Uffe Kj{\ae}rulff},
+  publisher = {Morgan Kaufmann},
+  pages     = {403--410},
+  year      = {2003},
+  date      = {August 7-10},
+  location  = {Acapulco, Mexico},
+  ISBN      = {0-127-05664-5},
+}
+
+
+% Abbreviated variant of conf/uai/McCallum03.
+@inproceedings{conf/uai/McCallum03-small,
+  author    = {A. McCallum},
+  title     = {Efficiently Inducing Features of Conditional Random Fields},
+  booktitle = {UAI},
+  year      = {2003},
+}
+
+
+% Full-detail variant; venue fields come from the ICML08* string macros.
+@inproceedings{conf/icml/RanzatoS08,
+  author    = {Marc'Aurelio Ranzato and Martin Szummer},
+  title     = {Semi-supervised learning of compact document representations with deep networks},
+  booktitle = ICML08,
+  editor    = ICML08ed,
+  publisher = ICML08publ,
+  series    = {ACM International Conference Proceeding Series},
+  volume    = {307},
+  pages     = {792--799},
+  year      = {2008},
+  date      = {June 5-9, 2008},
+  location  = {Helsinki, Finland},
+  ISBN      = {978-1-60558-205-4},
+  URL       = {http://doi.acm.org/10.1145/1390156.1390256},
+}
+
+% Abbreviated variant of conf/icml/RanzatoS08.
+@inproceedings{conf/icml/RanzatoS08-small,
+  author    = {M. Ranzato and M. Szummer},
+  title     = {Semi-supervised learning of compact document representations with deep networks},
+  booktitle = {ICML},
+  year      = {2008},
+}
+
+% Thesis entries take school, not institution.
+% NOTE(review): the original export listed "Prof Murat Kunt" as a co-author;
+% presumably the thesis advisor, so the name is kept in the non-standard
+% advisor field (ignored by standard styles) -- confirm.
+@phdthesis{Cosatto02sample-basedtalking-head,
+  author  = {Eric Cosatto},
+  title   = {Sample-Based Talking-Head Synthesis},
+  school  = {Signal Processing Lab, Swiss Federal Institute of Technology},
+  year    = {2002},
+  advisor = {Murat Kunt},
+}
+
+% Venue fields come from the NIPS21* string macros; publisher is required for
+% incollection and mirrors the NairV2009 entry for the same volume.
+@incollection{SutskeverHintonTaylor2009,
+  author    = {Ilya Sutskever and Geoffrey E Hinton and Graham Taylor},
+  title     = {The Recurrent Temporal Restricted {Boltzmann} Machine},
+  booktitle = NIPS21,
+  editor    = NIPS21ed,
+  publisher = NIPS21publ,
+  pages     = {1601--1608},
+  year      = {2009},
+}
+
+@techreport{Bergstra+2009-small,
+  author      = {J. Bergstra and G. Desjardins and P. Lamblin and Y. Bengio},
+  title       = {Quadratic Polynomials Learn Better Image Features},
+  institution = {DIRO, Universit\'e de Montr\'eal},
+  number      = {1337},
+  year        = {2009},
+}
+
+% The surname is braced as {LeCun}, matching the other LeCun entries in this file.
+@inproceedings{Haffner+al-1998,
+  author    = {Haffner, P. and Bottou, L. and Howard, P. G. and Simard, P. and Bengio, Y. and {LeCun}, Y.},
+  title     = {Browsing through High Quality Document Images with {DjVu}},
+  booktitle = {Proceedings of the Advances in Digital Libraries Conference (ADL'98)},
+  publisher = {IEEE Computer Society},
+  address   = {Washington, DC, USA},
+  pages     = {309},
+  year      = {1998},
+  isbn      = {0-8186-8464-X},
+}
+
+@InProceedings{Bottou+Howard+Bengio-1998,
+  author    = "Bottou, L. and Howard, P. G. and Bengio, Y.",
+  title     = "The {Z}-Coder Adaptive Binary Coder",
+  booktitle = "Proceedings of the Conference on Data Compression (DCC'98)",
+  publisher = "IEEE Computer Society",
+  address   = "Washington, DC, USA",
+  pages     = "13",
+  year      = "1998",
+}
+
+@inproceedings{Pigeon+Bengio-1998,
+  author    = {Steven Pigeon and Yoshua Bengio},
+  title     = {A Memory-Efficient Adaptive {Huffman} Coding Algorithm for Very Large Sets of Symbols},
+  booktitle = {Proceedings of the Conference on Data Compression (DCC'98)},
+  pages     = {568},
+  year      = {1998},
+  ee        = {http://dlib.computer.org/conferen/dcc/8406/pdf/84060568.pdf},
+  bibsource = {DBLP, http://dblp.uni-trier.de},
+}
+
+% month uses the standard three-letter macro so the style can format it.
+@inproceedings{LeCun+Bottou+Bengio-1997,
+  author    = {Yann LeCun and Bottou, L. and Bengio, Y.},
+  title     = {Reading checks with multilayer graph transformer networks},
+  booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP'97)},
+  volume    = {1},
+  pages     = {151--154},
+  month     = apr,
+  year      = {1997},
+  doi       = {10.1109/ICASSP.1997.599580},
+  keywords  = {backpropagation, banking, cheque processing, document image processing, image segmentation, optical character recognition, business checks, business cheques, check reading system, cheque reading system, convolutional neural network character recognizers, gradient-based learning algorithms, graph-based stochastic models, machine learning paradigm, multilayer graph transformer networks, personal checks, personal cheques},
+}
+
+% booktitle carries no leading "In": bibliography styles add that word themselves.
+@inproceedings{Rahim97discriminativefeature,
+  author    = {Mazin Rahim and Yoshua Bengio and Yann {LeCun}},
+  title     = {Discriminative Feature And Model Design For Automatic Speech Recognition},
+  booktitle = {Proc. of Eurospeech},
+  pages     = {75--78},
+  year      = {1997},
+}
+
+% The conference was NNCM-96; the year field is 1997 as in the original entry.
+@inproceedings{Bengio-nncm-1996,
+  author    = {Yoshua Bengio},
+  title     = {Training A Neural Network with a Financial Criterion Rather than a Prediction Criterion},
+  booktitle = {Proceedings of the Fourth International Conference on Neural Networks in the Capital Markets (NNCM-96)},
+  editor    = {A. S. Weigend and Y. S. Abu-Mostafa and A.-P. N. Refenes},
+  publisher = {World Scientific},
+  pages     = {433--443},
+  year      = {1997},
+}
+
+% The volume number sits in its own field rather than inside pages; month uses
+% the standard macro.
+@inproceedings{Bengio+Bengio+Cloutier-1994,
+  author    = {Bengio, S. and Bengio, Y. and Cloutier, J.},
+  title     = {Use of genetic programming for the search of a new learning rule for neural networks},
+  booktitle = {Proceedings of the First IEEE Conference on Evolutionary Computation},
+  volume    = {1},
+  pages     = {324--327},
+  month     = jun,
+  year      = {1994},
+  doi       = {10.1109/ICEC.1994.349932},
+  keywords  = {backpropagation, genetic algorithms, learning (artificial intelligence), neural nets, optimisation, search problems, backpropagation algorithm, classification tasks, genetic algorithms, genetic programming, gradient descent, learning rule, neural networks, optimization, parametric function, rule parameters, search, simulated annealing, standard optimization methods},
+}
+
+% doi holds the bare identifier; styles prepend the resolver URL themselves.
+@article{Chakraborty+al-2002,
+  author    = {Chakraborty, Basabi and Chakraborty, Goutam},
+  title     = {A new feature extraction technique for on-line recognition of handwritten alphanumeric characters},
+  journal   = {Inf. Sci. Appl.},
+  volume    = {148},
+  number    = {1-4},
+  pages     = {55--70},
+  year      = {2002},
+  publisher = {Elsevier Science Inc.},
+  address   = {New York, NY, USA},
+  issn      = {0020-0255},
+  doi       = {10.1016/S0020-0255(02)00276-1},
+}
+
+
+@inproceedings{LeCun+al-1993,
+  author    = {{LeCun}, Y. and Bengio, Y. and Henderson, D. and Weisbuch, A.},
+  title     = {On-Line handwriting recognition with neural networks: spatial representation versus temporal representation},
+  booktitle = {Proceedings of the International Conference on Handwriting and Drawing},
+  year      = {1993},
+  location  = {Ecole Nationale Superieure des Telecommunications},
+}
+
+% booktitle carries no leading "in": bibliography styles add that word themselves.
+@inproceedings{Bengio+al-92,
+  author    = {Yoshua Bengio and Samy Bengio and Jocelyn Cloutier and Jan Gecsei},
+  title     = {On the Optimization of a Synaptic Learning Rule},
+  booktitle = {Conference on Optimality in Biological and Artificial Networks},
+  year      = {1992},
+}
+
+% Proceedings title comes from the ijcnn string macro.
+@inproceedings{Bengio+al-91,
+  author    = {Yoshua Bengio and Samy Bengio and Jocelyn Cloutier and Jan Gecsei},
+  title     = {Learning a Synaptic Learning Rule},
+  booktitle = ijcnn,
+  pages     = {II-A969},
+  year      = {1991},
+  location  = {Seattle, WA},
+}
+
+@inproceedings{Bengio91acomparative,
+  author    = {Yoshua Bengio and Renato De Mori and Giovanni Flammia and Ralf Kompe},
+  title     = {A Comparative Study On Hybrid Acoustic Phonetic Decoders Based On Artificial Neural Networks},
+  booktitle = {Proceedings of EuroSpeech},
+  year      = {1991},
+  location  = {Genova, Italy},
+}
+
+% Chapter in an edited book, hence incollection. The field name is editor
+% (singular); "editors" is unknown to BibTeX and would be silently ignored.
+@incollection{lecun-01a,
+  author    = {{LeCun}, Y. and Bottou, L. and Bengio, Y. and Haffner, P.},
+  title     = {Gradient-Based Learning Applied to Document Recognition},
+  booktitle = {Intelligent Signal Processing},
+  editor    = {Haykin, S. and Kosko, B.},
+  publisher = {IEEE Press},
+  pages     = {306--351},
+  year      = {2001},
+  note      = {chap. 9},
+  original  = {orig/lecun-01a.ps.gz},
+}
+
+% Imported from CiteULike. The original export had a spurious author
+% ("Informatik, Fakultat F.", an affiliation fragment) and HTML quote entities
+% in the abstract. The abstract is truncated in the source export.
+@incollection{Hochreiter+al-2000,
+  author    = {Hochreiter, Sepp and Bengio, Yoshua and Frasconi, Paolo and Schmidhuber, J{\"u}rgen},
+  title     = {Gradient Flow in Recurrent Nets: the Difficulty of Learning Long-Term Dependencies},
+  booktitle = {Field Guide to Dynamical Recurrent Networks},
+  editor    = {J. Kolen and S. Kremer},
+  publisher = {IEEE Press},
+  year      = {2000},
+  url       = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.24.7321},
+  keywords  = {gradient-descent, long-term-dependencies, rnn},
+  abstract  = {Introduction Recurrent networks (crossreference Chapter 12) can, in principle, use their feedback connections to store representations of recent input events in the form of activations. The most widely used algorithms for learning what to put in short-term memory, however, take too much time to be feasible or do not work well at all, especially when minimal time lags between inputs and corresponding teacher signals are long. Although theoretically fascinating, they do not provide clear practical advantages over, say, backprop in feedforward networks with limited time windows (see crossreference Chapters 11 and 12). With conventional ``algorithms based on the computation of the complete gradient'', such as ``Back-Propagation Through Time'' (BPTT, e.g., [22, 27, 26]) or ``Real-Time Recurrent Learning'' (RTRL, e.g., [21]) error signals ``flowing backwards in time'' tend to either (1) blow up or (2) vanish: the temporal evolution of the backpropagated error ex},
+  citeulike-article-id = {4450697},
+  citeulike-linkout-0 = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.24.7321},
+  posted-at = {2009-05-02 00:58:01},
+  priority  = {2},
+}
+
+% Accents are written as LaTeX escapes, as in the other entries of this file.
+@inproceedings{Lecun99objectrecognition,
+  author    = {Yann {LeCun} and Patrick Haffner and L{\'e}on Bottou and Yoshua Bengio},
+  title     = {Object Recognition with Gradient-Based Learning},
+  booktitle = {Shape, Contour and Grouping in Computer Vision},
+  publisher = {Springer},
+  pages     = {319--345},
+  year      = {1999},
+}
+
+
+% non-refereed conference presentations
+% Catch-all placeholder entry; author and year are deliberately non-specific.
+@misc{snowbird_learn_conf,
+  author       = {many authors},
+  title        = {Snowbirds papers},
+  howpublished = {Learning Conference, Snowbird},
+  location     = {Utah},
+  year         = {many},
+}
+
+% NOTE(review): the key says 2001 but the year field says 2002 -- confirm which is right.
+@misc{Collobert+Bengio-2001,
+  author       = {Collobert, R. and Bengio, Y.},
+  title        = {Magic Mix},
+  howpublished = {Learning Conference, Snowbird},
+  location     = {Utah},
+  year         = {2002},
+}
+
+% Accents are written as LaTeX escapes, as in the other entries of this file.
+@misc{Bengio+al-2001,
+  author       = {Yoshua Bengio and Pascal Vincent and Florence d'Alch{\'e}-Buc},
+  title        = {Learning a Distributed Representation for Statistical Language Modeling and Information Retrieval},
+  howpublished = {Learning Conference, Snowbird},
+  location     = {Utah},
+  year         = {2001},
+}
+
+@misc{Bengio+Nadeau-2000,
+  author       = {Yoshua Bengio and C. Nadeau},
+  title        = {About Realistic Comparisons Between Learning Algorithms},
+  howpublished = {Learning Conference, Snowbird},
+  location     = {Utah},
+  year         = {2000},
+}
+@misc{Bengio-1999,
+  author       = {Yoshua Bengio},
+  title        = {Learning from Structured High-Dimensional Data},
+  howpublished = {Meeting of the Mathematical Society of Canada},
+  location     = {Montreal, Canada},
+  year         = {1999},
+}
+
+@misc{Bengio+al-1999,
+  author       = {Yoshua Bengio and S. Latendresse and Charles Dugas},
+  title        = {Gradient-Based Learning of Hyper-Parameters},
+  howpublished = {Learning Conference, Snowbird},
+  location     = {Utah},
+  year         = {1999},
+}
+
+@misc{Bengio+al-1999b,
+  author       = {Yoshua Bengio and J-J. Brault and F. Major and R. Neal and S. Pigeon},
+  title        = {Learning Algorithms for Sorting Compounds from Titration Curves},
+  howpublished = {Symposium on New Perspectives for Computer-Aided Drug Design},
+  location     = {Montreal, Canada},
+  year         = {1999},
+}
+
+@misc{Bengio+al-1998,
+  author       = {Yoshua Bengio and S. Latendresse and Charles Dugas},
+  title        = {Stochastic learning of strategic equilibria for auctions},
+  howpublished = {Machines That Learn Conference, Snowbird},
+  location     = {Utah},
+  year         = {1998},
+}
+
+@misc{Bengio+al-1997,
+  author       = {Bengio, Y. and Bengio, S. and Singer, Y. and Isabelle, J-F.},
+  title        = {On the Clusterization of Probabilistic Transducers},
+  howpublished = {1997 Neural Networks for Computing Conference, Snowbird},
+  location     = {Utah},
+  year         = {1997},
+}
+
+% year follows the key and the conference named in howpublished (1995).
+% NOTE(review): the author list is identical to that of Bengio+al-1997 and may
+% be a copy-paste leftover -- confirm.
+@misc{Bengio-1995,
+  author       = {Bengio, Y. and Bengio, S. and Singer, Y. and Isabelle, J-F.},
+  title        = {Fast High Capacity Classifiers},
+  howpublished = {1995 Neural Networks for Computing Conference, Snowbird},
+  location     = {Utah},
+  year         = {1995},
+}
+
+% Accents are written as LaTeX escapes, as in the other entries of this file.
+@misc{Bengio+Frasconi-1994,
+  author       = {Bengio, Y. and Frasconi, P.},
+  title        = {R{\'e}seaux de neurones Markoviens pour l'inf{\'e}rence grammaticale},
+  howpublished = {1994 ACFAS Conference, neural networks colloquium},
+  location     = {Montr{\'e}al, Qu{\'e}bec},
+  year         = {1994},
+}
+
+% Accents are written as LaTeX escapes, as in the other entries of this file.
+@misc{Bengio+LeCun-1994,
+  author       = {Bengio, Y. and {LeCun}, Y.},
+  title        = {Reconnaissance de mots manuscrits avec r{\'e}seaux de neurones et mod{\`e}les de {Markov}},
+  howpublished = {1994 ACFAS Conference, neural networks colloquium},
+  location     = {Montr{\'e}al, Qu{\'e}bec},
+  year         = {1994},
+}
+
+% Accents are written as LaTeX escapes, as in the other entries of this file.
+@misc{Bengio+al-1994,
+  author       = {Bengio, S. and Bengio, Y. and Cloutier, J. and Gecsei, J.},
+  title        = {Optimisation d'une r{\`e}gle d'apprentissage pour r{\'e}seaux de neurones artificiels},
+  howpublished = {1994 ACFAS Conference, neural networks colloquium},
+  location     = {Montr{\'e}al, Qu{\'e}bec},
+  year         = {1994},
+}
+
+% "Last, First" form needs the comma; without it BibTeX parses "Frasconi P."
+% as first name Frasconi, surname P.
+@misc{Bengio+Frasconi-1994b,
+  author       = {Bengio, Y. and Frasconi, P.},
+  title        = {An {EM} Algorithm for Target Propagation},
+  howpublished = {1994 Neural Networks for Computing Conference, Snowbird},
+  location     = {Utah},
+  year         = {1994},
+}
+
+% "Last, First" form needs the comma; without it BibTeX parses "Frasconi P."
+% as first name Frasconi, surname P.
+% NOTE(review): howpublished originally said "1994 ..." while key and year say
+% 1993; aligned to 1993 -- confirm.
+@misc{Bengio+al-1993,
+  author       = {Bengio, Y. and Simard, P. and Frasconi, P.},
+  title        = {The Problem of Learning Long-Term Dependencies in Recurrent Networks},
+  howpublished = {1993 Neural Networks for Computing Conference, Snowbird},
+  location     = {Utah},
+  year         = {1993},
+}
+@misc{Bengio-1992,
+  author       = {Bengio, Y.},
+  title        = {Representations Based on Articulatory Dynamics for Speech Recognition},
+  howpublished = {1992 Neural Networks for Computing Conference, Snowbird},
+  location     = {Utah},
+  year         = {1992},
+}
+
+@misc{Bengio+al-1991,
+  author       = {Bengio, Y. and Bengio, S. and Cloutier, J.},
+  title        = {Learning a Synaptic Learning Rule},
+  howpublished = {1991 Neural Networks for Computing Conference, Snowbird},
+  location     = {Utah},
+  year         = {1991},
+}
+
+% year follows the key and the conference named in howpublished (1990).
+@misc{Bengio+DeMori-1990,
+  author       = {Bengio, Y. and De Mori, R.},
+  title        = {Recurrent networks with Radial Basis Functions for speech recognition},
+  howpublished = {1990 Neural Networks for Computing Conference, Snowbird},
+  location     = {Utah},
+  year         = {1990},
+}
+
+
+%% technical reports
+% CIRANO scientific-series report: series label in type, report id in number.
+@techreport{Bardou+Bengio-TR2002,
+  author      = {O. Bardou and Yoshua Bengio},
+  title       = {R{\'e}gularisation du prix des options : Stacking},
+  type        = {Cahier Scientifique},
+  number      = {2002s-44},
+  institution = {{CIRANO}},
+  year        = {2002},
+}
+
+% CIRANO scientific-series report: series label in type, report id in number.
+% NOTE(review): the original author field duplicated the previous entry
+% ("O. Bardou and Yoshua Bengio"), contradicting the key; set to Dugas and
+% Bengio -- confirm against the CIRANO listing for 2002s-45.
+@techreport{Dugas+Bengio-TR2002,
+  author      = {C. Dugas and Yoshua Bengio},
+  title       = {{\'E}tude du biais dans le prix des options},
+  type        = {Cahier Scientifique},
+  number      = {2002s-45},
+  institution = {{CIRANO}},
+  year        = {2002},
+}
+
+% CIRANO scientific-series report: series label in type, report id in number.
+@techreport{Dugas+al-TR2002,
+  author      = {C. Dugas and Y. Bengio and F. B{\'e}lisle and C. Nadeau and R. Garcia},
+  title       = {Incorporating Second-Order Functional Knowledge for Better Option Pricing},
+  type        = {Cahier Scientifique},
+  number      = {2002s-46},
+  institution = {{CIRANO}},
+  year        = {2002},
+}
+
+% CIRANO scientific-series report: series label in type, report id in number.
+@techreport{Bengio+al-TR2002,
+  author      = {Y. Bengio and V.-P. Lauzon and R. Ducharme},
+  title       = {Experiments on the Application of {IOHMMs} to Model Financial Returns Series},
+  type        = {Cahier Scientifique},
+  number      = {2002s-47},
+  institution = {{CIRANO}},
+  year        = {2002},
+}
+
+% CIRANO scientific-series report: series label in type, report id in number.
+@techreport{Bengio+al-TR2002b,
+  author      = {Y. Bengio and R. Ducharme and O. Bardou and N. Chapados},
+  title       = {Valorisation d'options par optimisation du {Sharpe} Ratio},
+  type        = {Cahier Scientifique},
+  number      = {2002s-48},
+  institution = {{CIRANO}},
+  year        = {2002},
+}
+
+% CIRANO scientific-series report: series label in type, report id in number.
+@techreport{Chapados+Bengio-TR2002,
+  author      = {N. Chapados and Y. Bengio},
+  title       = {Cost Functions and Model Combination for {VaR}-based Asset Allocation using Neural Networks},
+  type        = {Cahier Scientifique},
+  number      = {2002s-49},
+  institution = {{CIRANO}},
+  year        = {2002},
+}
+
+% CIRANO scientific-series report: series label in type, report id in number.
+@techreport{Bengio+Dugas-TR2002,
+  author      = {Y. Bengio and C. Dugas},
+  title       = {Forecasting Non-Stationary Volatility with Hyper-Parameters},
+  type        = {Cahier Scientifique},
+  number      = {2002s-50},
+  institution = {{CIRANO}},
+  year        = {2002},
+}
+
+% CIRANO scientific-series report: series label in type, report id in number.
+@techreport{Gingras+al-TR2002,
+  author      = {F. Gingras and Y. Bengio and C. Nadeau},
+  title       = {On Out-of-Sample Statistics for Time-Series},
+  type        = {Cahier Scientifique},
+  number      = {2002s-51},
+  institution = {{CIRANO}},
+  year        = {2002},
+}
+
+% CIRANO scientific-series report: series label in type, report id in number.
+@techreport{Chapados+Bengio-TR2002b,
+  author      = {N. Chapados and Y. Bengio},
+  title       = {Input Decay : Simple and Effective Soft Variable Selection},
+  type        = {Cahier Scientifique},
+  number      = {2002s-52},
+  institution = {{CIRANO}},
+  year        = {2002},
+}
+
+% CIRANO scientific-series report: series label in type, report id in number.
+@techreport{Ghosn+Bengio-TR2002,
+  author      = {J. Ghosn and Y. Bengio},
+  title       = {Multi-Task Learning For Option Pricing},
+  type        = {Cahier Scientifique},
+  number      = {2002s-53},
+  institution = {{CIRANO}},
+  year        = {2002},
+}
+
+% NOTE(review): the original author field ("J. Ghosn and Y. Bengio") was a
+% copy of the preceding entry and contradicted the key; IDIAP-RR-01-12 is by
+% Collobert, S. Bengio and Y. Bengio -- confirm.
+@techreport{Collobert+al-TR2001,
+  author      = {Collobert, R. and Bengio, S. and Bengio, Y.},
+  title       = {A Parallel Mixture of {SVM}s for Very Large Scale Problems},
+  institution = {IDIAP},
+  number      = {IDIAP-RR-01-12},
+  year        = {2001},
+  location    = {Switzerland},
+}
+
+% institution comes from the DIRO string macro.
+% NOTE(review): location originally read "Switzerland", apparently copied from
+% the preceding IDIAP entry; set to Montreal on the assumption that the DIRO
+% macro expands to the Universite de Montreal department -- confirm.
+@techreport{Vincent+Bengio-TR2001,
+  author      = {Vincent, P. and Bengio, Y.},
+  title       = {K-Local Hyperplane and Convex Distance Nearest Neighbor Algorithms},
+  institution = DIRO,
+  number      = {1197},
+  year        = {2001},
+  location    = {Montr{\'e}al, Canada},
+}
+
+% institution comes from the DIRO string macro.
+@techreport{Chapados+al-TR2001,
+  author      = {Chapados, N. and Bengio, Y. and Vincent, P. and Ghosn, J. and Dugas, C. and Takeuchi, I. and Meng, L.},
+  title       = {Estimating Car Insurance Premia : a Case Study in High-Dimensional Data Inference},
+  institution = DIRO,
+  number      = {1199},
+  year        = {2001},
+}
+
+% institution comes from the DIRO string macro.
+% NOTE(review): the original author field was a verbatim copy of the preceding
+% seven-author entry and contradicted the key; set to Bengio and Chapados --
+% confirm against the DIRO report 1200 record.
+@techreport{Bengio+Chapados-TR2001,
+  author      = {Bengio, Y. and Chapados, N.},
+  title       = {Extending Metric-Based Model Selection and Regularization in the Absence of Unlabeled Data},
+  institution = DIRO,
+  number      = {1200},
+  year        = {2001},
+}
+
+% CIRANO scientific-series report: series label in type, report id in number.
+% year follows the key and the 99s- report prefix (the original said 2002).
+@techreport{Nadeau+Bengio-TR1999,
+  author      = {Nadeau, C. and Bengio, Y.},
+  title       = {Inference and the Generalization Error},
+  type        = {Cahier Scientifique},
+  number      = {99s-25},
+  institution = {{CIRANO}},
+  year        = {1999},
+}
+
+% Accents are written as LaTeX escapes, as in the other entries of this file.
+@techreport{Gingras+al-TR1999,
+  author      = {Gingras, F. and Bengio, Y. and Nadeau, C.},
+  title       = {On Out-of-Sample Statistics for Financial Time-Series},
+  institution = {Centre de Recherches Math{\'e}matiques, Universit{\'e} de Montr{\'e}al},
+  number      = {2585},
+  year        = {1999},
+}
+
+% CIRANO scientific-series report: series label in type, report id in number.
+@techreport{Bengio-1998-TR,
+  author      = {Bengio, Y.},
+  title       = {Using a financial training criterion rather than a prediction criterion},
+  type        = {Cahier Scientifique},
+  number      = {98s-21},
+  institution = {{CIRANO}},
+  year        = {1998},
+}
+
+@techreport{Bengio+DeMori-1990-TR,
+  author      = {Bengio, Y. and De Mori, R.},
+  title       = {Some connectionist models and their application to speech recognition},
+  institution = {School of Computer Science, McGill University},
+  number      = {TR-SOCS-90-12},
+  year        = {1990},
+}
+
+@Article{becker+hinton:1993,
+  author  = "Becker, S. and Hinton, G. E.",
+  title   = "Learning Mixture Models of Spatial Coherence",
+  journal = "Neural Computation",
+  volume  = "5",
+  pages   = "267--277",
+  year    = "1993",
+}
+% month uses the standard three-letter macro so the style can format it.
+@article{berkes:2005,
+  author  = {Berkes, Pietro and Wiskott, Laurenz},
+  title   = {Slow Feature Analysis Yields a Rich Repertoire of Complex Cell Properties},
+  journal = {Journal of Vision},
+  volume  = {5},
+  number  = {6},
+  pages   = {579--602},
+  month   = jul,
+  year    = {2005},
+  ISSN    = {1534-7362},
+  URL     = {http://journalofvision.org/5/6/9/},
+  eprint  = {http://journalofvision.org/5/6/9/Berkes-2005-jov-5-6-9.pdf},
+}
+% No trailing period in title: the style adds its own punctuation.
+@inproceedings{hurri+hyvarinen:2003,
+  author    = {Hurri, J. and Hyv{\"a}rinen, A.},
+  title     = {Temporal Coherence, Natural Image Sequences, and the Visual Cortex},
+  booktitle = {Advances in Neural Information Processing Systems 15 ({NIPS*02})},
+  pages     = {141--148},
+  year      = {2003},
+}
+@article{wiskott:2002,
+  author  = {Laurenz Wiskott and Terrence Sejnowski},
+  title   = {Slow Feature Analysis: Unsupervised Learning of Invariances},
+  journal = {Neural Computation},
+  volume  = {14},
+  number  = {4},
+  pages   = {715--770},
+  year    = {2002},
+  url     = {http://itb.biologie.hu-berlin.de/~wiskott/Publications/WisSej2002-LearningInvariances-NC.ps.gz},
+}
+
+@article{KouhPoggio2008,
+  author  = {Minjoon M. Kouh and Tomaso T. Poggio},
+  title   = {A Canonical Neural Circuit for Cortical Nonlinear Operations},
+  journal = {Neural Computation},
+  volume  = {20},
+  number  = {6},
+  pages   = {1427--1451},
+  year    = {2008},
+}
+@article{NykampRingach2002,
+  author  = {D. Q. Nykamp and D. L. Ringach},
+  title   = {Full Identification of a Linear-Nonlinear System via Cross-Correlation Analysis},
+  journal = {Journal of Vision},
+  volume  = {2},
+  pages   = {1--11},
+  year    = {2002},
+}
+@InCollection{cadieu+olshausen:2009,
+  author    = "Charles Cadieu and Bruno Olshausen",
+  title     = "Learning Transformational Invariants from Natural Movies",
+  booktitle = "Advances in Neural Information Processing Systems 21",
+  editor    = "D. Koller and D. Schuurmans and Y. Bengio and L. Bottou",
+  publisher = "MIT Press",
+  pages     = "209--216",
+  year      = "2009",
+}
+@Book{DayanAbbott2001,
+  author    = "Peter Dayan and L. F. Abbott",
+  title     = "Theoretical Neuroscience",
+  publisher = "The {MIT} Press",
+  year      = "2001",
+}
+
+@InProceedings{Chechik-MIR2008,
+  author    = "G. Chechik and E. Ie and M. Rehn and S. Bengio and D. Lyon",
+  title     = "Large-scale content-based audio retrieval from text queries",
+  booktitle = "ACM International Conference on Multimedia Information Retrieval (MIR'08)",
+  year      = "2008",
+}
+
+@inproceedings{Bai-ECIR2009,
+  author    = {B. Bai and J. Weston and R. Collobert and D. Grangier},
+  title     = {Supervised Semantic Indexing},
+  booktitle = {European Conference on Information Retrieval (ECIR'09)},
+  year      = {2009},
+}
+
+@Article{Attwell+Laughlin-2001,
+  author  = "David Attwell and Simon B. Laughlin",
+  title   = "An energy budget for signaling in the grey matter of the brain",
+  journal = "Journal of Cerebral Blood Flow And Metabolism",
+  volume  = "21",
+  pages   = "1133--1145",
+  year    = "2001",
+}
+
+% month is built from the standard mar macro; the day of month is appended by
+% string concatenation so it still prints.
+@article{Lennie-2003,
+  author  = {Peter Lennie},
+  title   = {The cost of cortical computation},
+  journal = {Current Biology},
+  volume  = {13},
+  number  = {6},
+  pages   = {493--497},
+  month   = mar # {~18},
+  year    = {2003},
+}
+
+% Venue, editor and publisher come from the ICML05* string macros.
+@InProceedings{LowdD2005,
+  author    = "Lowd, Daniel and Domingos, Pedro",
+  title     = "Naive Bayes models for probability estimation",
+  booktitle = ICML05,
+  editor    = ICML05ed,
+  publisher = ICML05publ,
+  address   = "New York, NY, USA",
+  pages     = "529--536",
+  year      = "2005",
+  location  = "Bonn, Germany",
+}
+
+% Venue, editor and publisher come from the NIPS21* string macros.
+@InCollection{NairV2009,
+  author    = "Vinod Nair and Geoffrey E Hinton",
+  title     = "Implicit Mixtures of Restricted Boltzmann Machines",
+  booktitle = NIPS21,
+  editor    = NIPS21ed,
+  publisher = NIPS21publ,
+  pages     = "1145--1152",
+  year      = "2009",
+}
+
+@incollection{Goodfellow2009,
+ title = {Measuring Invariances in Deep Networks},
+ author = {Ian Goodfellow and Quoc Le and Andrew Saxe and Andrew Ng},
+ booktitle = NIPS22,
+ editor = NIPS22ed,
+ pages = {646--654},
+ year = {2009}
+}
+
+@incollection{Xiao2009,
+ title = {Dual Averaging Method for Regularized Stochastic Learning and Online Optimization},
+ author = {Lin Xiao},
+ booktitle = {Advances in Neural Information Processing Systems 22},
+ editor = {Y. Bengio and D. Schuurmans and J. Lafferty and C. K. I. Williams and A. Culotta},
+ pages = {2116--2124},
+ year = {2009}
+}
+
+@incollection{Kwok2009,
+ title = {Accelerated Gradient Methods for Stochastic Optimization and Online Learning},
+ author = {Chonghai Hu and James Kwok and Weike Pan},
+ booktitle = {Advances in Neural Information Processing Systems 22},
+ editor = {Y. Bengio and D. Schuurmans and J. Lafferty and C. K. I. Williams and A. Culotta},
+ pages = {781--789},
+ year = {2009}
+}
+
+@article{Nesterov83,
+ author = {Yu. Nesterov},
+ title = {A method for unconstrained convex minimization problem with the rate of convergence {$O(1/k^2)$}},
+ journal = {Doklady AN SSSR (translated as Soviet Math. Dokl.)},
+ volume = 269,
+ pages = {543--547},
+ year = 1983,
+}
+
+@incollection{Bai2009,
+ title = {Polynomial Semantic Indexing},
+ author = {Bing Bai and Jason Weston and David Grangier and Ronan Collobert and Kunihiko Sadamasa and Yanjun Qi and Corinna Cortes and Mehryar Mohri},
+ booktitle = {Advances in Neural Information Processing Systems 22},
+ editor = {Y. Bengio and D. Schuurmans and J. Lafferty and C.K.I. Williams and A. Culotta},
+ pages = {64--72},
+ year = {2009}
+}
+
+@incollection{Chechik2009,
+ title = {An Online Algorithm for Large Scale Image Similarity Learning},
+ author = {Gal Chechik and Uri Shalit and Varun Sharma and Samy Bengio},
+ booktitle = {Advances in Neural Information Processing Systems 22},
+ editor = {Y. Bengio and D. Schuurmans and J. Lafferty and C. K. I. Williams and A. Culotta},
+ pages = {306--314},
+ year = {2009}
+}
+
+@incollection{Klampfl+Maass-2009,
+ title = {Replacing supervised classification learning by Slow Feature Analysis in spiking neural networks},
+ author = {Stefan Klampfl and Wolfgang Maass},
+ booktitle = NIPS22,
+ editor = NIPS22ed,
+ pages = {988--996},
+ year = {2009}
+}
+
+
+
+@Article{GrandvaletCanuBoucheron97,
+  author =       "Yves Grandvalet and St{\'e}phane Canu and St{\'e}phane Boucheron",
+  title =        "Noise Injection: Theoretical Prospects",
+  journal =      "Neural Computation",
+  volume =       "9",
+  number =       "5",
+  pages =        "1093--1108",
+  year =         "1997",
+}
+
+@Article{SietsmaDow91,
+  author =       "J. Sietsma and R. Dow",
+  title =        "Creating artificial neural networks that generalize",
+  journal =      "Neural Networks",
+  volume =       "4",
+  number =       "1",
+  pages =        "67--79",
+  year =         "1991",
+}
+
+@Article{HolmstromKoistinen92,
+  author =       "Lasse Holmstr{\"o}m and Petri Koistinen",
+  title =        "Using additive noise in back-propagation training",
+  journal =      "{IEEE} Transactions on Neural Networks",
+  volume =       "3",
+  number =       "1",
+  pages =        "24--38",
+  year =         "1992",
+}
+
+@inproceedings{Baird90,
+    author = "H. Baird",
+    title = {Document image defect models},
+    year = 1990,
+    booktitle = "IAPR Workshop on Syntactic and Structural Pattern Recognition",
+    pages = "38--46",
+    address = "Murray Hill, NJ."
+}
+
+@TechReport{Poggio+Vetter92,
+  author =       "T. Poggio and T. Vetter",
+  title =        "Recognition and structure from one 2D model view: Observations on prototypes, object classes and symmetries",
+  number =       "A.I. Memo No. 1347",
+  institution =  "Artificial Intelligence Laboratory, Massachusetts Institute of Technology",
+  year =         "1992",
+}
+
+@INPROCEEDINGS{Scholkopf96invariances,
+    author = {Bernhard Sch{\"o}lkopf and Chris Burges and Vladimir Vapnik},
+    title = {Incorporating Invariances in Support Vector Learning Machines},
+    booktitle = {Lecture Notes in Computer Science (Vol 1112), Artificial Neural Networks ICANN'96},
+    year = {1996},
+    editor = {C. von der Malsburg and W. von Seelen and J. C. Vorbr{\"u}ggen and B. Sendhoff},
+    pages = {47--52},
+    publisher = {Springer}
+}
+
+@inproceedings{Cho+Saul09,
+ title = {Kernel Methods for Deep Learning},
+ author = {Youngmin Cho and Lawrence Saul},
+ booktitle = NIPS22,
+ editor = NIPS22ed,
+ pages = {342--350},
+ year = {2009},
+ publisher = {NIPS Foundation},
+}
+
+
+@InProceedings{Linsker89,
+  author =       "R. Linsker",
+  editor =       NIPS1ed,
+  booktitle =    NIPS1,
+  title =        "An application of the principle of maximum information 
+preservation to linear systems",
+  publisher =    NIPS1publ,
+  year =         "1989",
+}
+
+@Article{An96AddingNoise,
+  author =       "Guozhong An",
+  title =        "The effects of adding noise during backpropagation training on a generalization performance",
+  journal =      "Neural Computation",
+  volume =       "8",
+  number =       "3",
+  pages =        "643--674",
+  year =         "1996",
+}
+
+@article{DruckerLeCun92,
+	author = {Harris Drucker and Yann LeCun},
+	title = {Improving generalisation performance using double back-propagation.},
+	journal = {IEEE Transactions on Neural Networks},
+	number = {6},
+	pages = {991--997},
+	volume = {3},
+	year = {1992}
+}
+
+@Article{BellSejnowski-97,
+  author =       "A. Bell and T. J. Sejnowski",
+  title =        "The independent components of natural scenes are edge filters",
+  journal =      "Vision Research",
+  volume =       "37",
+  pages =        "3327--3338",
+  year =         "1997",
+}
+
+
+@Article{Dokur1997,
+  author =       {Z{\"u}mray Dokur and Tamer {\"O}lmez and Ertugrul Yazgan and Okan K. Ersoy},
+  title =        {Detection of {ECG} waveforms by neural networks},
+  journal =      {Medical Engineering \& Physics},
+  year =         {1997},
+  volume =    {19},
+  number =    {8},
+  pages =     {738--741},
+  month =     oct,
+}
+
+@Article{Hu1993,
+  author =       {Y. H. Hu and W. J. Tompkins and J. L. Urrusti and V. X. Afonso},
+  title =        {Applications of artificial neural networks for {ECG} signal detection and classification},
+  journal =      JEC,
+  year =         {1993},
+  volume =    {26s},
+  pages =     {66--73},
+}
+
+@Article{Unser1996,
+author = {M. Unser and A. Aldroubi},
+title = {A Review of Wavelets in Biomedical Applications},
+journal = {Proceedings of the {IEEE}},
+year = {1996},
+volume= {84},
+number= {4},
+pages = {626--638},
+month = {April},
+}
+
+@inproceedings{Povey+Woodland-2002,
+ author = {D. Povey and P. C. Woodland},
+ title = {Minimum phone error and {I}-smoothing for improved discriminative training},
+ booktitle = {Proceedings of the International Conference on Acoustics, Speech, and Signal Processing (ICASSP'2002)},
+ year = 2002,
+ publisher = {IEEE},
+ volume = 1,
+ pages = {I-105--I-108},
+ address = {Orlando, Florida, USA},
+}
+
+@incollection{Susskind2008,
+ author = {Joshua M. Susskind and Geoffrey E. Hinton and Javier R. Movellan and Adam K. Anderson},
+ title = {Generating Facial Expressions with Deep Belief Nets},
+ editor = {V. Kordic},
+ booktitle = {Affective Computing, Emotion Modelling, Synthesis and Recognition},
+ publisher = {ARS Publishers},
+ year = 2008,
+ pages = {421--440},
+}
+
+@InCollection{Li2005,
+  author =       {Peng Li and Kap Luk Chan and Sheng Fu and S. M. Krishnan},
+  title =        {An Abnormal {ECG} Beat Detection Approach for Long-Term Monitoring of Heart Patients Based on Hybrid Kernel Machine Ensemble},
+  booktitle =    {Multiple Classifier Systems},
+  pages =     {346--355},
+  publisher = {Springer},
+  year =      {2005},
+  volume =    {3541},
+  series =    {Lecture Notes in Computer Science},
+  address =   {Berlin / Heidelberg},
+}
+
+@incollection {Hughes_NIPS2003,
+  author = " Nicholas P. Hughes and  Lionel Tarassenko and  Stephen J. Roberts",
+  title = " Markov Models for Automated {ECG} Interval Analysis",
+  booktitle = NIPS16,
+  editor = NIPS16ed,
+  publisher = NIPS16publ,
+  address = NIPS16addr,
+  year = "2004",
+  keywords = "hidden Markov models, Markov models, wavelets, segmentation, probabilistic models, biomedical signal processing, time series",
+  }
+
+@inproceedings{Salem2009,
+ author = {Abdel-Badeeh M. Salem and Kenneth Revett and El-Sayed A. El-Dahshan},
+ title = {Machine Learning in Electrocardiogram Diagnosis},
+ booktitle = {Proceedings of the International Multiconference on Computer Science and Information Technology},
+ volume = 4,
+ pages = {429--433},
+ year = 2009,
+ publisher = {IEEE},
+}
+
+@book{Clifford2006,
+ author = {G.D. Clifford and F. Azuaje and P.E. McSharry}, 
+ title = {Advanced Methods and Tools for {ECG} Analysis},
+ publisher = {Artech House Publishing},
+ year = 2006,
+}
+
+@inproceedings{Lin2009,
+  author = {Lin, Jessica and Li, Yuan},
+  title = {Finding Structural Similarity in Time Series Data Using Bag-of-Patterns Representation},
+  booktitle = {SSDBM 2009: Proceedings of the 21st International Conference on Scientific and Statistical Database Management},
+  year = {2009},
+  isbn = {978-3-642-02278-4},
+  pages = {461--477},
+  location = {New Orleans, LA, USA},
+  doi = {10.1007/978-3-642-02279-1_33},
+  publisher = {Springer-Verlag},
+  address = {Berlin, Heidelberg},
+ }
+
+@article{Froese2006,
+ author = {Froese, Tom and Hadjiloucas, Sillas and Galv{\~a}o, Roberto K. H. and
+           Becerra, Victor M. and Coelho, Clarimar Jos{\'e}},
+ title = {Comparison of extrasystolic {ECG} signal classifiers using discrete wavelet transforms},
+ journal = {Pattern Recognition Letters},
+ volume = {27},
+ number = {5},
+ year = {2006},
+ issn = {0167-8655},
+ pages = {393--407},
+ doi = {10.1016/j.patrec.2005.09.002},
+ publisher = {Elsevier Science Inc.},
+ address = {New York, NY, USA},
+ }
+
+@Article{Crowe1992,
+  author =   {J. A. Crowe and N. M. Gibson and M. S. Woolfson and M. G. Somekh},
+  title =    {Wavelet transform as a potential tool for {ECG} analysis and compression},
+  journal =  {Journal of Biomedical Engineering},
+  year =     {1992},
+  volume =   {14},
+  number =   {3},
+  pages =    {268--272},
+  month =    {May},
+}
+
+@ARTICLE{Hilton1997,
+    author = {Michael Hilton},
+    title = {Wavelet and Wavelet Packet Compression of Electrocardiograms},
+    journal = IEEE_trans_biomed,
+    year = {1997},
+    volume = {44},
+    pages = {394--402}
+}
+
+@Article{Li1995,
+  author =       {C. Li and C. Zheng and C. Tai},
+  title =        {Detection of {ECG} characteristic points using wavelet transforms},
+  journal =     IEEE_trans_biomed,
+  year =        {1995},
+  volume =    {42},
+  number =    {1},
+  pages =     {21--28},
+  month =     {January},
+}
+
+@article{Polat2007,
+title = {Detection of {ECG} Arrhythmia using a differential expert system approach based on principal component analysis and least square support vector machine},
+journal = {Applied Mathematics and Computation},
+volume = {186},
+number = {1},
+pages = {898--906},
+year = {2007},
+issn = {0096-3003},
+doi = {10.1016/j.amc.2006.08.020},
+url = {http://www.sciencedirect.com/science/article/B6TY8-4KXDWBF-5/2/a9e1d7e2dfc4c88935386ea04ca9cb94},
+author = {Kemal Polat and Salih G{\"u}ne{\c{s}}},
+keywords = {ECG Arrhythmia,
+            Principal component analysis (PCA),
+            Least square support vector machine (LSSVM),
+            ROC curves},
+}
+
+@article{Song2005,
+  author =       {Mi Hye Song and Jeon Lee and Sung Pil Cho and Kyoung Joung Lee and Sun Kook Yoo},
+  title =        {Support Vector Machine Based Arrhythmia Classification  
+Using Reduced Features},
+  journal =      IJCAS,
+  year =         {2005},
+  volume =    {3},
+  number =    {4},
+  pages =     {571--579},
+  month =     {December},
+}
+
+@article{Ubeyli2009,
+ author = {Elif Derya \"{U}beyli},
+ title = {Combining recurrent neural networks with eigenvector methods for classification of {ECG} beats},
+ journal = DSP,
+ volume = {19},
+ number = {2},
+ year = {2009},
+ issn = {1051-2004},
+ pages = {320--329},
+ doi = {10.1016/j.dsp.2008.09.002},
+ publisher = {Academic Press, Inc.},
+ address = {Orlando, FL, USA},
+ }
+
+@article{Ubeyli2007,
+  author =       {Elif Derya \"{U}beyli},
+  title =        {{ECG} beats classification using multiclass support vector machines with error correcting output codes},
+  journal =      DSP,
+  year =         {2007},
+  volume =    {17},
+  pages =     {675--684},
+}
+
+@Article{Soman2005,
+  author =    {T. Soman and P. O. Bobbie},
+  title =     {Classification of Arrhythmia Using Machine Learning Techniques},
+  journal =   {WSEAS Transactions on Computers},
+  year =      {2005},
+  volume =    {4},
+  number =    {6},
+  pages =     {548--552},
+  month =     {June},
+}
+
+@InProceedings{Chengwei2006,
+  author =       {Li Chengwei and Wang Shoubin and Xu Aijun and Peng Hui},
+  title =        {Clinical Diagnosis of Cardiac Disease Based on Support Vector Machine},
+  booktitle = {World Congress on Medical Physics and Biomedical Engineering},
+  pages =     {1273--1276},
+  year =      {2006},
+  editor =    {R. Magjarevic and J. H. Nagel},
+  volume =    {14},
+  series =    {IFMBE Proceedings},
+  publisher = {Springer Berlin Heidelberg},
+}
+
+@Article{Chiu2005,
+  author =       {Chuang-Chien Chiu and Tong-Hong Lin and Ben-Yi Liau},
+  title =        {Using correlation coefficient in {ECG} waveform for arrhythmia detection},
+  journal =      BME,
+  year =         {2005},
+  volume =    {17},
+  number =    {3},
+  pages =     {147--152},
+  month =     {June},
+}
+
+@Article{Silipo1998,
+  author =       {Rosaria Silipo and Carlo Marchesi},
+  title =        {Artificial Neural Networks for Automatic {ECG} Analysis},
+  journal =      IEEE_trans_SP,
+  year =         {1998},
+  volume =    {46},
+  number =    {5},
+  pages =     {1417--1425},
+  month =     {May},
+}
+
+@Article{Osowski2004,
+  author =       {Stanislaw Osowski and Linh Tran Hoai and Tomasz Markiewicz},
+  title =        {Support Vector Machine-Based Expert System for 
+Reliable Heartbeat Recognition},
+  journal =      IEEE_trans_biomed,
+  year =         {2004},
+  volume =    {51},
+  number =    {4},
+  pages =     {582--589},
+  month =     {April},
+}
+
+@article{PhysioNet,
+ author = PhysioNetAuthors,
+ title = "{PhysioBank, PhysioToolkit, and PhysioNet}: Components of a New
+	  Research Resource for Complex Physiologic Signals",
+ journal = "Circulation",
+ year = PhysioNetYear,
+ volume = "101",
+ number = "23",
+ pages = "e215--e220",
+ note = PhysioNetNote,
+}
+
+@article{Lin2007,
+    author = {Lin, Jessica and Keogh, Eamonn and Wei, Li and Lonardi, Stefano},
+    citeulike-article-id = {2821475},
+    citeulike-linkout-0 = {http://dblp.uni-trier.de/rec/bibtex/journals/datamine/LinKWL07},
+    citeulike-linkout-1 = {http://dx.doi.org/10.1007/s10618-007-0064-z},
+    citeulike-linkout-2 = {http://www.springerlink.com/content/g69808822l82t325},
+    day = {18},
+    doi = {10.1007/s10618-007-0064-z},
+    journal = DMKD,
+    keywords = {simulation},
+    month = {October},
+    number = {2},
+    pages = {107--144},
+    posted-at = {2008-05-21 23:56:04},
+    priority = {2},
+    title = {Experiencing SAX: a novel symbolic representation of time series},
+    url = {http://dx.doi.org/10.1007/s10618-007-0064-z},
+    volume = {15},
+    year = {2007}
+}
+
+@inproceedings{Lin2010,
+  author = {Lin, Jessica and Li, Yuan},
+  title = {Finding Structural Similarity in Time Series Data Using Bag-of-Patterns Representation},
+  booktitle = SSDBM2009,
+  year = {2009},
+  isbn = {978-3-642-02278-4},
+  pages = {461--477},
+  location = {New Orleans, LA, USA},
+  doi = {10.1007/978-3-642-02279-1_33},
+  publisher = {Springer-Verlag},
+  address = {Berlin, Heidelberg},
+ }
+
+@Article{Ham1996,
+  author =       {F. M. Ham and Soowhan Han},
+  title =        {Classification of cardiac arrhythmias using fuzzy ARTMAP},
+  journal =      IEEE_trans_biomed,
+  year =         {1996},
+  volume =    {43},
+  number =    {4},
+  pages =     {425--429},
+  month =     {April},
+}
+@article{Engin2004,
+  title = "{ECG} beat classification using neuro-fuzzy network",
+  journal = PRL,
+  volume = "25",
+  number = "15",
+  pages = "1715--1722",
+  year = "2004",
+  issn = "0167-8655",
+  doi = "10.1016/j.patrec.2004.06.014",
+  url = "http://www.sciencedirect.com/science/article/B6V15-4D0Y5TH-2/2/b83f364f61d79f96abeb1bc1b1898ab9",
+  author = "Mehmet Engin",
+  keywords = "ECG beat classification,
+              MIT/BIH database,
+              Neuro-fuzzy networks,
+              Higher-order statistics,
+              Wavelet transform,
+              AR modelling,
+              Pattern recognition"
+}
+
+@article{Turaga2010,
+ author = {S. C. Turaga and J. F. Murray and V. Jain and F. Roth and M. Helmstaedter and K. Briggman and W. Denk and H. S. Seung}, 
+ title = {Convolutional networks can learn to generate affinity graphs for image segmentation}, 
+ journal = {Neural Computation}, 
+ volume = 22, 
+ pages = {511--538},
+ year = 2010,
+}
+
+@article{Hahnloser-2003,
+ author = {Richard H.R. Hahnloser and H. Sebastian Seung and J.J. Slotine},
+ title = {Permitted and forbidden sets in symmetric threshold-linear networks},
+ journal = {Neural Computation},
+ volume = 15,
+ pages = {621--638},
+ year = 2003,
+}
+
+@techreport{Jenatton-2009,
+ title={Structured Variable Selection with Sparsity-Inducing Norms},
+ author={Jenatton, R. and Audibert, J.-Y. and Bach, F.},
+ institution={arXiv:0904.3523},
+ year={2009}
+}
+
+@ARTICLE{Erhan2010,
+    author = {Erhan, Dumitru and Bengio, Yoshua and Courville, Aaron and Manzagol, Pierre-Antoine and Vincent, Pascal and Bengio, Samy},
+     month = feb,
+     title = {Why Does Unsupervised Pre-training Help Deep Learning?},
+   journal = jmlr,
+    volume = {11},
+      year = {2010},
+     pages = {625--660},
+  abstract = {Much recent research has been devoted to learning algorithms for deep architectures such as Deep Belief Networks and stacks of auto-encoder variants, with impressive results obtained in several areas, mostly on vision and language datasets. The best results obtained on supervised learning tasks involve an unsupervised learning component, usually in an unsupervised pre-training phase. Even though these new algorithms have enabled training deep models, many questions remain as to the nature of this difficult learning problem. The main question investigated here is the following: why does unsupervised pre-training work and why does it work so well? Answering these questions is important if learning in deep architectures is to be further improved. We propose several explanatory hypotheses and test them through extensive simulations. We empirically show the influence of pre-training with respect to architecture depth, model capacity, and number of training examples. The experiments confirm and clarify the advantage of unsupervised pre-training. The results suggest that unsupervised pre-training guides the learning towards basins of attraction of minima that are better in terms of the underlying data distribution; the evidence from these results supports a regularization explanation for the effect of pre-training.}
+}
+
+@ARTICLE{Bengio2009FTML,
+    author = {Bengio, Yoshua},
+     title = {Learning deep architectures for {AI}},
+   journal = FTML,
+    volume = {2},
+    number = {1},
+      year = {2009},
+     pages = {1--127},
+      note = Bengio2009FTML_note,
+  abstract = {Theoretical results suggest that in order to learn the kind of
+complicated functions that can represent high-level abstractions (e.g. in
+vision, language, and other AI-level tasks), one may need {\insist deep
+architectures}. Deep architectures are composed of multiple levels of non-linear
+operations, such as in neural nets with many hidden layers or in complicated
+propositional formulae re-using many sub-formulae. Searching the
+parameter space of deep architectures is a difficult task, but
+learning algorithms such as those for Deep Belief Networks have recently been proposed
+to tackle this problem with notable success, beating the state-of-the-art
+in certain areas. This paper discusses the motivations and principles regarding 
+learning algorithms for deep architectures,  in particular those exploiting as
+building blocks unsupervised learning of single-layer models such as Restricted {Boltzmann} Machines,
+used to construct deeper models such as Deep Belief Networks.}
+}
+
+@ARTICLE{Bengio1994ITNN,
+    author = {Bengio, Yoshua and Simard, Patrice and Frasconi, Paolo},
+     title = {Learning Long-Term Dependencies with Gradient Descent is Difficult},
+   journal = IEEE_trans_NN,
+    volume = {5},
+    number = {2},
+      year = {1994},
+     pages = {157--166},
+  abstract = {Recurrent neural networks can be used to map input sequences to output sequences, such as for recognition, production or prediction problems. However, practical difficulties have been reported in training recurrent neural networks to perform tasks in which the temporal contingencies present in the input/output sequences span long intervals. We show why gradient based learning algorithms face an increasingly difficult problem as the duration of the dependencies to be captures increases. These results expose a trade-off between efficient learning by gradient descent and latching on information for long periods. Based on an understanding of this problem, alternatives to standard gradient descent are considered.},
+optnote={(Special Issue on Recurrent Neural Networks)},topics={LongTerm},cat={J},
+}
+
+@article{Kohler1992,
+    abstract = {The QRS complex is the most striking waveform within the electrocardiogram (ECG). Since it reflects the electrical activity within the heart during the ventricular contraction, the time of its occurrence as well as its shape provide much information about the current state of the heart. Due to its characteristic shape it serves as the basis for the automated determination of the heart rate, as an entry point for classification schemes of the cardiac cycle, and often it is also used in ECG data compression algorithms. In that sense, QRS detection provides the fundamentals for almost all automated ECG analysis algorithms. Software QRS detection has been a research topic for more than 30 years. The evolution of these algorithms clearly reflects the great advances in computer technology. Within the last decade many new approaches to QRS detection have been proposed; for example, algorithms from the field of artificial neural networks genetic algorithms wavelet transforms, filter banks as well as heuristic methods mostly based on nonlinear transforms. The authors provide an overview of these recent developments as well as of formerly proposed algorithms},
+    author = {Kohler, B. U. and Hennig, C. and Orglmeister, R.},
+    citeulike-article-id = {546409},
+    citeulike-linkout-0 = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=993193},
+    journal = eng_med_bio,
+    keywords = {detector, ecg\_processing, qrs, qt\_interval, review\_article, rr\_interval},
+    number = {1},
+    pages = {42--57},
+    posted-at = {2007-11-25 20:38:19},
+    priority = {2},
+    title = {The principles of software QRS detection},
+    url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=993193},
+    volume = {21},
+    year = {2002}
+}
+
+@inproceedings{Thomas2006,
+author = {Julien Thomas and Cedric Rose and Francois Charpillet},
+title = {A Multi-{HMM} Approach to {ECG} Segmentation},
+booktitle = ICTAI06,
+issn = {1082-3409},
+year = {2006},
+pages = {609--616},
+doi = {10.1109/ICTAI.2006.17},
+publisher = {IEEE Computer Society},
+address = {Los Alamitos, CA, USA},
+}
+
+@inproceedings{Cortes+al-2000,
+ author = {Juan Carlos P\'{e}rez-Cortes and Rafael Llobet and Joaquim Arlandis},
+ title = {Fast and Accurate Handwritten Character Recognition Using Approximate Nearest Neighbours Search on Large Databases},
+ booktitle = iapr,
+ year = {2000},
+ isbn = {3-540-67946-4},
+ pages = {767--776},
+ publisher = {Springer-Verlag},
+ address = {London, UK},
+ }
+
+
+@Article{Oliveira+al-2002,
+  author =       "Oliveira, L.S.  and  Sabourin, R.  and  Bortolozzi, F.  and  Suen, C.Y.",
+  title =        "Automatic recognition of handwritten numerical strings: a recognition and verification strategy",
+  journal =      ieeetpami,
+  volume =       "24",
+  number =       "11",
+  pages =        "1438--1454",
+  month =        nov,
+  year =         "2002",
+  doi  =         "10.1109/TPAMI.2002.1046154",
+  issn =         "0162-8828",
+}
+
+@inproceedings{SimardSP03,
+  author    = {Patrice Simard and
+               David Steinkraus and
+               John C. Platt},
+  title     = {Best Practices for Convolutional Neural Networks Applied
+               to Visual Document Analysis},
+  booktitle = {ICDAR},
+  year      = {2003},
+  pages     = {958--962},
+  ee        = {http://csdl.computer.org/comp/proceedings/icdar/2003/1960/02/196020958abs.htm},
+  crossref  = {DBLP:conf/icdar/2003},
+  bibsource = {DBLP, http://dblp.uni-trier.de}
+}
+
+@inproceedings{SimardSP03-short,
+  author    = {Patrice Simard and
+               David Steinkraus and
+               John C. Platt},
+  title     = {Best Practices for Convolutional Neural Networks Applied
+               to Visual Document Analysis},
+  booktitle = {ICDAR},
+  year      = {2003},
+  pages     = {958--962},
+}
+
+@inproceedings{Milgram+al-2005,
+  author = {Milgram, J. and Cheriet, M. and Sabourin, R.},
+  title = {Estimating accurate multi-class probabilities with support vector machines},
+  booktitle = {Int. Joint Conf. on Neural Networks},
+  year = {2005},
+  pages = {1906--1911},
+  location = {Montreal, Canada},
+ }
+
+@proceedings{DBLP:conf/icdar/2003,
+  title     = {7th International Conference on Document Analysis and Recognition
+               (ICDAR 2003), 2-Volume Set, 3-6 August 2003, Edinburgh,
+               Scotland, UK},
+  booktitle = {ICDAR},
+  publisher = {IEEE Computer Society},
+  year      = {2003},
+  isbn      = {0-7695-1960-1},
+  bibsource = {DBLP, http://dblp.uni-trier.de}
+}
+
+
+@article{Granger+al-2007,
+    author = {Eric Granger and Philippe Henniges and Robert Sabourin and Luiz S. Oliveira},
+    title = {Supervised Learning of Fuzzy {ARTMAP} Neural Networks Through Particle Swarm Optimization},
+    journal = jprr,
+    year = {2007},
+    volume = "2",
+    number = "1",
+    pages = "27--60",
+}
+
+@inproceedings{SnowEtAl2008,
+    author = {Snow, R. and O'Connor, B. and Jurafsky, D. and Ng, A.},
+    booktitle = {Proc. Empirical Methods in NLP},
+    pages = {254--263},
+    title = {Cheap and Fast -- But is it Good? Evaluating Non-Expert Annotations for Natural Language Tasks},
+    year = {2008}
+}
+
+@TECHREPORT{Garris94+al-1994,
+    author = {Michael D. Garris and James L. Blue and Gerald T. Candela and Darrin L. Dimmick and Jon Geist and Patrick J. Grother and Stanley A. Janet and Charles L. Wilson},
+    title = {{NIST} Form-Based Handprint Recognition System},
+    institution = {National Institute of Standards and Technology},
+    number = {NISTIR 5469 and CD-ROM},
+    year = {1994},
+    url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.45.1560},
+}
+
+@inproceedings{SorokinAndForsyth2008,
+    author = {Sorokin, A. and Forsyth, D.},
+    booktitle = {CVPR Workshops},
+    pages = {1--8},
+    title = {Utility data annotation with Amazon Mechanical Turk},
+    year = {2008}
+}
+
+@inproceedings{Grother-1995,
+        AUTHOR = "Grother, P.J.",
+        TITLE = "Handprinted Forms and Character Database, NIST Special Database 19",
+        BOOKTITLE = "National Institute of Standards and Technology (NIST) Intelligent Systems Division (NISTIR)",
+        YEAR = "1995",
+        BIBSOURCE = "http://www.visionbib.com/bibliography/char1015.html#TT105853"
+}
+
+@inproceedings{whitehill09,
+ title = {Whose Vote Should Count More: Optimal Integration of Labels from Labelers of Unknown Expertise},
+ author = {J. Whitehill and P. Ruvolo and T. Wu and J. Bergsma and J. Movellan},
+ booktitle = NIPS22,
+ pages = {2035--2043},
+ year = 2009
+}
--- a/writeup/ml.bib	Tue Jun 01 12:12:52 2010 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,25811 +0,0 @@
-%%WARNING: READ THE README FILE BEFORE ANY MODIFICATION!!!
-
-
-%%submitted papers
-%%%
-
-@Article{Bergstra+Bengio+Louradoj-2008sub,
-  author =       "J. Bergstra and Y. Bengio and J. Louradour",
-  title =        "Suitability of Complex Cell Models for Object Categorization",
-  journal =      "Computational Neuroscience",
-  year =         "2008",
-  note =         "Rejected."
-}
-@Article{Bergstra+Bengio+Louradoj-2009sub,
-  author =       "J. Bergstra and Y. Bengio and J. Louradour",
-  title =        "Suitability of Complex Cell Models for Object Categorization",
-  journal =      "Neural Computation",
-  year =         "2009",
-  note =         "Submitted."
-}
-@Article{Chapados+Bengio-2008sub,
-  author =       "N. Chapados and Y. Bengio",
-  title =        "Forecasting and Trading Commodity Contract Spreads with {G}aussian Processes",
-  journal =      "International Journal of Forecasting",
-  year =         "2008",
-  note = "Submitted.",
-}
-@Article{Chapados+Bengio-2008sub2,
-  author =       "N. Chapados and Y. Bengio",
-  title =        "Training Graphs of Learning Modules for Sequential Data",
-  journal =      "ACM Transactions on Knowledge Discovery from Data",
-  year =         "2008",
-  note = "Submitted.",
-}
-
-%%%
-%%accepted or published papers
-%%%
-
-@Article{Grother,
-  author = "Grother Patrick J.",
-  title = "NIST special database. Handprinted forms and characters database",
-  publisher = "National institute of standards and technology",
-  year = "1995"
-}
-
-@InCollection{Trentin+al-2002,
-  author =       "E. Trentin and F. Brugnara and Y. Bengio and C. Furlanello and R.  De Mori",
-  editor =       "R. Daniloff",
-  booktitle =    "Connectionist Approaches to Clinical Problems in Speech
-and Language",
-  title =        "Statistical and Neural Network Models for Speech Recognition",
-  publisher =    "Lawrence Erlbaum",
-  pages =        "213--264",
-  year =         "2002",
-}
-
-@InCollection{Bengio+grandvalet-2004,
-  author =       "Y. Bengio and Y. Grandvalet",
-  editor =       "P. Duchesne and B. Remillard",
-  booktitle =    "Statistical Modeling and Analysis for Complex Data Problem",
-  title =        "Bias in Estimating the Variance of K-Fold Cross-Validation",
-  publisher =    "Lawrence Erlbaum",
-  address =      "Kluwer",
-  pages =        "75--95",
-  year =         "2004",
-}
-
-@InCollection{Dugas+al-2004,
-  author =       "C. Dugas and Y. Bengio and N. Chapados and P. Vincent and G. Denoncourt and C. Fournier",
-  editor =       "L. Jain and A.F. Shapiro",
-  booktitle =    "Intelligent and Other Computational Techniques in Insurance: Theory and
-Applications",
-  title =        "Statistical Learning Algorithms Applied to Automobile Insurance Ratemaking",
-  publisher =    "World Scientific Publishing Company",
-  year =         "2004",
-}
-
-@InCollection{Dugas+al-2004-short,
-  author =       "C. Dugas and Y. Bengio and N. Chapados and P. Vincent and G. Denoncourt and C. Fournier",
-  booktitle =    "Intelligent and Other Computational Techniques in Insurance: Theory and
-Applications",
-  title =        "Statistical Learning Algorithms Applied to Automobile Insurance Ratemaking",
-  publisher =    "World Scientific Publishing Company",
-  year =         "2004",
-}
-
-@inproceedings{Collobert+Bengio+Bengio-2002b,
-    author = "R. Collobert and Y. Bengio and S. Bengio",
-    title = {Scaling Large Learning Problems with Hard Parallel Mixtures},
-    editor = "S.W. Lee and A. Verri",
-    year = 2002,
-    booktitle = SVM02,
-    volume = "2388 of Lecture Notes in Computer Science",
-    publisher = "Springer-Verlag",
-    pages = "8--23",
-}
-
-@Article{Collobert+Bengio+Bengio-2003,
-  author =       "R. Collobert and Y. Bengio and S. Bengio.",
-  title =        "Scaling Large Learning Problems with Hard Parallel Mixtures",
-  journal =      ijprai,
-  volume =       "17",
-  number =       "3",
-  pages =        "349--365",
-  year =         "2003",
-}
-
-@Article{Collobert+Bengio+Bengio-2003-small,
-  author =       "R. Collobert and Y. Bengio and S. Bengio.",
-  title =        "Scaling Large Learning Problems with Hard Parallel Mixtures",
-  journal =      "Int. J. Pattern Recognition and Artificial Intelligence",
-  volume =       "17(3)",
-  pages =        "349--365",
-  year =         "2003",
-}
-
-@InProceedings{Bengio+Chapados-2002,
-  author =       "Y. Bengio and N. Chapados",
-  title =        "Metric-based Model Selection for Time-Series Forecasting",
-  publisher =    "IEEE Press",
-  editor =       NIPS12ed,
-  booktitle =    NIPS12,
-  year =         "2002",
-  pages = "13--24",
-}
-
-@InProceedings{Bengio+Takeuchi+Kanamori-2002,
-  author =       "Y. Bengio and I. Takeuchi and K. Kanamori",
-  title =        "The Challenge of Non-Linear Regression on Large Datasets with Asymmetric Heavy Tails",
-  publisher =    "American Statistical Association publ.",
-  booktitle =    JSM02,
-  year =         "2002",
-  pages = "193-205"
-}
-
-@InProceedings{Bengio+Takeuchi+Kanamori-2002-short,
-  author =       "Y. Bengio and I. Takeuchi and K. Kanamori",
-  title =        "The Challenge of Non-Linear Regression on Large Datasets with Asymmetric Heavy Tails",
-  booktitle =    JSM02,
-  year =         "2002",
-}
-
-@InProceedings{Collobert+Bengio+Bengio-2002,
-  author =       "R. Collobert ans S. Bengio and Y. Bengio",
-  title =        "A Parallel Mixture of {SVM}s for Very Large Scale Problems",
-  booktitle =    NIPS14,
-  editor =       NIPS14ed,
-  pages =        "633--640",
-  year =         "2002",
-}
-
-@InProceedings{Bhattacharya+Getoor+Bengio-2004,
-  author =       "I. Bhattacharya and L. Getoor and Y. Bengio",
-  booktitle =    "Conference of the Association for Computational Linguistics (ACL'04)",
-  title =        "Unsupervised Sense Disambiguation Using Bilingual Probabilistic Models",
-  year =         "2004",
-}
-@InProceedings{Boufaden+Bengio+Lapalme-2008,
-  author =       "N. Boufaden and Y. Bengio and G. Lapalme",
-  booktitle =    "{\em TALN'2004}, Traitement Automatique du Langage Naturel.",
-  title =        "Approche statistique pour le repérage de mots informatifs dans les textes oraux",
-  year =         "2004",
-}
-@InProceedings{Chapados+Bengio-2006,
-  author =       "N. Chapados and Y. Bengio",
-  booktitle =    AI06,
-  title =        "The K Best-Paths Approach to Approximate Dynamic Programming with Application to Portfolio Optimization",
-  pages =        "491-502",
-  year =         "2006",
-}
-@InProceedings{Rivest+Bengio+Kalaska-2005,
-  author =       "F. Rivest and Y. Bengio and J. Kalaska",
-  editor =       NIPS17ed,
-  booktitle =    NIPS17,
-  title =        "Brain Inspired Reinforcement Learning",
-  publisher =    "MIT Press, Cambridge",
-  address =      "Cambridge, MA",
-  pages =        "1129-1136",
-  year =         "2005",
-}
-
-@InProceedings{Bengio+Grandvalet-NIPS-2004,
-  author =       "Y. Bengio Y. and Y. Grandvalet",
-  editor =       NIPS16ed,
-  booktitle =    NIPS16,
-  title =        "No Unbiased Estimator of the Variance of K-Fold Cross-Validation",
-  publisher =    "MIT Press, Cambridge",
-  address =      "Cambridge, MA",
-  year =         "2004",
-}
-
-@InProceedings{Bengio+Grandvalet-NIPS-2004-short,
-  author =       "Y. Bengio Y. and Y. Grandvalet",
-  booktitle =    NIPS16,
-  title =        "No Unbiased Estimator of the Variance of K-Fold Cross-Validation",
-  publisher =    "MIT Press, Cambridge",
-  year =         "2004",
-}
-
-@article{Zaccaro-et-al-2005,
- author = {Maria Clara Zaccaro and Hong Boon Lee and Mookda Pattarawarapan and 
-           Zebin Xia and Antoine Caron and Pierre-Jean L'Heureux and Yoshua Bengio
-           and Kevin Burgess and H. Uri Saragovi},
- title = {Selective Small Molecule Peptidomimetic Ligands of {TrkC} and {TrkA} Receptors Afford Discrete or Complete Neurotrophic Activities},
- journal = {Chemistry \& Biology},
- volume = 12,
- number = 9,
- pages = {1015--1028},
- year = 2005,
-}
-
-@Article{63a:man,
-  author =       "B. Mandelbrot",
-  title =        "The variation of certain speculative prices",
-  journal =      "Journal of Business",
-  volume =       "36",
-  pages =        "394--419",
-  year =         "1963",
-  annote =       "Référence pour les distributions stables en finance",
-}
-
-@Article{65a:fam,
-  author =       "E. F. Fama",
-  title =        "The behavior of stock market prices",
-  journal =      "Journal of Business",
-  volume =       "38",
-  pages =        "34--105",
-  year =         "1965",
-  annote =       "Autre référence pour les distributions stables en
-                 finance",
-}
-
-@Article{96a:cor:gon:har,
-  author =       "R. M. Corless and G. H. Gonnet and D. E. G. Hare and
-                 D. J. Jeffrey and D. E. Knuth",
-  title =        "On the {Lambert} {W} Function",
-  journal =      "Advances in Computational Mathematics",
-  volume =       "5",
-  pages =        "329--359",
-  year =         "1996",
-  annote =       "Sert à résoudre les équations où une variable et son
-                 logarithme (ou exponentielle) apparaissent
-                 simultanément",
-}
-
-@Book{97b:emb:klu:mik,
-  author =       "P. Embrechts and C. Kluppelberg and T. Mikosch",
-  title =        "Modelling Extremal Events",
-  publisher =    "Springer",
-  year =         "1997",
-  series =       "Applications of Mathematics, Stochastic Modelling and
-                 Applied Probability",
-  annote =       "book on evt: theory, statistical methods for gev",
-}
-
-@Article{99a:kan:ser,
-  author =       "S. Kang and R. F. Serfozo",
-  title =        "Extreme values of phase-type and mixed random
-                 variables with parallel-processing examples",
-  journal =      "Journal of Applied Probability",
-  volume =       "36",
-  pages =        "194--210",
-  year =         "1999",
-  annote =       "limiting distribution of the maximum of r.v. i.i.d
-                 from a mixture is determined by the component of the
-                 mixture that has a dominant tail",
-}
-
-@TechReport{Abdallah+Plumbley-06,
-  author =       "Samer Abdallah and Mark Plumbley",
-  title =        "Geometry Dependency Analysis",
-  number =       "C4DM-TR06-05",
-  institution =  "Center for Digital Music, Queen Mary, University of
-                 London",
-  year =         "2006",
-}
-
-@Article{Abe+Warmuth92,
-  author =       "N. Abe and M. K. Warmuth",
-  title =        "On the Computational Complexity of Approximating
-                 Distributions by Probabilistic Automata",
-  journal =      "Machine Learning",
-  volume =       "9",
-  month =        jul,
-  year =         "1992",
-}
-
-@Article{Abu-Mostafa-hints,
-  author =       "Y. S. Abu-Mostafa",
-  title =        "Learning from Hints in Neural Networks",
-  journal =      jcomp,
-  volume =       "6",
-  pages =        "192--198",
-  year =         "1990",
-}
-
-@Article{Abu-Mostafa87,
-  author =       "Y. S. Abu-Mostafa and D. Psaltis",
-  title =        "Optical Neural Computers",
-  journal =      sciam,
-  volume =       "256",
-  pages =        "88--95",
-  month =        mar,
-  year =         "1987",
-}
-
-@Article{Abu-Mostafa89,
-  author =       "Y. S. Abu-Mostafa",
-  title =        "The {Vapnik}-{Chervonenkis} Dimension: Information
-                 versus Complexity in Learning",
-  journal =      nc,
-  volume =       "1",
-  pages =        "312--317",
-  year =         "1989",
-}
-
-@Article{abumostafa95,
-  author =       "Yaser S. Abu-Mostafa",
-  title =        "Hints",
-  journal =      "Neural Computation",
-  volume =       "7",
-  number =       "4",
-  pages =        "639--671",
-  month =        jul,
-  year =         "1995",
-}
-
-@misc{Ackerman+BenDavid-2008,
-    author = "Margareta Ackerman and Shai Ben-David",
-    title = "Clustering Quality Measures",
-    year = 2008,
-    note = "{\em Snowbird Learning Workshop}",
-}
-
-@Article{Ackley85,
-  author =       "D. H. Ackley and G. E. Hinton and T. J. Sejnowski",
-  title =        "A Learning Algorithm for {Boltzmann} Machines",
-  journal =      cogsci,
-  volume =       "9",
-  pages =        "147--169",
-  year =         "1985",
-}
-
-@InProceedings{Ackley90,
-  author =       "D. H. Ackley and M. S. Littman",
-  editor =       NIPS2ed,
-  booktitle =    NIPS2,
-  title =        "Generalization and Scaling in Reinforcement Learning",
-  publisher =    "Morgan Kaufmann, San Mateo",
-  address =      "Denver, CO",
-  pages =        "550--557",
-  year =         "1990",
-}
-
-@Article{ACM:Rohwer94,
-  author =       "R. Rohwer",
-  title =        "The time dimension of neural network models",
-  journal =      "ACM Sigart Bulleting",
-  volume =       "5",
-  number =       "3",
-  pages =        "36--44",
-  month =        jul,
-  year =         "1994",
-}
-
-@article{AdelsonBergen1985,
-    author={E. H. Adelson and J. R. Bergen},
-    title={Spatiotemporal Energy Models for the Perception of Motion},
-    journal={Journal of the Optical Society of America},
-    volume=2,
-    number=2,
-    year=1985,
-    pages={284-99},
-}
-
-@Article{Agrawala70,
-  author = 	 {Ashok Kumar Agrawala},
-  title = 	 {Learning with a Probabilistic Teacher},
-  journal = 	 {IEEE Transactions on Information Theory},
-  year = 	 1970,
-  volume =	 16,
-  pages =	 {373-379}
-}
-
-@Article{Ahalt90,
-  author =       "S. C. Ahalt and A. K. Krishnamurthy and P. Chen and D.
-                 E. Melton",
-  title =        "Competitive Learning Algorithms for Vector
-                 Quantization",
-  journal =      nn,
-  volume =       "3",
-  pages =        "277--290",
-  year =         "1990",
-}
-
-@InProceedings{Ahmad93,
-  author =       "S. Ahmad and V. Tresp",
-  editor =       NIPS5ed,
-  booktitle =    NIPS5,
-  title =        "Some Solutions to the Missing Feature Problem in
-                 Vision",
-  publisher =    "Morgan Kaufman Publishers",
-  address =      "San Mateo, CA",
-  year =         "1993",
-}
-
-@inproceedings{Ahmed2008,
- author = {Amr Ahmed and Kai Yu and Wei Xu and Yihong Gong and Eric P. Xing},
- booktitle = {Proceedings of the 10th European Conference on Computer Vision (ECCV'08)},
- title = {Training Hierarchical Feed-forward Visual Recognition Models Using Transfer Learning from Pseudo Tasks},
- year = 2008,
- pages = "69--82",
-}
-
-@article{AitchisonJ1976,
-	author = {John Aitchison and Colin Aitken},
-	journal = {Biometrika},
-	number = {3},
-	pages = {413--420},
-	title = {Multivariate binary discrimination by the kernel method},
-	volume = {63},
-	year = {1976}
-}
-
-@Article{Aizerman64,
-  author =       "Mark A. Aizerman and Emmanuel M. Braverman and Lev I.
-                 Rozonoer",
-  title =        "Theoretical Foundations of the Potential Function
-                 Method in Pattern Recognition Learning",
-  journal =      "Automation and Remote Control",
-  volume =       "25",
-  pages =        "821--837",
-  year =         "1964",
-}
-
-@Article{Ajtai83,
-  author =       "Miklos Ajtai",
-  title =        "$\sum_1^1$-formulae on finite structures",
-  journal =      "Annals of Pure and Applied Logic",
-  volume =       "24",
-  number =      "1",
-  pages =        "1--48",
-  year =         "1983",
-}
-
-@Article{Akaike74,
-  author =       "H. Akaike",
-  title =        "A New Look at the Statistical Model Identification",
-  journal =      ieeeac,
-  volume =       "AC-19",
-  number =       "6",
-  pages =        "716--728",
-  year =         "1974",
-}
-
-@Article{Al-Mashouq-hints,
-  author =       "K. A. Al-Mashouq and I. S. Reed",
-  title =        "Including Hints in Training Neural Nets",
-  journal =      nc,
-  volume =       "3",
-  number =       "4",
-  pages =        "418--430",
-  year =         "1991",
-}
-
-@Book{Aleksander:90,
-  author =       "I. Aleksander and H. Morton",
-  title =        "An Introduction to Neural Computing",
-  publisher =    "Chapman and Hall",
-  address =      "London",
-  year =         "1990",
-  keywords =     "",
-}
-
-@InProceedings{Aleksander:93,
-  author =       "I. Aleksander and H. Morton",
-  editor =       "J. Mira and J. Cabestany and A. Prieto",
-  booktitle =    "New Trends in Neural Computation: Proc. of the
-                 International Workshop on Artificial Neural Networks
-                 IWANN'93",
-  title =        "A Neural State Machine for Iconic Language
-                 Representation",
-  publisher =    "Springer",
-  address =      "Berlin, Heidelberg",
-  pages =        "84--89",
-  year =         "1993",
-  keywords =     "",
-}
-
-@InProceedings{Allender96,
-  author =       "Eric Allender",
-  booktitle =    "16th Annual Conference on Foundations of Software
-                 Technology and Theoretical Computer Science",
-  title =        "Circuit Complexity Before the Dawn of the New
-                 Millennium",
-  publisher =    "Lecture Notes in Computer Science 1180, Springer
-                 Verlag",
-  pages =        "1--18",
-  year =         "1996",
-}
-
-@InProceedings{Alleva93,
-  author =       "F. Alleva and X. Huang and M. Y. Hwang",
-  booktitle =    icassp,
-  title =        "An improved search algorithm using incremental
-                 knowledge for continuous speech recognition",
-  address =      "Minneapolis, Minnesota",
-  pages =        "307--310",
-  year =         "1993",
-}
-
-@Book{Allgower80,
-  author =       "E. L. Allgower and K. Georg",
-  title =        "Numerical Continuation Methods. {A}n Introduction",
-  number =       "13",
-  publisher =    "Springer-Verlag",
-  year =         "1980",
-  series =       "Springer Series in Computational Mathematics",
-}
-
-@Book{Allgower80-short,
-  author =       "E. L. Allgower and K. Georg",
-  title =        "Numerical Continuation Methods. {A}n Introduction",
-  publisher =    "Springer-Verlag",
-  year =         "1980",
-}
-
-@InProceedings{Almeida87,
-  author =       "L. B. Almeida",
-  editor =       "M. Caudill and C. Butler",
-  booktitle =    icnn,
-  title =        "A Learning Rule for Asynchronous Perceptrons with
-                 Feedback in a Combinatorial Environment",
-  volume =       "2",
-  publisher =    "IEEE, New York",
-  address =      "San Diego 1987",
-  pages =        "609--618",
-  year =         "1987",
-}
-
-@InProceedings{Almeida88,
-  author =       "L. B. Almeida",
-  editor =       "R. Eckmiller and Ch. von der Malsburg",
-  booktitle =    "Neural Computers",
-  title =        "Backpropagation in Perceptrons with Feedback",
-  publisher =    "Springer-Verlag, Berlin",
-  address =      "Neuss 1987",
-  pages =        "199--208",
-  year =         "1988",
-}
-
-@inproceedings{Almuallim+Dietterich-1991,
-    address = {Anaheim, California},
-    author = {Almuallim, H.  and Dietterich, T. G.},
-    booktitle = {Proceedings of the Ninth National Conference on Artificial Intelligence},
-    pages = {547--552},
-    publisher = {AAAI Press},
-    title = {Learning with many irrelevant features},
-    url = "http://citeseer.ist.psu.edu/almuallim91learning.html",
-    volume = {2},
-    year = {1991}
-}
-
-@article{Almuallim+Dietterich-1994,
-    author = "Hussein Almuallim and Thomas G. Dietterich",
-    title = "Learning Boolean Concepts in the Presence of Many Irrelevant Features",
-    journal = "Artificial Intelligence",
-    volume = "69",
-    number = "1-2",
-    pages = "279-305",
-    year = "1994",
-    url = "citeseer.ist.psu.edu/almuallim94learning.html"
-}
-
-
-@InProceedings{Alspector87,
-  author =       "J. Alspector and R. B. Allen",
-  editor =       "P. Losleben",
-  booktitle =    "Advanced Research in VLSI: Proceedings of the 1987
-                 Stanford Conference",
-  title =        "A Neuromorphic {VLSI} Learning System",
-  publisher =    "MIT Press, Cambridge",
-  pages =        "313--349",
-  year =         "1987",
-}
-
-@InProceedings{Alspector88,
-  author =       "J. Alspector and R. B. Allen and V. Hu and S.
-                 Satyanarayana",
-  editor =       nips87ed,
-  booktitle =    nips87,
-  title =        "Stochastic Learning Networks and Their Electronic
-                 Implementation",
-  publisher =    "American Institute of Physics, New York",
-  address =      "Denver, CO",
-  pages =        "9--21",
-  year =         "1988",
-}
-
-@Article{Amari+Wu-99,
-  author =       "S. Amari and S. Wu",
-  title =        "Improving {Support} {Vector} {Machine} classifiers by
-                 modifying kernel functions",
-  journal =      "Neural Networks",
-  volume =       "12",
-  pages =        "783--789",
-  year =         "1999",
-}
-
-@Article{amari00adaptive,
-  author =       "{Shun-ichi} Amari and Hyeyoung Park and Kenji Fukumizu",
-  title =        "Adaptive Method of Realizing Natural Gradient Learning
-                 for Multilayer Perceptrons",
-  journal =      "Neural Computation",
-  volume =       "12",
-  number =       "6",
-  pages =        "1399--1409",
-  year =         "2000",
-  URL =          "citeseer.ist.psu.edu/amari98adaptive.html",
-}
-
-@Article{Amari77,
-  author =       "S. A. Amari",
-  title =        "Dynamics of Pattern Formation in Lateral-Inhibition
-                 Type Neural Fields",
-  journal =      biocyb,
-  volume =       "27",
-  pages =        "77--87",
-  year =         "1977",
-}
-
-@Article{Amari80,
-  author =       "S. A. Amari",
-  title =        "Topographic Organization of Nerve Fields",
-  journal =      bmbiol,
-  volume =       "42",
-  pages =        "339--364",
-  year =         "1980",
-}
-
-@Article{amari98natural,
-  author =       "{Shun-ichi} Amari",
-  title =        "Natural Gradient Works Efficiently in Learning",
-  journal =      "Neural Computation",
-  volume =       "10",
-  number =       "2",
-  pages =        "251--276",
-  year =         "1998",
-  URL =          "citeseer.ist.psu.edu/article/amari98natural.html",
-}
-
-@Article{Amari99,
-  author =       "S. Amari and S. Wu",
-  title =        "Improving Support Vector Machine Classifiers by
-                 Modifying Kernel Functions",
-  journal =      "Neural Networks",
-  volume =       "12",
-  number =       "6",
-  pages =        "783--789",
-  year =         "1999",
-}
-
-@article{AmariS1997,
-	author = {{Shun-ichi} Amari and Noboru Murata and Klaus-Robert M{\"u}ller and Michael Finke  and Howard Hua Yang },
-	journal = {IEEE Transactions on Neural Networks},
-	keywords = {regularization},
-	number = {5},
-	pages = {985--996},
-	title = {Asymptotic statistical theory of overtraining and cross-validation},
-	volume = {8},
-	year = {1997}
-}
-
-@InProceedings{amaya01improvement,
-  author =       "Fredy A. Amaya and Jose-Miguel Bened\`{i}",
-  booktitle =    "Meeting of the Association for Computational
-                 Linguistics",
-  title =        "Improvement of a Whole Sentence Maximum Entropy
-                 Language Model Using Grammatical Features",
-  pages =        "10--17",
-  year =         "2001",
-  URL =          "citeseer.nj.nec.com/505752.html",
-}
-
-@InProceedings{BoufadenLapalmeBengio2001,
-  author =       "N. Boufaden and Lapalme G. and Bengio Y.",
-  booktitle =    "Proceedings of the Natural Language Pacific Rim Symposium, NLPRS-01",
-  title =        "Topic segmentation: First Stage of Dialogue-Based Information extraction Process",
-  year =         "2001",
-}
-
-@Article{Amit85a,
-  author =       "D. Amit and H. Gutfreund and H. Sompolinsky",
-  title =        "Spin-Glass Models of Neural Networks",
-  journal =      prA,
-  volume =       "32",
-  pages =        "1007--1018",
-  year =         "1985",
-}
-
-@Article{Amit85b,
-  author =       "D. Amit and H. Gutfreund and H. Sompolinsky",
-  title =        "Storing Infinite Numbers of Patterns in a Spin-Glass
-                 Model of Neural Networks",
-  journal =      prl,
-  volume =       "55",
-  pages =        "1530--1533",
-  year =         "1985",
-}
-
-@Article{Amit87a,
-  author =       "D. Amit and H. Gutfreund and H. Sompolinsky",
-  title =        "Statistical Mechanics of Neural Networks Near
-                 Saturation",
-  journal =      annphys,
-  volume =       "173",
-  pages =        "30--67",
-  year =         "1987",
-}
-
-@Article{Amit87b,
-  author =       "D. Amit and H. Gutfreund and H. Sompolinsky",
-  title =        "Information Storage in Neural Networks with Low Levels
-                 of Activity",
-  journal =      prA,
-  volume =       "35",
-  pages =        "2293--2303",
-  year =         "1987",
-}
-
-@Article{Amit88,
-  author =       "D. Amit",
-  title =        "Neural Networks for Counting Chimes",
-  journal =      PNAS,
-  volume =       "85",
-  pages =        "2141--2145",
-  year =         "1988",
-}
-
-@Book{Amit89,
-  author =       "D. Amit",
-  title =        "Modelling Brain Function",
-  publisher =    "Cambridge University Press",
-  address =      "Cambridge",
-  year =         "1989",
-}
-
-@Article{Ammar+Miao-2000,
-  author =       "Hany H. Ammar and Zhouhui Miao",
-  title =        "Parallel Algorithms for the Training Process of a
-                 Neural Network-Based System",
-  journal =      "International Journal of High Performance Computing
-                 Applications",
-  volume =       "14",
-  number =       "1",
-  pages =        "3--25",
-  year =         "2000",
-  URL =          "http://hpc.sagepub.com/cgi/content/abstract/14/1/3",
-  doi =          "10.1177/109434200001400101",
-  eprint =       "http://hpc.sagepub.com/cgi/reprint/14/1/3.pdf",
-}
-
-@Book{Anderson,
-  author =       "T. Anderson",
-  title =        "An Introduction to Multivariate Statistical
-                 Analysis.",
-  publisher =    "John Wiley and Sons",
-  address =      "New York",
-  year =         "1984",
-}
-
-@Article{Anderson68,
-  author =       "J. A. Anderson",
-  title =        "A Memory Model Using Spatial Correlation Functions",
-  journal =      kyb,
-  volume =       "5",
-  pages =        "113--119",
-  year =         "1968",
-}
-
-@Article{Anderson70,
-  author =       "J. A. Anderson",
-  title =        "Two Models for Memory Organization",
-  journal =      mbio,
-  volume =       "8",
-  pages =        "137--160",
-  year =         "1970",
-}
-
-@book{Hinton+Anderson-81,
- author = {G.E. Hinton and J.A. Anderson},
- title = {Parallel models of associative memory},
- publisher = {Lawrence Erlbaum Assoc.},
- address = {Hillsdale, NJ},
- year = 1981,
-}
-
-@InCollection{Anderson81,
-  author =       "J. A. Anderson and M. C. Mozer",
-  editor =       "G. E. Hinton and J. A. Anderson",
-  booktitle =    "Parallel Models of Associative Memory",
-  title =        "Categorization and Selective Neurons",
-  publisher =    "Lawrence Erlbaum",
-  address =      "Hillsdale",
-  pages =        "213--236",
-  year =         "1981",
-}
-
-@Article{Anderson86,
-  author =       "D. Z. Anderson",
-  title =        "Coherent Optical Eigenstate Memory",
-  journal =      optlett,
-  volume =       "11",
-  pages =        "56--58",
-  year =         "1986",
-}
-
-@Article{Anderson87,
-  author =       "C. H. Anderson and D. C. Van Essen",
-  title =        "Shifter Circuits: {A} Computational Strategy for
-                 Dynamic Aspects of Visual Processing",
-  journal =      PNAS,
-  volume =       "84",
-  pages =        "6297--6301",
-  year =         "1987",
-}
-
-@Book{Anderson88,
-  editor =       "J. A. Anderson and E. Rosenfeld",
-  title =        "Neurocomputing: Foundations of Research",
-  publisher =    "MIT Press",
-  address =      "Cambridge",
-  year =         "1988",
-}
-
-@InProceedings{Anderson89,
-  author =       "S. Anderson and J. W. L. Merrill and R. Port",
-  editor =       "D. Touretzky and G. Hinton and T. Sejnowski",
-  booktitle =    cmss88,
-  title =        "Dynamic Speech Categorization with Recurrent
-                 Networks",
-  publisher =    "Morgan Kaufmann, San Mateo",
-  address =      "Pittsburg 1988",
-  pages =        "398--406",
-  year =         "1989",
-}
-
-@Article{Ando+Zhange-JMLR-2005,
-  author =       "Rie Kubota Ando and Tong Zhang",
-  title =        "A Framework for Learning Predictive Structures from
-                 Multiple Tasks and Unlabeled Data",
-  journal =      jmlr,
-  volume =       "6",
-  pages =        "1817--1853",
-  year =         "2005",
-}
-
-@Article{Andrieu03,
-  author =       "Christophe Andrieu and Nando de Freitas and Arnaud
-                 Doucet and Michael I. Jordan",
-  title =        "An Introduction to {MCMC} for Machine Learning",
-  journal =      "Machine Learning",
-  volume =       "50",
-  number =       "1-2",
-  pages =        "5--43",
-  year =         "2003",
-}
-
-@Article{Andrieu2003,
-  author =       "C. Andrieu and N. de Freitas and A. Doucet and M.
-                 Jordan",
-  title =        "An introduction to {MCMC} for machine learning",
-  journal =      "Machine Learning",
-  volume =       "50",
-  pages =        "5--43",
-  year =         "2003",
-}
-
-@Article{Angeniol88,
-  author =       "B. Ang\'eniol and G. de La Croix Vaubois and J.-Y. Le
-                 Texier",
-  title =        "Self-Organizing Feature Maps and the Travelling
-                 Salesman Problem",
-  journal =      nn,
-  volume =       "1",
-  pages =        "289--293",
-  year =         "1988",
-}
-
-@Article{Angluin83,
-  author =       "D. Angluin and C. Smith",
-  title =        "Inductive Inference: Theory and Methods",
-  journal =      "Computing Surveys",
-  volume =       "15",
-  number =       "3",
-  pages =        "237--269",
-  year =         "1983",
-}
-
-@Book{Arbib87,
-  author =       "M. A. Arbib",
-  title =        "Brains, Machines, and Mathematics",
-  publisher =    "Springer-Verlag",
-  address =      "Berlin",
-  year =         "1987",
-}
-
-@Book{ARP94,
-  author =       "{Advanced Research Projects Agency}",
-  title =        "Proceedings of the 1994 {ARPA} Human Language
-                 Technology Workshop (Princeton, New Jersey, March
-                 1994)",
-  publisher =    "Morgan Kaufmann",
-  year =         "1994",
-}
-
-@Misc{Asuncion+Newman:2007,
-  author =       "A. Asuncion and D. J. Newman",
-  title =        "{UCI} Machine Learning Repository",
-  institution =  "University of California, Irvine, School of
-                 Information and Computer Sciences",
-  year =         "2007",
-  URL =          "http://www.ics.uci.edu/$\sim$mlearn/MLRepository.html",
-}
-
-@article{ashetal04,
-author = "Ash, J. and Berg, M. and Coiera, E.",
-title = "Some unintended consequences of 
-information technology in health care: the nature of patient care 
-information system-related errors",
-journal = "J Am Med Inform Assoc",
-volume = "11",
-number = 2,
-pages = "104-112",
-year = 2004,
-}
-
-@article{ashetal07,
-author = "Ash, J. and Sittig, D. and Dykstra, R. and Guappone, K. and 
-Carpenter, J. and Seshadri, V.",
-title = "Categorizing the unintended sociotechnical consequences of 
-computerized provider order entry",
-journal = "Int J Med Inform",
-volume = 76,
-number = "Suppl1",
-pages = "21-27",
-year = 2007,
-}
-
-@InProceedings{Atal83,
-  author =       "B. S. Atal",
-  booktitle =    icassp,
-  title =        "Efficient coding of {LPC} parameters by temporal
-                 decomposition",
-  address =      "Boston, MA",
-  pages =        "81--84",
-  year =         "1983",
-}
-
-@PhdThesis{Athaide95,
-  author =       "C. R. Athaide",
-  title =        "Likelihood estimation and state estimation for
-                 nonlinear state space models",
-  school =       "Graduate Group in Managerial Science and Applied
-                 Economics, University of Pennsylvania",
-  address =      "Philadelphia, PA",
-  year =         "1995",
-}
-
-@Book{Atherton-75,
-  author =       "D. P. Atherton",
-  title =        "Nonlinear Control Engineering",
-  publisher =    "Van Nostrand Reinhold",
-  address =      "Wokingam (England)",
-  year =         "1975",
-}
-
-@Article{atkeson96locally,
-  author =       "C. G. Atkeson and A. W. Moore and S. Schaal",
-  title =        "Locally Weighted Learning for Control",
-  journal =      "Artificial Intelligence Review",
-  volume =       "11",
-  pages =        "75--113",
-  year =         "1997",
-}
-
-@InProceedings{Aubert94,
-  author =       "X. Aubert and C. Dugast and H. Ney and V. Steinbiss",
-  booktitle =    icassp,
-  title =        "Large vocabulary continuous speech recognition of
-                 {Wall} {Street} {Journal} data",
-  address =      "Adelaide, Australia",
-  pages =        "129--132",
-  year =         "1994",
-}
-
-@InProceedings{Auer-96,
-  author =       "Peter Auer and Mark Herbster and Manfred K. Warmuth",
-  editor =       NIPS8ed,
-  booktitle =    NIPS8,
-  title =        "Exponentially Many Local Minima for Single Neurons",
-  publisher =    "MIT Press, Cambridge, MA",
-  pages =        "315--322",
-  year =         "1996",
-}
-
-@InProceedings{auer97,
-  author =       "Peter Auer",
-  booktitle =    "Proc. 14th International Conference on Machine
-                 Learning",
-  title =        "On learning from multi-instance examples: Empirical
-                 evaluation of a theoretical approach",
-  publisher =    "Morgan Kaufmann",
-  pages =        "21--29",
-  year =         "1997",
-}
-
-@InProceedings{b-cdmvqfa-97,
-  author =       "Jonathan Baxter",
-  booktitle =    "Proc. 14th International Conference on Machine
-                 Learning",
-  title =        "The canonical distortion measure for vector
-                 quantization and function approximation",
-  publisher =    "Morgan Kaufmann",
-  pages =        "39--47",
-  year =         "1997",
-}
-
-@InCollection{Bach-2007,
-  author =       "Francis Bach",
-  editor =       NIPS19ed,
-  booktitle =    NIPS19,
-  title =        "Active learning for misspecified generalized linear
-                 models",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  pages =        "",
-  year =         "2007",
-}
-
-@Article{Bachmann87,
-  author =       "C. M. Bachmann and L. N. Cooper and A. Dembo and O.
-                 Zeitouni",
-  title =        "A Relaxation Model for Memory with High Storage
-                 Density",
-  journal =      PNAS,
-  volume =       "84",
-  pages =        "7529--7531",
-  year =         "1987",
-}
-
-@MastersThesis{Bachrach88,
-  author =       "J. Bachrach",
-  title =        "Learning to Represent State",
-  school =       "University of Massachusetts",
-  address =      "Amherst",
-  year =         "1988",
-}
-
-@Article{Back-nc91,
-  author =       "A. D. Back and A. C. Tsoi",
-  title =        "{FIR} and {IIR} Synapses: {A} New Neural Network
-                 Architecture for Time Series Modeling",
-  journal =      nc,
-  volume =       "3",
-  number =       "3",
-  pages =        "375--385",
-  year =         "1991",
-}
-
-@InCollection{Bahadur61,
-  author =       "R. R. Bahadur",
-  editor =       "H. Solomon",
-  booktitle =    "Studies in Item Analysis and Prediction",
-  title =        "A representation of the joint distribution of
-                 responses to n dichotomous items",
-  publisher =    "Stanford University Press, California",
-  pages =        "158--168",
-  year =         "1961",
-}
-
-@InProceedings{bahl77,
-  author =       "L. R. Bahl and J. K. Baker and R. L. Mercer",
-  booktitle =    "94th Meeting of the Acoustical Society of America",
-  title =        "Perplexity: a measure of difficulty of speech
-                 recognition tasks",
-  address =      "Miami",
-  month =        dec,
-  year =         "1977",
-}
-
-@Article{Bahl83,
-  author =       "L. R. Bahl and F. Jelinek and R. L. Mercer",
-  title =        "A Maximum Likelihood Approach to Continuous Speech
-                 Recognition",
-  journal =      ieeetpami,
-  volume =       "5",
-  number =       "2",
-  pages =        "179--190",
-  month =        mar,
-  year =         "1983",
-}
-
-@InProceedings{Bahl86,
-  author =       "Lalit Bahl and Peter Brown and Peter {de Souza} and Robert Mercer",
-  booktitle =    icassp,
-  title =        "Maximum mutual information estimation of hidden {Markov}
-                 model parameters for speech recognition",
-  address =      "Tokyo, Japan",
-  pages =        "49--52",
-  year =         "1986",
-}
-
-@Article{Bahl87,
-  author =       "L. R. Bahl and P. Brown and P. V. {de Souza} and R. L.
-                 Mercer",
-  title =        "Speech recognition with continuous-parameter hidden
-                 {Markov} models",
-  journal =      "Computer, Speech and Language",
-  volume =       "2",
-  pages =        "219--234",
-  year =         "1987",
-}
-
-@InProceedings{Bahl88,
-  author =       "L. R. Bahl and P. Brown and P. V. de Souza and R. L.
-                 Mercer",
-  booktitle =    icassp,
-  title =        "Speech recognition with continuous-parameter hidden
-                 {Markov} models",
-  address =      "New York, NY",
-  pages =        "40--43",
-  year =         "1988",
-}
-
-@Article{Bailey-Simon-60,
-  author =       "Robert A. Bailey and Leroy Simon",
-  title =        "Two Studies in Automobile Insurance Ratemaking",
-  journal =      "ASTIN Bulletin",
-  volume =       "1",
-  number =       "4",
-  pages =        "192--217",
-  year =         "1960",
-}
-
-@InCollection{Baker75,
-  author =       "J. K. Baker",
-  editor =       "D. R. Reddy",
-  booktitle =    "Speech Recognition",
-  title =        "Stochastic modeling for automatic speech
-                 understanding",
-  publisher =    "Academic Press",
-  address =      "New York",
-  pages =        "521--542",
-  year =         "1975",
-}
-
-@Book{Baker77,
-  author =       "C. T. H. Baker",
-  title =        "The numerical treatment of integral equations",
-  publisher =    "Clarendon Press",
-  address =      "Oxford",
-  year =         "1977",
-}
-
-@InProceedings{Baker98,
-  author =       "D. Baker and A. {McCallum}",
-  booktitle =    "SIGIR'98",
-  title =        "Distributional Clustering of Words for Text
-                 Classification",
-  year =         "1998",
-}
-
-@InProceedings{baker98berkeley,
-  author =       "Collin F. Baker and Charles J. Fillmore and John B.
-                 Lowe",
-  editor =       "Christian Boitet and Pete Whitelock",
-  booktitle =    "Proceedings of the Thirty-Sixth Annual Meeting of the
-                 {Association} for {Computational} {Linguistics} and
-                 Seventeenth International Conference on Computational
-                 Linguistics",
-  title =        "The {Berkeley} {FrameNet} Project",
-  publisher =    "Morgan Kaufmann Publishers",
-  address =      "San Francisco, California",
-  pages =        "86--90",
-  year =         "1998",
-}
-
-@InProceedings{Bakis76,
-  author =       "R. Bakis",
-  booktitle =    "91st Meeting of the Acoustical Society of America",
-  title =        "Continuous Speech Recognition via Centisecond Acoustic
-                 States",
-  month =        apr,
-  year =         "1976",
-}
-
-@Article{bakker03,
-  author =       "Bart Bakker and Tom Heskes",
-  title =        "Task clustering and gating for {Bayesian} multitask
-                 learning",
-  journal =      jmlr,
-  volume =       "4",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA, USA",
-  pages =        "83--99",
-  year =         "2003",
-  ISSN =         "1533-7928",
-}
-
-@Book{Baldi-Brunak-98,
-  author =       "Pierre Baldi and S{\o}ren Brunak",
-  title =        "Bioinformatics, the Machine Learning Approach",
-  publisher =    "MIT Press",
-  year =         "1998",
-}
-
-@Article{Baldi89,
-  author =       "Pierre Baldi and Kurt Hornik",
-  title =        "Neural Networks and Principal Component Analysis:
-                 Learning from Examples Without Local Minima",
-  journal =      nn,
-  volume =       "2",
-  pages =        "53--58",
-  year =         "1989",
-}
-
-@Article{Baldi94,
-  author =       "P. Baldi and Y. Chauvin and T. Hunkapiller and M.
-                 {McClure}",
-  title =        "Hidden {Markov} models of biological primary sequence
-                 information",
-  journal =      "Proc. Nat. Acad. Sci. (USA)",
-  volume =       "91",
-  number =       "3",
-  pages =        "1059--1063",
-  year =         "1994",
-}
-
-@Article{Ballard81,
-  author =       "D. H. Ballard",
-  title =        "Generalizing the Hough Transform to Detect Arbitrary
-                 Shapes",
-  journal =      "Pattern Recognition",
-  volume =       "13",
-  number =       "2",
-  pages =        "111--122",
-  year =         "1981",
-}
-
-@InProceedings{Baluja97,
-  author =       "S. Baluja",
-  editor =       NIPS9ed,
-  booktitle =    NIPS9,
-  title =        "Genetic Algorithms and Explicit Search Statistics",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  pages =        "",
-  year =         "1997",
-}
-
-@Article{Bar-Shalom78,
-  author =       "Y. Bar-Shalom",
-  title =        "Tracking methods in a multi-target environment",
-  journal =      "IEEE Trans. on Aut. Control",
-  volume =       "23",
-  pages =        "618--626",
-  year =         "1978",
-}
-
-@Book{Bar-Shalom93,
-  author =       "Y. Bar-Shalom and {X.-R.} Li",
-  title =        "Estimation and Tracking",
-  publisher =    "Artech House",
-  address =      "Boston, MA",
-  year =         "1993",
-}
-
-@InProceedings{Barber+Williams-nips9,
-  author =       "D. Barber and C. K. I. Williams",
-  editor =       NIPS9ed,
-  booktitle =    NIPS9,
-  title =        "Gaussian Processes for {Bayesian} Classification via
-                 Hybrid Monte Carlo",
-  publisher =    "MIT Press, Cambridge, MA",
-  pages =        "340--346",
-  year =         "1997",
-}
-
-@InProceedings{Bareiss87,
-  author =       "E. R. Bareiss and B. Porter",
-  booktitle =    "Proceedings of the 4th International Workshop on
-                 Machine Learning",
-  title =        "Protos: An Exemplar-Based Learning Apprentice",
-  publisher =    "Morgan Kaufmann",
-  address =      "Irvine, CA",
-  pages =        "12--23",
-  year =         "1987",
-}
-
-@Article{Barhen89,
-  author =       "J. Barhen and S. Gulati and M. Zak",
-  title =        "Neural Learning of Constrained Nonlinear
-                 Transformations",
-  journal =      computer,
-  pages =        "67--76",
-  month =        jun,
-  year =         "1989",
-}
-
-@article{Nykamp+Ringach-2002,
- author = {D. Q. Nykamp and D. L. Ringach},
- title = {Full identification of a linear-nonlinear system via cross-correlation analysis},
- journal = {Journal of Vision}, 
- volume = 2,
- number = 1, 
- pages = {1--11},
- year = 2002,
-}
-
-@article{Wilson+Cowan-72,
- author = {Hugh R. Wilson and Jack D. Cowan},
- title = {Excitatory and inhibitory interactions in localized populations of model neurons},
- journal = {Biophysical Journal},
- volume = 12,
- pages = {1--24},
- year = 1972,
-}
-
-@Article{Barlow89,
-  author =       "H. B. Barlow",
-  title =        "Unsupervised Learning",
-  journal =      nc,
-  volume =       "1",
-  pages =        "295--311",
-  year =         "1989",
-}
-
-@article{Barlow-2001,
-    address = {Cambridge, UK.},
-    author = {H. Barlow},
-    issn = {0954-898X},
-    journal = {Network: Computation in Neural Systems},
-    month = aug,
-    number = {3},
-    pages = {241--253},
-    title = {Redundancy reduction revisited},
-    url = {http://view.ncbi.nlm.nih.gov/pubmed/11563528},
-    volume = {12},
-    year = {2001},
-}
-
-@InProceedings{Barron+Barron88,
-  author =       "A. R. Barron and R. L. Barron",
-  editor =       "E. Wegman",
-  booktitle =    "Computing Science and Statistics, Proc. 20th Symp.
-                 Interface",
-  title =        "Statistical learning networks: {A} unifying view",
-  publisher =    "Amer. Statist. Assoc.",
-  address =      "Washington, DC",
-  pages =        "192--203",
-  year =         "1988",
-}
-
-@InProceedings{Barron89,
-  author =       "A. R. Barron",
-  booktitle =    "Proc. of the 28th conf. on Decision and Control",
-  title =        "Statistical properties of artificial neural networks",
-  address =      "Tampa, Florida",
-  pages =        "280--285",
-  year =         "1989",
-}
-
-@incollection{Barron91,
-  author =       "Andrew R. Barron",
-  title =        "Complexity Regularization with Application to Artificial Neural Networks",
-  booktitle =      "Nonparametric Functional Estimation and Related Topics",
-  pages =        "561--576",
-  editor = "G.~Roussas",
-  year =         "1991",
-  publisher = "Kluwer Academic Publishers"
-}
-
-
-@Article{Bartal95,
-  author =       "Jie Lin and Yair Bartal and Robert E. Uhrig",
-  title =        "Nuclear Power Plant Transient Diagnostics Using
-                 Artificial Neural Networks that Allow ``don't know''
-                 Classifications",
-  journal =      "Nuclear Technology",
-  volume =       "110",
-  pages =        "436--449",
-  month =        jun,
-  year =         "1995",
-}
-
-@Article{Bartlett+Uhrig92,
-  author =       "E. B. Bartlett and R. E. Uhrig",
-  title =        "Nuclear Power Plant Status Diagnostics Using an
-                 Artificial Neural Network",
-  journal =      "Nuclear Technology",
-  volume =       "97",
-  month =        mar,
-  year =         "1992",
-}
-
-@Article{Bartlett46,
-  author =       "M. S. Bartlett",
-  title =        "On the theoretical specification and sampling
-                 properties of autocorrelated time series",
-  journal =      "J. Royal Stat. Soc. B",
-  volume =       "8",
-  pages =        "27--41",
-  year =         "1946",
-}
-
-@Article{Bartlett92,
-  author =       "P. L. Bartlett and T. Downs",
-  title =        "Using Random Weights to train Multilayer Networks of
-                 Hard-Limiting Units",
-  journal =      ieeetrnn,
-  volume =       "3",
-  number =       "2",
-  pages =        "202--210",
-  year =         "1992",
-}
-
-@TechReport{Barto-tr91,
-  author =       "A. G. Barto and S. Bradtke and S. P. Singh",
-  title =        "Real-Time Learning and {Control} Using Asynchronous
-                 Dynamic Programming",
-  number =       "91-57",
-  institution =  "Univ. of Massachusetts (Computer Science)",
-  address =      "Amherst MA",
-  year =         "1991",
-}
-
-@Article{Barto81,
-  author =       "A. G. Barto and R. S. Sutton and P. S. Brouwer",
-  title =        "Associative Search Network: Reinforcement Learning
-                 Associative Memory",
-  journal =      "Biological Cybernetics",
-  volume =       "40",
-  year =         "1981",
-}
-
-@Article{Barto83,
-  author =       "A. G. Barto and R. S. Sutton and C. W. Anderson",
-  title =        "Neuronlike Adaptive Elements That Can Solve Difficult
-                 Learning Control Problems",
-  journal =      ieeesmc,
-  volume =       "13",
-  year =         "1983",
-}
-
-@Article{Barto85,
-  author =       "A. G. Barto and P. Anandan",
-  title =        "Pattern Recognizing Stochastic Learning Automata",
-  journal =      ieeesmc,
-  volume =       "15",
-  pages =        "360--375",
-  year =         "1985",
-}
-
-@InProceedings{Barto87,
-  author =       "A. G. Barto and M. I. Jordan",
-  editor =       "M. Caudill and C. Butler",
-  booktitle =    icnn,
-  title =        "Gradient Following Without Back-Propagation in Layered
-                 Networks",
-  volume =       "2",
-  publisher =    "IEEE, New York",
-  address =      "San Diego 1987",
-  pages =        "629--636",
-  year =         "1987",
-}
-
-@InCollection{Barto91,
-  author =       "A. G. Barto and R. S. Sutton and C. J. C. H. Watkins",
-  editor =       "M. Gabriel and J. W. Moore",
-  booktitle =    "Learning and Computational Neuroscience",
-  title =        "Learning and Sequential Decision Making",
-  publisher =    "MIT Press",
-  address =      "Cambridge",
-  year =         "1991",
-}
-
-@InCollection{Barto92,
-  author =       "A. G. Barto",
-  editor =       "W. T. Miller and R. S. Sutton and P. J. Werbos",
-  booktitle =    "Neural Networks for Control",
-  title =        "Connectionist learning for control: an overview",
-  publisher =    "MIT Press",
-  year =         "1992",
-}
-
-@TechReport{Barto_tr91,
-  author =       "A. G. Barto and S. Bradtke and S. P. Singh",
-  title =        "Real-Time Learning and {Control} Using Asynchronous
-                 Dynamic Programming",
-  number =       "91-57",
-  institution =  "Univ. of Massachusetts (Computer Science)",
-  address =      "Amherst MA",
-  year =         "1991",
-}
-
-@Article{bassiouni95,
-  author =       "M. A. Bassiouni and A. Mukherjee",
-  title =        "Efficient Decoding of Compressed Data",
-  journal =      "Journal of the American Society for Information
-                 Science",
-  volume =       "46",
-  number =       "1",
-  pages =        "1--8",
-  year =         "1995",
-}
-
-@Article{Basu94,
-  author =       "A. Basu and E. B. Bartlett",
-  title =        "Detecting Faults in a Nuclear Power Plant by Using
-                 Dynamic Node Architecture Artificial Neural Networks",
-  journal =      "Nuclear Science and Engineering",
-  volume =       "116",
-  month =        apr,
-  year =         "1994",
-}
-
-@Article{battiti-89,
-  author =       "R. Battiti",
-  title =        "Accelerated Backpropagation Learning: Two Optimization
-                 Methods",
-  journal =      "Complex Systems",
-  volume =       "3",
-  pages =        "331--342",
-  year =         "1989",
-}
-
-@InProceedings{battiti-masulli-90,
-  author =       "R. Battiti and F. Masulli",
-  booktitle =    "Proceedings of International Neural Network Conference
-                 (INNC 90, Paris)",
-  title =        "{BFGS} optimization for faster and automated
-                 supervised learning",
-  pages =        "757--760",
-  year =         "1990",
-}
-
-@Article{Battiti92,
-  author =       "R. Battiti",
-  title =        "First- and Second-Order Methods for Learning: Between
-                 Steepest Descent and {Newton}'s Method",
-  journal =      "Neural Computation",
-  volume =       "4",
-  type =         "Review",
-  number =       "2",
-  pages =        "141--166",
-  year =         "1992",
-}
-
-@Article{battiti:1994:ieeetnn,
-  author =       "R. Battiti",
-  title =        "Using Mutual Information for Selecting Features in
-                 Supervised Neural Net Learning",
-  journal =      "{IEEE} Transactions on Neural Networks",
-  volume =       "5",
-  number =       "4",
-  pages =        "537--550",
-  year =         "1994",
-}
-
-@article{Baudat+Anouar-2000,
-    author = {G. Baudat and F. Anouar},
-    title = {Generalized Discriminant Analysis Using a Kernel Approach},
-    journal = {Neural Computation},
-    volume = {12},
-    number = {10},
-    year = {2000},
-    issn = {0899-7667},
-    pages = {2385--2404},
-    doi = {10.1162/089976600300014980},
-    publisher = {MIT Press},
-    address = {Cambridge, MA, USA},
-}
-
-@Article{Baum66,
-  author =       "L. E. Baum and T. Petrie",
-  title =        "Statistical Inference for Probabilistic Functions of
-                 Finite State {Markov} Chains",
-  journal =      "Ann. Math. Stat.",
-  volume =       "37",
-  pages =        "1554--1563",
-  year =         "1966",
-}
-
-@Article{Baum67,
-  author =       "L. E. Baum and J. Eagon",
-  title =        "An inequality with applications to statistical
-                 prediction for functions of {Markov} processes and to a
-                 model of ecology",
-  journal =      "Bull. Amer. Math. Soc.",
-  volume =       "73",
-  pages =        "360--363",
-  year =         "1967",
-}
-
-@Article{Baum70,
-  author =       "L. E. Baum and T. Petrie and G. Soules and N. Weiss",
-  title =        "A maximization technique occurring in the statistical
-                 analysis of probabilistic functions of {Markov}
-                 chains",
-  journal =      "Ann. Math. Stat.",
-  volume =       "41",
-  pages =        "164--171",
-  year =         "1970",
-}
-
-@Article{Baum72,
-  author =       "L. E. Baum",
-  title =        "An inequality and associated maximization technique in
-                 statistical estimation for probabilistic functions of a
-                 {Markov} process",
-  journal =      "Inequalities",
-  volume =       "3",
-  pages =        "1--8",
-  year =         "1972",
-}
-
-@InProceedings{Baum86,
-  author =       "E. B. Baum",
-  editor =       "J. S. Denker",
-  booktitle =    snowbird,
-  title =        "Towards Practical ``Neural'' Computation for
-                 Combinatorial Optimization Problems",
-  publisher =    "American Institute of Physics, New York",
-  address =      "Snowbird 1986",
-  pages =        "53--58",
-  year =         "1986",
-}
-
-@InProceedings{Baum88,
-  author =       "E. B. Baum and F. Wilczek",
-  editor =       nips87ed,
-  booktitle =    nips87,
-  title =        "Supervised Learning of Probability Distributions by
-                 Neural Networks",
-  publisher =    "American Institute of Physics, New York",
-  address =      "Denver, CO",
-  pages =        "52--61",
-  year =         "1988",
-}
-
-@Article{Baum89,
-  author =       "E. B. Baum and D. Haussler",
-  title =        "What Size Net Gives Valid Generalization?",
-  journal =      nc,
-  volume =       "1",
-  pages =        "151--160",
-  year =         "1989",
-}
-
-@Article{BaumNote,
-  author =       "E. B. Baum",
-  title =        "Review of {J}. {S}. {Judd}'s book {\em {Neural}
-                 {Network} {Design} and the {Complexity} of
-                 {Learning}}",
-  journal =      ieeetrnn,
-  volume =       "2",
-  number =       "1",
-  pages =        "181--182",
-  year =         "1991",
-}
-
-@Article{baxter00,
-  author =       "Jonathan Baxter",
-  title =        "A Model of Inductive Bias Learning",
-  journal =      "J. Artif. Intell. Res. (JAIR)",
-  volume =       "12",
-  pages =        "149--198",
-  year =         "2000",
-}
-
-@InProceedings{baxter95a,
-  author =       "Jonathan Baxter",
-  booktitle =    colt95,
-  title =        "Learning Internal Representations",
-  publisher =    "ACM Press",
-  address =      "Santa Cruz, California",
-  pages =        "311--320",
-  year =         "1995",
-  url =          "http://citeseer.ist.psu.edu/baxter95learning.html",
-}
-
-@Unpublished{baxter95b,
-  author =       "Jonathan Baxter",
-  title =        "The Canonical Metric for Vector Quantization",
-  year =         "1995",
-  note =         "submitted to Information and Computation",
-}
-
-@InProceedings{baxter96,
-  author =       "Jonathan Baxter",
-  editor =       NIPS8ed,
-  booktitle =    NIPS8,
-  title =        "Learning Model Bias",
-  volume =       "8",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  pages =        "169--175",
-  year =         "1996",
-}
-
-@Article{baxter97,
-  author =       "Jonathan Baxter",
-  title =        "A {Bayesian}/information theoretic model of learning to
-                 learn via multiple task sampling",
-  journal =      "Machine Learning",
-  volume =       "28",
-  pages =        "7--40",
-  year =         "1997",
-}
-
-@Article{baxter97a,
-  author =       "Jonathan Baxter",
-  title =        "A {Bayesian}/Information theoretic model of learning to
-                 learn via multiple task sampling",
-  journal =      "Machine Learning",
-  volume =       "28",
-  pages =        "7--40",
-  year =         "1997",
-}
-
-@InProceedings{Becker89,
-  author =       "S. Becker and Y. {LeCun}",
-  editor =       "D. Touretzky and G. Hinton and T. Sejnowski",
-  booktitle =    cmss88,
-  title =        "Improving the Convergence of Back-Propagation Learning
-                 with Second Order Methods",
-  publisher =    "Morgan Kaufmann, San Mateo",
-  address =      "Pittsburgh 1988",
-  pages =        "29--37",
-  year =         "1989",
-}
-
-@InProceedings{Belkin+al-2004,
-  author =       "Mikhail Belkin and Irina Matveeva and Partha Niyogi",
-  editor =       "John Shawe-Taylor and Yoram Singer",
-  booktitle =    colt04,
-  title =        "Regularization and Semi-supervised Learning on Large
-                 Graphs",
-  publisher =    "Springer",
-  pages =        "624--638",
-  year =         "2004",
-}
-
-@InProceedings{Belkin+Niyogi-2002,
-  author =       "Mikhail Belkin and Partha Niyogi",
-  editor =       NIPS14ed,
-  booktitle =    NIPS14,
-  title =        "Laplacian Eigenmaps and Spectral Techniques for
-                 Embedding and Clustering",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  year =         "2002",
-  original =     "orig/AA42.ps",
-}
-
-@TechReport{Belkin+Niyogi-2002-01,
-  author =       "Mikhail Belkin and Partha Niyogi",
-  title =        "Laplacian Eigenmaps for Dimensionality Reduction and
-                 Data Representation",
-  number =       "TR-2002-01",
-  institution =  "University of Chicago, Computer Science",
-  year =         "2002",
-}
-
-@TechReport{Belkin+Niyogi-2002-ss,
-  author =       "Mikhail Belkin and Partha Niyogi",
-  title =        "Semi-supervised learning on manifolds",
-  number =       "TR-2002-12",
-  institution =  "University of Chicago, Computer Science",
-  year =         "2002",
-}
-
-@Article{Belkin+Niyogi-2003,
-  author =       "Mikhail Belkin and Partha Niyogi",
-  title =        "Laplacian Eigenmaps for Dimensionality Reduction and
-                 Data Representation",
-  journal =      "Neural Computation",
-  volume =       "15",
-  number =       "6",
-  pages =        "1373--1396",
-  year =         "2003",
-}
-
-@InProceedings{Belkin+Niyogi-nips2003,
-  author =       "Mikhail Belkin and Partha Niyogi",
-  editor =       NIPS15ed,
-  booktitle =    NIPS15,
-  title =        "Using Manifold Structure for Partially Labeled
-                 Classification",
-  publisher =    "{MIT} Press",
-  address =      "Cambridge, MA",
-  year =         "2003",
-}
-
-@article{BelkinM2006,
-	address = {Cambridge, MA, USA},
-	author = {Belkin, Mikhail   and Niyogi, Partha   and Sindhwani, Vikas  },
-	issn = {1533-7928},
-	journal = jmlr,
-	pages = {2399--2434},
-	publisher = {MIT Press},
-	title = {Manifold Regularization: A Geometric Framework for Learning from Labeled and Unlabeled Examples},
-	volume = {7},
-	year = {2006}
-}
-
-@Article{Bell-Sejnowski95,
-  author =       "Anthony J. Bell and Terrence J. Sejnowski",
-  title =        "An information maximisation approach to blind
-                 separation and blind deconvolution",
-  journal =      "Neural Computation",
-  volume =       "7",
-  number =       "6",
-  pages =        "1129--1159",
-  year =         "1995",
-}
-
-@InProceedings{Bellagarda+Nahamoo89,
-  author =       "J. R. Bellegarda and D. Nahamoo",
-  booktitle =    icassp,
-  title =        "Tied Mixture Continuous Parameter Models for Large
-                 Vocabulary Isolated Speech Recognition",
-  address =      "Glasgow, Scotland",
-  pages =        "13--16",
-  year =         "1989",
-}
-
-@InProceedings{Bellegarda97,
-  author =       "J. R. Bellegarda",
-  booktitle =    "Proceedings of Eurospeech 97",
-  title =        "A latent semantic analysis framework for large-span
-                 language modeling",
-  address =      "Rhodes, Greece",
-  pages =        "1451--1454",
-  year =         "1997",
-}
-
-@Book{Bellman57,
-  author =       "R. E. Bellman",
-  title =        "Dynamic Programming",
-  publisher =    "Princeton University Press",
-  address =      "NJ",
-  year =         "1957",
-}
-
-@Book{Bellman61,
-  author =       "R. Bellman",
-  title =        "Adaptive Control Processes: {A} Guided Tour",
-  publisher =    "Princeton University Press",
-  address =      "New Jersey",
-  year =         "1961",
-}
-
-@Book{Bellman74,
-  author =       "R. Bellman",
-  title =        "Introduction to Matrix Analysis",
-  publisher =    "McGraw-Hill",
-  address =      "New York, NY",
-  edition =      "2nd",
-  year =         "1974",
-}
-
-@InProceedings{ben-david03,
-  author =       "Shai Ben-David and Reba Schuller",
-  booktitle =    colt03,
-  title =        "Exploiting Task Relatedness for Multiple Task
-                 Learning",
-  crossref =     "colt03",
-  pages =        "567--580",
-  year =         "2003",
-}
-
-@InProceedings{BenDucVin01,
-  author =       "Yoshua Bengio and R\'ejean Ducharme and Pascal
-                 Vincent",
-  editor =       NIPS13ed,
-  booktitle =    NIPS13,
-  title =        "A Neural Probabilistic Language Model",
-  publisher =    "MIT Press",
-  pages =        "932--938",
-  year =         "2001",
-}
-
-@InProceedings{BenDucVin01-small,
-  author =       "Yoshua Bengio and R\'ejean Ducharme and Pascal
-                 Vincent",
-  editor =       "Todd K. Leen and Thomas G. Dietterich and Volker
-                 Tresp",
-  booktitle =    "Advances in NIPS 13",
-  title =        "A Neural Probabilistic Language Model",
-  publisher =    "MIT Press",
-  pages =        "932--938",
-  year =         "2001",
-}
-
-@InProceedings{BenDucVin01-short,
-  author =       "Y. Bengio and R. Ducharme and P. Vincent",
-  booktitle =    "Adv. Neural Inf. Proc. Sys. 13",
-  title =        "A Neural Probabilistic Language Model",
-  pages =        "932--938",
-  year =         "2001",
-}
-
-@TechReport{Bengio+al-2004,
-  author =       "Yoshua Bengio and Olivier Delalleau and Nicolas {Le Roux}",
-  title =        "Efficient Non-Parametric Function Induction in
-                 Semi-Supervised Learning",
-  number =       "1247",
-  institution =  "D\'epartement d'informatique et recherche
-                 op\'erationnelle, Universit\'e de Montr\'eal",
-  year =         "2004",
-}
-
-@InCollection{Bengio+al-2005,
-  author =       "Yoshua Bengio and Nicolas {Le Roux} and Pascal Vincent and
-                 Olivier Delalleau and Patrice Marcotte",
-  editor =       NIPS18ed,
-  booktitle =    NIPS18,
-  title =        "Convex Neural Networks",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  pages =        "123--130",
-  year =         "2006",
-}
-
-@InCollection{Bengio+al-2005-small,
-  author =       "Yoshua Bengio and Nicolas {Le Roux} and Pascal Vincent
-                 and Olivier Delalleau and Patrice Marcotte",
-  booktitle =    "NIPS 18",
-  title =        "Convex Neural Networks",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  pages =        "123--130",
-  year =         "2006",
-}
-
-@InCollection{Bengio+al-spectral-2006-short,
-  author =       "Yoshua Bengio and Olivier Delalleau and Nicolas {Le
-                 Roux} and Jean-Francois Paiement and Pascal Vincent
-                 and Marie Ouimet",
-  editor =       "Isabelle Guyon and Steve Gunn and Masoud Nikravesh and
-                 Lotfi Zadeh",
-  booktitle =    "Feature Extraction, Foundations and Applications",
-  title =        "Spectral Dimensionality Reduction",
-  publisher =    "Springer",
-  year =         "2006",
-}
-
-@InProceedings{Bengio+Bengio-NIPS99,
-  author =       "Yoshua Bengio and Samy Bengio",
-  editor =       NIPS12ed,
-  booktitle =    NIPS12,
-  title =        "Modeling High-Dimensional Discrete Data with
-                 Multi-Layer Neural Networks",
-  publisher =    "MIT Press",
-  pages =        "400--406",
-   year =         "2000",
-}
-
-@Article{Bengio+Bengio-trnn2000,
-  author =       "S. Bengio and Y. Bengio",
-  title =        "Taking on the Curse of Dimensionality in Joint
-                 Distributions Using Neural Networks",
-  journal =      "IEEE Transactions on Neural Networks, special issue on
-                 Data Mining and Knowledge Discovery",
-  volume =       "11",
-  number =       "3",
-  pages =        "550--557",
-  year =         "2000",
-  url =          "http://www.iro.umontreal.ca/~lisa/pointeurs/jdm.pdf",
-}
-
-@Article{Bengio+Bengio-trnn2000-small,
-  author =       "S. Bengio and Y. Bengio",
-  title =        "Taking on the Curse of Dimensionality in Joint
-                 Distributions Using Neural Networks",
-  journal =      "IEEE Trans. Neural Networks",
-  volume =       "11",
-  number =       "3",
-  pages =        "550--557",
-  year =         "2000",
-  url =          "http://www.iro.umontreal.ca/~lisa/pointeurs/jdm.pdf",
-}
-
-@Article{Bengio+Chapados2003,
-  author =       "Yoshua Bengio and Nicolas Chapados",
-  title =        "Extensions to Metric-Based Model Selection",
-  journal =      jmlr,
-  volume =       "3",
-  pages =        "1209--1227",
-  month =        mar,
-  year =         "2003",
-  note =         "Special Issue on Feature Selection",
-}
-
-@TechReport{Bergstra-TR2008,
-  author =       "James Bergstra and Yoshua Bengio and Jerome Louradour",
-  title =        "Image Classification with Biologically Motivated Neuron Models",
-  number =       "---",
-  institution =  "Dept. IRO, Universit\'e de Montr\'eal",
-  year =         "2008",
-}
-
-@article{Bergstra-2009,
-  author =       "James Bergstra and Yoshua Bengio and Jerome Louradour",
-  title =        "Suitability of Complex Cell Models for Object Categorization",
-  journal = {Computational Neuroscience},
-  publisher = "submitted",
-  year = 2008,
-}
-
-@TechReport{Bengio+Frasconi94a,
-  author =       "Y. Bengio and P. Frasconi",
-  title =        "An {EM} Approach to Learning Sequential Behavior",
-  number =       "Tech. Report. DSI 11/94",
-  institution =  "Universit\`a di Firenze",
-  year =         "1994",
-}
-
-@article{Bengio-nc-2004,
- author = {Yoshua Bengio and Olivier Delalleau and Nicolas Le Roux and Jean-François Paiement and Pascal Vincent and Marie Ouimet},
- title = {Learning eigenfunctions links spectral embedding and kernel {PCA}},
- journal = {Neural Computation},
- volume = 16,
- number = 10,
- year = 2004,
- pages = {2197--2219},
-}
-
-@article{Bengio-nc-2004-small,
- author = {Yoshua Bengio and Olivier Delalleau and Nicolas Le Roux and Jean-François Paiement and Pascal Vincent and Marie Ouimet},
- title = {{\small{Learning eigenfunctions links spectral embedding and kernel {PCA}}}},
- journal = {Neural Comp.},
- volume = {16(10)},
- year = 2004,
- pages = {2197--2219},
-}
-
-@Article{Bengio+Grandvalet-JMLR-2004,
-  author =       "Yoshua Bengio and Yves Grandvalet",
-  title =        "No Unbiased Estimator of the Variance of {K}-Fold
-                 Cross-Validation",
-  journal =      jmlr,
-  volume =       "5",
-  pages =        "1089--1105",
-  year =         "2004",
-}
-
-@TechReport{Bengio+Grandvalet-TR-2003,
-  author =       "Yoshua Bengio and Yves Grandvalet",
-  title =        "No Unbiased Estimator of the Variance of {K}-Fold
-                 Cross-Validation",
-  number =       "TR-2003-1234",
-  institution =  "Universite de Montreal, dept. IRO",
-  year =         "2003",
-}
-
-@InCollection{Bengio+Lecun-chapter2007,
-  author =       "Yoshua Bengio and Yann {LeCun}",
-  editor =       "L. Bottou and O. Chapelle and D. DeCoste and J.
-                 Weston",
-  booktitle =    "Large Scale Kernel Machines",
-  title =        "Scaling Learning Algorithms towards {AI}",
-  publisher =    "MIT Press",
-  year =         "2007",
-}
-
-@InCollection{Bengio+Lecun-chapter2007-small,
-  author =       "Y. Bengio and Y. {LeCun}",
-  booktitle =    "Large Scale Kernel Machines",
-  title =        "Scaling Learning Algorithms towards {AI}",
-  year =         "2007",
-}
-
-@InProceedings{Bengio+LeCun94b,
-  author =       "Yoshua Bengio and Yann {LeCun}",
-  booktitle =    ICPR94,
-  title =        "Word Normalization For On-Line Handwritten Word
-                 Recognition",
-  pages =        "409--413",
-  year =         "1994",
-}
-
-@Article{Bengio+Monperrus+Larochelle-2006,
-  author =       "Yoshua Bengio and Martin Monperrus and Hugo
-                 Larochelle",
-  title =        "Nonlocal Estimation of Manifold Structure",
-  journal =      "Neural Computation",
-  volume =       "18",
-  number =       "10",
-  pages =        "2509--2528",
-  year =         "2006",
-}
-
-@InProceedings{Bengio+Monperrus-2005,
-  author =       "Yoshua Bengio and Martin Monperrus",
-  editor =       NIPS17ed,
-  booktitle =    NIPS17,
-  title =        "Non-Local Manifold Tangent Learning",
-  publisher =    "{MIT} Press",
-  year =         "2005",
-  pages =        "129--136",
-  url =          "http://www.iro.umontreal.ca/~lisa/pointeurs/tangent\_learner\_nips2004.pdf",
-}
-
-@InProceedings{Bengio+Senecal-2003-small,
-  author =       "Yoshua Bengio and Jean-S\'ebastien Sen\'ecal",
-  booktitle =    "Proceedings of AISTATS 2003",
-  title =        "Quick Training of Probabilistic Neural Nets by
-                 Importance Sampling",
-  year =         "2003",
-}
-
-@TechReport{Bengio+Vincent+Paiement-TR2003,
-  author =       "Yoshua Bengio and Pascal Vincent and Jean-Fran{\cc}ois
-                 Paiement",
-  title =        "Learning Eigenfunctions of Similarity: Linking
-                 Spectral Clustering and Kernel {PCA}",
-  number =       "1232",
-  institution =  "D\'epartement d'informatique et recherche
-                 op\'erationnelle, Universit\'e de Montr\'eal",
-  year =         "2003",
-  URL =          "www.iro.umontreal.ca/~lisa/pointeurs/TR1232.pdf",
-}
-
-@TechReport{Bengio-decision-trees-TR-2007,
-  author =       "Yoshua Bengio and Olivier Delalleau and Clarence
-                 Simard",
-  title =        "Trees do not Generalize to New Variations",
-  number =       "",
-  institution =  "D\'epartement d'informatique et recherche
-                 op\'erationnelle, Universit\'e de Montr\'eal",
-  year =         "2007",
-}
-
-@TechReport{Bengio-decision-trees07,
-  author =       "Yoshua Bengio and Olivier Delalleau and Clarence
-                 Simard",
-  title =        "Decision Trees do not Generalize to New Variations",
-  number =       "1304",
-  institution =  "Universite de Montreal, Dept. IRO",
-  year =         "2007",
-  url =          "http://www.iro.umontreal.ca/~lisa/pointeurs/bengio+al-tr1304.pdf",
-}
-
-%I deprecate the following one as this is a duplicate of the preceding tech report!
-%Their was only one .tex file that was using it. I modified it.
-@TechReport{Bengio-Trees-TR2007,
-  author =       "Yoshua Bengio and Olivier Delalleau and Clarence
-                 Simard",
-  title =        "Decision Trees do not Generalize to New Variations",
-  number =       "1304",
-  institution =  "Dept. IRO, Universit\'e de Montr\'eal",
-  year =         "2007",
-  url =          "http://www.iro.umontreal.ca/~lisa/pointeurs/bengio+al-tr1304.pdf",
-}
-
-@Article{Bengio-hmms99,
-  author =       "Yoshua Bengio",
-  title =        "Markovian Models for Sequential Data",
-  journal =      "Neural Computing Surveys",
-  volume =       "2",
-  pages =        "129--162",
-  year =         "1999",
-}
-
-@Article{bengio-hyper-NC00,
-  author =       "Yoshua Bengio",
-  title =        "Gradient-Based Optimization of Hyperparameters",
-  journal =      "Neural Computation",
-  volume =       "12",
-  number =       "8",
-  pages =        "1889--1900",
-  year =         "2000",
-}
-
-@TechReport{bengio-hyper-TR98,
-  author =       "Yoshua Bengio",
-  title =        "Continuous Optimization of Hyper-Parameters for
-                 Non-{IID} Data",
-  institution =  "D\'epartement d'informatique et recherche
-                 op\'erationnelle, Universit\'e de Montr\'eal",
-  year =         "1998",
-  note =         "unpublished manuscript",
-}
-
-@Article{Bengio-Hyper-Weight-Decay-nips,
-  author =       "Simon Latendresse and Yoshua Bengio",
-  title =        "Linear Regression and the Optimization of
-                 Hyper-Parameters",
-  journal =      "submitted to NIPS'99",
-  year =         "1999",
-}
-
-@TechReport{Bengio-Hyper-Weight-Decay-TR,
-  author =       "Yoshua Bengio and Simon Latendresse",
-  title =        "Soft Variable Selection with Numerical Optimization of
-                 Weight Decays",
-  institution =  "D\'epartement d'informatique et recherche
-                 op\'erationnelle, Universit\'e de Montr\'eal",
-  year =         "1999",
-  note =         "in preparation",
-}
-
-@Article{Bengio-ijns97,
-  author =       "Yoshua Bengio",
-  title =        "Using a Financial Training Criterion Rather than a
-                 Prediction Criterion",
-  journal =      "International Journal of Neural Systems",
-  year =         "1997",
-  volume =       {8},
-  number =       {4},
-  note =         "Special issue on noisy time-series",
-  pages =        {433--443},
-  URL =          "www.iro.umontreal.ca/~lisa/pointeurs/profitcost.ps",
-}
-
-@Article{Bengio-IEEETRNN-2001,
-  author =       "Yoshua Bengio and Vincent-Philippe Lauzon and R\'ejean
-                 Ducharme",
-  title =        "Experiments on the Application of {IOHMM}s to Model
-                 Financial Returns Series",
-  journal =      ieeetrnn,
-  volume = 12,
-  number = 1,
-  pages = {113--123},
-  year =         "2001",
-}
-
-@InProceedings{Bengio-Larochelle-NLMP-NIPS-2006,
-  author =       "Yoshua Bengio and Hugo Larochelle and Pascal Vincent",
-  editor =       NIPS18ed,
-  booktitle =    NIPS18,
-  title =        "Non-Local Manifold Parzen Windows",
-  publisher =    "MIT Press",
-  pages =        "115--122",
-  year =         "2006",
-}
-
-@TechReport{Bengio-Larochelle-NLMP-TR-2005,
-  author =       "Yoshua Bengio and Hugo Larochelle",
-  title =        "Non-Local Manifold Parzen Windows",
-  number =       "1264",
-  institution =  "D\'epartement d'informatique et recherche
-                 op\'erationnelle, Universit\'e de Montr\'eal",
-  year =         "2005",
-}
-
-%have been rejected and later accepted to NIPS in Bengio-localfailure-NIPS-2006
-@InProceedings{Bengio-localfailure-icml-2005,
-  author =       "Yoshua Bengio and Olivier Delalleau and Nicolas {Le
-                 Roux}",
-  booktitle =    "submitted to ICML 2005",
-  title =        "The Curse of Dimensionality for Local Kernel
-                 Machines",
-  year =         "2005",
-}
-
-@InCollection{Bengio-localfailure-NIPS-2006,
-  author =       "Yoshua Bengio and Olivier Delalleau and Nicolas {Le Roux}",
-  editor =       NIPS18ed,
-  booktitle =    NIPS18,
-  title =        "The Curse of Highly Variable Functions for Local
-                 Kernel Machines",
-  publisher =    "{MIT} Press",
-  address =      "Cambridge, MA",
-  pages =        "107--114",
-  year =         "2006",
-}
-
-@InCollection{Bengio-localfailure-NIPS-2006-small,
-  author =       "Yoshua Bengio and Olivier Delalleau and Nicolas {Le Roux}",
-  booktitle =    "NIPS 18",
-  title =        "The Curse of Highly Variable Functions for Local
-                 Kernel Machines",
-  publisher =    "{MIT} Press",
-  address =      "Cambridge, MA",
-  pages =        "107--114",
-  year =         "2006",
-}
-
-@InProceedings{Bengio-localfailure-snowbird-2005,
-  author =       "Yoshua Bengio and Olivier Delalleau and Nicolas {Le
-                 Roux}",
-  booktitle =    "The Learning Workshop",
-  title =        "The Curse of Dimensionality for Local Kernel
-                 Machines",
-  address =      "Snowbird, Utah",
-  year =         "2005",
-}
-
-@InProceedings{HonglakLee-2007,
-  author =       "Honglak Lee and Alexis Battle and Rajat Raina and Andrew Ng",
-  editor =       NIPS19ed,
-  booktitle =    NIPS19,
-  title =        "Efficient sparse coding algorithms",
-  publisher =    "MIT Press",
-  pages =        "801--808",
-  year =         "2007",
-}
-
-@InProceedings{Bengio-nips-2006-small,
-  author =       "Y. Bengio and P. Lamblin and D. Popovici and
-                 H. Larochelle",
-  booktitle =    "Advances in NIPS 19",
-  title =        "Greedy Layer-Wise Training of Deep Networks",
-  year =         "2007",
-}
-
-@InProceedings{Bengio-nips-2006-short,
-  author =       "Y. Bengio and P. Lamblin and D. Popovici and
-                 H. Larochelle",
-  booktitle =    "Adv. Neural Inf. Proc. Sys. 19",
-  title =        "Greedy Layer-Wise Training of Deep Networks",
-  pages =        "153--160",
-  year =         "2007",
-}
-
-@InProceedings{Bengio-nips2004,
-  author =       "Yoshua Bengio and Jean-Fran\c{cois} Paiement and Pascal
-                 Vincent and Olivier Delalleau and Nicolas {Le Roux} and
-                 Marie Ouimet",
-  editor =       NIPS16ed,
-  booktitle =    NIPS16,
-  title =        "Out-of-Sample Extensions for {LLE}, {Isomap}, {MDS},
-                 {Eigenmaps}, and {Spectral} {Clustering}",
-  publisher =    "MIT Press",
-  year =         "2004",
-}
-
-@InProceedings{Bengio-nips2003,
-  author =       "Yoshua Bengio and Jean-Fran\c{cois} Paiement and Pascal
-                 Vincent and Olivier Delalleau and Nicolas {Le Roux} and
-                 Marie Ouimet",
-  editor =       NIPS16ed,
-  booktitle =    NIPS16,
-  title =        "Out-of-Sample Extensions for {LLE}, {Isomap}, {MDS},
-                 {Eigenmaps}, and {Spectral} {Clustering}",
-  publisher =    "MIT Press",
-  year =         "2004",
-}
-
-@InCollection{Bengio-NIPS2007,
-  author =       "Yoshua Bengio and Pascal Lamblin and Dan Popovici and
-                 Hugo Larochelle",
-  editor =       NIPS19ed,
-  booktitle =    NIPS19,
-  title =        "Greedy Layer-Wise Training of Deep Networks",
-  publisher =    "MIT Press",
-  pages =        "153--160",
-  year =         "2007",
-}
-
-@InProceedings{Bengio-nnlm2001,
-  author =       "Yoshua Bengio and R{\'e}jean Ducharme and Pascal Vincent",
-  editor =       NIPS13ed,
-  booktitle =    NIPS13,
-  title =        "A Neural Probabilistic Language Model",
-  publisher =    "{MIT} Press",
-  pages =        "933--938",
-  year =         "2001",
-  url =          "http://www.iro.umontreal.ca/~lisa/pointeurs/nips00-lm.ps",
-}
-
-@Article{Bengio-nnlm2003,
-  author =       "Yoshua Bengio and R{\'e}jean Ducharme and Pascal Vincent
-                 and Christian Jauvin",
-  title =        "A Neural Probabilistic Language Model",
-  journal =      jmlr,
-  volume =       "3",
-  pages =        "1137--1155",
-  year =         "2003",
-}
-
-@Article{Bengio-nnlm2003-small,
-  author =       "Y. Bengio and R. Ducharme and P. Vincent
-                 and C. Jauvin",
-  title =        "A Neural Probabilistic Language Model",
-  journal =      "JMLR",
-  volume =       "3",
-  pages =        "1137--1155",
-  year =         "2003",
-}
-
-@Article{Bengio-NonStat-Hyper-ML,
-  author =       "Yoshua Bengio and Charles Dugas",
-  title =        "Learning Simple Non-Stationarities with
-                 Hyper-Parameters",
-  journal =      "submitted to Machine Learning",
-  year =         "1999",
-}
-
-@Article{Bengio-prel92,
-  author =       "Y. Bengio and M. Gori and R. \mbox{De Mori}",
-  title =        "Learning the Dynamic Nature of Speech with
-                 Back-propagation for Sequences",
-  journal =      prel,
-  volume =       "13",
-  number =       "5",
-  pages =        "375--385",
-  year =         "1992",
-  note =         "(Special issue on Artificial Neural Networks)",
-}
-
-@Article{Bengio-2008,
-  author =       "Yoshua Bengio",
-  title =        "Learning Deep Architectures for {AI}",
-  journal =  {Foundations and Trends in Machine Learning},
-  year =         "2009",
-  volume = {to appear},
-}
-
-@Article{Bengio-2009-short,
-  author =       "Y. Bengio",
-  title =        "Learning Deep Architectures for {AI}",
-  journal =  {Foundations \& Trends in Mach. Learn.},
-  year =         "2009",
-  volume = 2,
-  number = 1,
-  pages = {1--127},
-}
-
-@TechReport{Bengio-TR1312-small,
-  author =       "Yoshua Bengio",
-  title =        "Learning Deep Architectures for {AI}",
-  number =       "1312",
-  institution =  "U. Montr\'eal, dept. IRO",
-  year =         "2007",
-}
-
-@InProceedings{Bengio-transducers-98,
-  author =       "Y. Bengio and S. Bengio and J. F. Isabelle and Y.
-                 Singer",
-  editor =       NIPS10ed,
-  booktitle =    NIPS10,
-  title =        "Shared Context Probabilistic Transducers",
-  publisher =    "MIT Press",
-  pages =        "409--415",
-  year =         "1998",
-}
-
-@Article{Bengio-trnn92,
-  author =       "Y. Bengio and R. \mbox{De Mori} and G. Flammia and R.
-                 Kompe",
-  title =        "Global Optimization of a Neural Network-Hidden
-                 {Markov} Model Hybrid",
-  journal =      ieeetrnn,
-  volume =       "3",
-  number =       "2",
-  pages =        "252--259",
-  year =         "1992",
-}
-
-@Article{Bengio-trnn93,
-  author =       "Y. Bengio and P. Simard and P. Frasconi",
-  title =        "Learning Long-Term Dependencies with Gradient Descent
-                 is Difficult",
-  journal =      ieeetrnn,
-  volume =       "5",
-  number =       "2",
-  pages =        "157--166",
-  year =         "1994",
-  OPTnote =      "(Special Issue on Recurrent Neural Networks)",
-  url =          "http://www.iro.umontreal.ca/~lisa/pointeurs/ieeetrnn94.pdf",
-}
-
-@Article{Bengio-trnn96,
-  author =       "Y. Bengio and P. Frasconi",
-  title =        "Input/{Output} {HMM}s for Sequence Processing",
-  journal =      "IEEE Transactions on Neural Networks",
-  volume =       "7",
-  number =       "5",
-  pages =        "1231--1249",
-  year =         "1996",
-}
-
-@TechReport{Bengio2003,
-  author =       "Christopher Kermorvant and Yoshua Bengio",
-  title =        "Extracting Hidden Sense Probabilities from Bitexts",
-  number =       "1231",
-  institution =  "Université de Montréal",
-  year =         "2003",
-}
-
-@InProceedings{Bengio89b,
-  author =       "Y. Bengio and P. Cosi and R. Cardin and R. De Mori",
-  editor =       NIPS1ed,
-  booktitle =    NIPS1,
-  title =        "Use of multi-layered networks for coding speech with
-                 phonetic features",
-  publisher =    "Morgan Kaufmann, San Mateo",
-  address =      "Denver, CO",
-  pages =        "224--231",
-  year =         "1989",
-}
-
-@PhdThesis{Bengio91,
-  author =       "Yoshua Bengio",
-  title =        "Artificial Neural Networks and their Application to
-                 Sequence Recognition",
-  school =       "McGill University, (Computer Science)",
-  address =      "Montreal, Qc., Canada",
-  year =         "1991",
-}
-
-@InProceedings{bengio91x,
-  author =       "Y. Bengio and R. {De Mori} and G. Flammia and R.
-                 Kompe",
-  booktitle =    ijcnn,
-  title =        "Global Optimization of a Neural Network - Hidden
-                 Markov Model Hybrid",
-  volume =       "2",
-  pages =        "789--794",
-  year =         "1991",
-  OPTaddress =   "Seattle WA",
-}
-
-@article{Becker92,
- author = {Sue Becker and Geoffrey Hinton},
- title =        {A self-organizing neural network that discovers surfaces in random-dot stereograms},
- journal = {Nature},
- volume = 355,
- pages = {161--163},
- year = 1992,
-}
- 
-@Article{Bengio93,
-  author =       "Yoshua Bengio",
-  title =        "A Connectionist Approach to Speech Recognition",
-  journal =      "International Journal on Pattern Recognition and
-                 Artificial Intelligence",
-  volume =       "7",
-  number =       "4",
-  pages =        "647--668",
-  note =         "special issue entitled Advances in Pattern Recognition Systems using Neural Networks",
-  year =         "1993",
-}
-
-@InProceedings{Bengio93e,
-  author =       "S. Bengio and Y. Bengio and J. Cloutier and J.
-                 Gecsei",
-  editor =       "S. Gielen and B. Kappen",
-  booktitle =    "Proceedings of the International Conference on
-                 Artificial Neural Networks 1993",
-  title =        "Generalization of a Parametric Learning Rule",
-  publisher =    "Springer-Verlag",
-  address =      "Amsterdam, The Netherlands",
-  pages =        "502--502",
-  year =         "1993",
-}
-
-@Article{bengio:1999:nc,
-  author =       "S. Bengio and Y. Bengio and J. Robert and G.
-                 B\'elanger",
-  title =        "Stochastic Learning of Strategic Equilibria for
-                 Auctions",
-  journal =      "Neural Computation",
-  volume =       "11",
-  number =       "5",
-  pages =        "1199--1209",
-  year =         "1999",
-}
-
-@Article{bottou+al:1999,
-  author =       "L. Bottou and P. Haffner and P.G. Howard and P. Simard and Y. Bengio",
-  title =        "High quality document image compression with {DjVu}",
-  journal =      "Journal of Electronic Imaging",
-  volume =       "7",
-  number =       "3",
-  pages =        "410--425",
-  year =         "1998",
-}
-
-@Article{bengio+al:1998,
-  author =       "Y. Bengio and F. Gingras and B. Goulard and J.-M. Lina",
-  title =        "Gaussian Mixture Densities for Classification of Nuclear Power Plant Data",
-  journal =      "Computers and Artificial Intelligence, special issue on Intelligent Technologies for Electric and Nuclear Power Plants",
-  volume =       "17",
-  number =       "2--3",
-  pages =        "189--209",
-  year =         "1998",
-}
-
-@Article{GingrasBengio:1998,
-  author =       "F. Gingras and Y. Bengio",
-  title =        "Handling Asynchronous or Missing Financial Data with Recurrent Networks",
-  journal =      "International Journal of Computational Intelligence and Organizations",
-  volume =       "1",
-  number =       "3",
-  pages =        "154--163",
-  year =         "1998",
-}
-
-@Article{BengioS95,
-  author =       "S. Bengio and Y. Bengio and J. Cloutier",
-  title =        "On the search for new learning rules for {ANN}s",
-  journal =      "Neural Processing Letters",
-  volume =       "2",
-  number =       "4",
-  pages =        "26--30",
-  year =         "1995",
-}
-
-@Article{BengioMori89,
-  author =       "Y. Bengio and R. De Mori",
-  title =        "Use of multilayer networks for the recognition of phonetic features and phonemes",
-  journal =      "Computational Intelligence",
-  volume =       "5",
-  pages =        "134--141",
-  year =         "1989",
-}
-
-@TechReport{BengioTR1178,
-  author =       "Yoshua Bengio and R\'ejean Ducharme and Pascal
-                 Vincent",
-  title =        "A Neural Probabilistic Language Model",
-  number =       "1178",
-  institution =  "Dept. IRO, Universit\'e de Montr\'eal",
-  year =         "2002",
-}
-
-@TechReport{BengioTR1215,
-  author =       "Yoshua Bengio",
-  title =        "New Distributed Probabilistic Language Models",
-  number =       "1215",
-  institution =  "Dept. IRO, Universit\'e de Montr\'eal",
-  year =         "2002",
-}
-
-@Book{Bengio_book96,
-  author =       "Yoshua Bengio",
-  title =        "Neural Networks for Speech and Sequence Processing",
-  publisher =    "International Thomson Computer Press",
-  year =         "1996",
-}
-
-@InProceedings{Bengio_icnn93,
-  author =       "Y. Bengio and P. Frasconi and P. Simard",
-  booktitle =    icnn,
-  title =        "The problem of learning long-term dependencies in
-                 recurrent networks",
-  publisher =    "IEEE Press",
-  address =      "San Francisco",
-  pages =        "1183--1195",
-  year =         "1993",
-  note =         "(invited paper)",
-}
-
-@Article{Bengio_trnn94,
-  author =       "Y. Bengio and P. Simard and P. Frasconi",
-  title =        "Learning Long-Term Dependencies with Gradient Descent
-                 is Difficult",
-  journal =      ieeetrnn,
-  volume =       "5",
-  number =       "2",
-  pages =        "157--166",
-  year =         "1994",
-  note =         "Special Issue on Recurrent Neural Networks, March 94",
-}
-
-@Book{Benveniste90,
-  author =       "A. Benveniste and M. Metivier and P. Priouret",
-  title =        "Adaptive Algorithms and Stochastic Approximations",
-  publisher =    "Springer-Verlag",
-  address =      "Berlin, New York",
-  year =         "1990",
-}
-
-@Book{Berger85,
-  author =       "J. Berger",
-  title =        "Statistical Decision Theory and {Bayesian} Analysis",
-  publisher =    "Springer",
-  year =         "1985",
-}
-
-@Misc{berger97improved,
-  author =       "A. Berger",
-  title =        "The improved iterative scaling algorithm: {A} gentle
-                 introduction",
-  year =         "1997",
-  URL =          "citeseer.ist.psu.edu/berger97improved.html",
-  text =         "Berger, A. (1997). The improved iterative scaling
-                 algorithm: A gentle introduction.
-                 http://www.cs.cmu.edu/afs/cs/user/aberger/www/ps/scaling.ps.",
-}
-
-@article{Berkes-Wiskott-2005,
-    author = {Berkes, Pietro and Wiskott, Laurenz},
-    title = {Slow Feature Analysis Yields a Rich Repertoire of Complex Cell Properties},
-    journal = {Journal of Vision},
-    ISSN = {1534-7362},
-    volume = {5},
-    number = {6},
-    pages = {579-602},
-    year = {2005},
-    month = {7}
-}
-
-@Article{Beurle56,
-  author =       "R. L. Beurle",
-  title =        "Properties of a Mass of Cells Capable of Regenerating
-                 Pulses",
-  journal =      PTRSL,
-  volume =       "240",
-  pages =        "55--94",
-  year =         "1956",
-}
-
-@InProceedings{Beyer+al-1999,
-  author =       "Kevin S. Beyer and Jonathan Goldstein and Raghu Ramakrishnan
-                 and Uri Shaft",
-  booktitle =    "Proceeding of the 7th International Conference on
-                 Database Theory",
-  title =        "When Is ``Nearest Neighbor'' Meaningful?",
-  publisher =    "Springer-Verlag",
-  pages =        "217--235",
-  year =         "1999",
-  ISBN =         "3-540-65452-6",
-}
-
-@TechReport{Bianchini-rbf,
-  author =       "M. Bianchini and P. Frasconi and M. Gori",
-  title =        "Learning without Local Minima in Radial Basis Function
-                 Networks",
-  institution =  "Universit\`a di Firenze",
-  year =         "1992",
-  OPTannote =    "",
-}
-
-@Article{Bianchini-trnn94,
-  author =       "M. Bianchini and M. Gori and M. Maggini",
-  title =        "On the Problem of Local Minima in Recurrent Neural
-                 Networks",
-  journal =      ieeetrnn,
-  volume =       "5",
-  number =       "2",
-  pages =        "167--177",
-  year =         "1994",
-  OPTnote =      "(Special Issue on Recurrent Neural Networks)",
-}
-
-@TechReport{bickel+ritov95,
-  author =       "P. J. Bickel and Y. Ritov",
-  title =        "Inference in hidden {Markov} models {I}: local
-                 asymptotic normality in the stationary case",
-  number =       "Technical Report 383",
-  institution =  "Statistics Department, University of California,
-                 Berkeley",
-  year =         "February 1994, revised April 1995",
-}
-
-@Article{Bienenstock82,
-  author =       "E. L. Bienenstock and L. N. Cooper and P. W. Munro",
-  title =        "Theory for the Development of Neuron Selectivity:
-                 Orientation Specificity and Binocular Interaction in
-                 Visual Cortex",
-  journal =      jneuro,
-  volume =       "2",
-  year =         "1982",
-}
-
-@Article{BierdermanI1987,
-  author =       "Irving Bierderman",
-  title =        "Recognition-by-Components: {A} Theory of Human Image
-                 Understanding",
-  journal =      "Psychological Review",
-  volume =       "94",
-  number =       "2",
-  publisher =    "American Psychological Association, Inc.",
-  pages =        "115--147",
-  year =         "1987",
-  added-by =     "Daniel Acevedo",
-  date-added =   "Thu Oct 24 12:45:17 2002",
-  project =      "genetic",
-  theme =        "perception and vr and tech and natural and medicine
-                 and art",
-}
-
-@InProceedings{Bilbro89a,
-  author =       "G. Bilbro and R. Mann and T. K. Miller and W. E.
-                 Snyder and D. E. Van den Bout and M. White",
-  editor =       NIPS1ed,
-  booktitle =    NIPS1,
-  title =        "Optimization by Mean Field Annealing",
-  publisher =    "Morgan Kaufmann, San Mateo",
-  address =      "Denver, CO",
-  pages =        "91--98",
-  year =         "1989",
-}
-
-@InProceedings{Bilbro89b,
-  author =       "G. L. Bilbro and W. Snyder",
-  editor =       NIPS1ed,
-  booktitle =    NIPS1,
-  title =        "Range Image Restoration Using Mean Field Annealing",
-  publisher =    "Morgan Kaufmann, San Mateo",
-  address =      "Denver, CO",
-  pages =        "594--601",
-  year =         "1989",
-}
-
-@Article{Binder86,
-  author =       "K. Binder and A. P. Young",
-  title =        "Spin Glasses: Experimental Facts, Theoretical
-                 Concepts, and Open Questions",
-  journal =      rmp,
-  volume =       "58",
-  pages =        "801--976",
-  year =         "1986",
-}
-
-@Book{Binder88,
-  author =       "K. Binder and D. W. Heerman",
-  title =        "Monte Carlo Simulation in Statistical Mechanics",
-  publisher =    "Springer-Verlag",
-  address =      "Berlin",
-  year =         "1988",
-}
-
-@Book{bishop-book2006,
-  author =       "Christopher M. Bishop",
-  title =        "Pattern Recognition and Machine Learning",
-  publisher =    "Springer",
-  year =         "2006",
-}
-
-@Book{bishop-book95,
-  author =       "Christopher Bishop",
-  title =        "Neural Networks for Pattern Recognition",
-  publisher =    "Oxford University Press",
-  address =      "London, UK",
-  year =         "1995",
-}
-
-@Article{bishop92,
-  author =       "Christopher Bishop",
-  title =        "Exact calculation of the {Hessian} matrix for the
-                 multi-layer perceptron",
-  journal =      "Neural Computation",
-  volume =       "4",
-  number =       "4",
-  pages =        "494--501",
-  year =         "1992",
-}
-
-@Article{bishop95training,
-  author =       "Christopher M. Bishop",
-  title =        "Training with Noise is Equivalent to {Tikhonov}
-                 Regularization",
-  journal =      "Neural Computation",
-  volume =       "7",
-  number =       "1",
-  pages =        "108--116",
-  year =         "1995",
-}
-
-@Article{Blackscholes73,
-  author =       "F. Black and M. Scholes",
-  title =        "The Pricing of Options and Corporate Liabilities",
-  journal =      "Journal of Political Economy",
-  number =       "81",
-  pages =        "637--654",
-  year =         "1973",
-}
-
-@Article{Blakemore70,
-  author =       "C. Blakemore and G. F. Cooper",
-  title =        "Development of the Brain Depends on the Visual
-                 Environment",
-  journal =      nature,
-  volume =       "228",
-  pages =        "477--478",
-  year =         "1970",
-}
-
-@InCollection{Blitzer-nips17,
-  author =       "John Blitzer and Kilian Weinberger and Lawrence Saul
-                 and Fernando Pereira",
-  editor =       NIPS17ed,
-  booktitle =    NIPS17,
-  title =        "Hierarchical Distributed Representations for
-                 Statistical Language Modeling",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  year =         "2005",
-}
-
-@InProceedings{Blitzer05,
-  author =       "John Blitzer and Kilian Weinberger and Lawrence Saul
-                 and Fernando Pereira",
-  editor =       NIPS18ed,
-  booktitle =    NIPS18,
-  title =        "Hierarchical Distributed Representations for
-                 Statistical Language Modeling",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  year =         "2005",
-}
-
-@InProceedings{Blitzer2005,
-  author =       "J. Blitzer and K. Q. Weinberger and L. K. Saul and F.
-                 C. N. Pereira",
-  editor =       NIPS17ed,
-  booktitle =    NIPS17,
-  title =        "Hierarchical distributed representations for
-                 statistical language models",
-  publisher =    "{MIT} Press",
-  year =         "2005",
-}
-
-@Article{Block62,
-  author =       "H. D. Block",
-  title =        "The Perceptron: {A} Model for Brain Functioning",
-  journal =      rmp,
-  volume =       "34",
-  year =         "1962",
-}
-
-@InProceedings{Blum+Rivest,
-  author =       "A. Blum and R. L. Rivest",
-  editor =       NIPS1ed,
-  booktitle =    NIPS1,
-  title =        "Training a 3-node Neural Net is {NP}-Complete",
-  publisher =    "Morgan Kaufmann",
-  address =      "San Mateo, CA",
-  pages =        "494--501",
-  year =         "1989",
-}
-
-@InProceedings{blum01learning,
-  author =       "Avrim Blum and Shuchi Chawla",
-  booktitle =    "Proc. 18th International Conf. on Machine Learning",
-  title =        "Learning from Labeled and Unlabeled Data Using Graph
-                 Mincuts",
-  publisher =    "Morgan Kaufmann, San Francisco, CA",
-  pages =        "19--26",
-  year =         "2001",
-}
-  %URL =          "citeseer.ist.psu.edu/blum01learning.html",
-
-@InProceedings{blum98combining,
-  author =       "Avrim Blum and Tom Mitchell",
-  booktitle =    colt98,
-  publisher =    "Morgan Kaufmann Publishers",
-  title =        "Combining Labeled and Unlabeled Data with
-                 Co-training",
-  pages =        "92--100",
-  year =         "1998",
-}
-  %URL =          "citeseer.ist.psu.edu/blum98combining.html",
-
-@InProceedings{blum98combining-small,
-  author =       "Avrim Blum and Tom Mitchell",
-  booktitle =    "COLT'98",
-  title =        "Combining Labeled and Unlabeled Data with
-                 Co-training",
-  pages =        "92--100",
-  year =         "1998",
-}
-  %URL =          "citeseer.ist.psu.edu/blum98combining.html",
-
-@InProceedings{blum99,
-  author =       "A. Blum and A. Kalai and J. Langford",
-  booktitle =    colt99,
-  title =        "Beating the hold-out: Bounds for k-fold and
-                 progressive cross-validation",
-  pages =        "",
-  year =         "1999",
-}
-
-@InProceedings{Blumer86,
-  author =       "A. Blumer and A. Ehrenfeucht and D. Haussler and M.
-                 Warmuth",
-  booktitle =    "Proceedings of the Eighteenth Annual ACM Symposium on
-                 Theory of Computing",
-  title =        "Classifying Learnable Geometric Concepts with the
-                 Vapnik-Chervonenkis Dimension",
-  publisher =    "ACM, Salem",
-  address =      "Berkeley 1986",
-  pages =        "273--282",
-  year =         "1986",
-}
-
-@Article{Blumer87,
-  author =       "A. Blumer and A. Ehrenfeucht and D. Haussler and M.
-                 Warmuth",
-  title =        "Occam's razor",
-  journal =      "Inf. Proc. Let.",
-  volume =       "24",
-  pages =        "377--380",
-  year =         "1987",
-}
-
-@Article{Blumstein79,
-  author =       "S. E. Blumstein and K. N. Stevens",
-  title =        "Acoustic invariance in speech production: Evidence
-                 from measurements of the spectral characteristics of
-                 stop consonants",
-  journal =      "Journal of the Acoustical Society of America",
-  volume =       "66",
-  number =       "4",
-  pages =        "1001--1018",
-  year =         "1979",
-}
-
-@Article{Bohm96,
-  author =       "G. Bohm",
-  title =        "New approaches in molecular structure prediction",
-  journal =      "Biophys. Chem.",
-  volume =       "59",
-  pages =        "1--32",
-  year =         "1996",
-}
-
-@Article{Bohr88,
-  author =       "H. Bohr and J. Bohr and S. Brunak and R. M. J.
-                 Cotterill and B. Lautrup and L. Norskov and O. H.
-                 Olsen and S. B. Petersen",
-  title =        "Protein Secondary Structure and Homology by Neural
-                 Networks: The $\alpha$-Helices in Rhodopsin",
-  journal =      febsl,
-  volume =       "241",
-  pages =        "223--228",
-  year =         "1988",
-}
-
-@InProceedings{bollacker98,
-  author =       "Kurt D. Bollacker and Joydeep Ghosh",
-  booktitle =    ICML98,
-  editor =       ICML98ed,
-  publisher =    ICML98publ,
-  title =        "A Supra-Classifier Architecture for Scalable Knowledge
-                 Reuse",
-  address =      "San Francisco, CA, USA",
-  pages =        "64--72",
-  year =         "1998",
-}
-
-@InProceedings{BonillaE2007,
-  author =       "Edwin V. Bonilla and Felix V. Agakov and Christopher
-                 K. I. Williams",
-  booktitle =    "Proceedings of AISTATS 2007",
-  title =        "Kernel Multi-task Learning using Task-specific
-                 Features",
-  year =         "2007",
-}
-
-@Article{Bonomo94,
-  author =       "M. Bonomo and R. Garcia",
-  title =        "Can a well-fitted equilibrium asset-pricing model
-                 produce mean reversion?",
-  journal =      "Journal of Applied Econometrics",
-  volume =       "9",
-  pages =        "19--29",
-  year =         "1994",
-}
-
-@Article{bordes-09,
-  author =  {Bordes, Antoine and Bottou, L\'eon and Gallinari, Patrick},
-  title =   {SGD-QN: Careful Quasi-Newton Stochastic Gradient Descent},
-  journal = {Journal of Machine Learning Research},
-  year =    {2009},
-  volume =  {10},
-  pages =   {1737-1754},
-  month =   {July},
-}
-
-@Book{Bornstein-critical-87,
-		author = { Bornstein, Marc H. },
-		title = { Sensitive periods in development : interdisciplinary
-				perspectives / edited by Marc H. Bornstein },
-		publisher = { Lawrence Erlbaum Associates, Hillsdale, N.J. : },
-		year = { 1987 },
-		type = { Book },
-}
-
-
-@Article{boser-92,
-  author =       "B. Boser and E. Sackinger and J. Bromley and Y. {LeCun}
-                 and L. Jackel",
-  title =        "An analog neural network processor with programmable
-                 topology",
-  journal =      "IEEE Journal of Solid-State Circuits",
-  volume =       "26",
-  number =       "12",
-  pages =        "2017--2025",
-  month =        dec,
-  year =         "1991",
-}
-
-@InProceedings{Boser92,
-  author =       "Bernhard E. Boser and Isabelle M. Guyon and Vladimir N. Vapnik",
-  booktitle =    "Fifth Annual Workshop on Computational Learning
-                 Theory",
-  title =        "A training algorithm for optimal margin classifiers",
-  publisher =    "ACM",
-  address =      "Pittsburgh",
-  pages =        "144--152",
-  year =         "1992",
-  doi =          {http://doi.acm.org/10.1145/130385.130401},
-  isbn = {0-89791-497-X},
-}
-
-@incollection{bottou-bousquet-2008,
-  author = {Bottou, L\'{e}on and Bousquet, Olivier},
-  title = {The Tradeoffs of Large Scale Learning},
-  editor = NIPS20ed,
-  booktitle = NIPS20,
-  publisher = {MIT Press},
-  year = {2008},
-  volume = {20},
-  address = {Cambridge, MA},
-  url = "http://leon.bottou.org/papers/bottou-bousquet-2008",
-}
-
-@TechReport{Bottou+96,
-  author =       "L{\'e}on Bottou and Yoshua Bengio and Yann A. {Le Cun}",
-  title =        "Document Analysis with Generalized Transduction",
-  number =       "HA6156000-960701-01TM",
-  institution =  "AT\&T Laboratories",
-  address =      "Holmdel, New-Jersey",
-  month =        jul,
-  year =         "1996",
-}
-
-@Article{Bottou+LeCun05,
-  author =       "L{\'e}on Bottou and Yann {LeCun}",
-  title =        "Graph Transformer Networks for Image Recognition",
-  journal =      "Bulletin of the International Statistical Institute",
-  year =         "2005",
-}
-
-@TechReport{bottou-1996a,
-  author =       "L{\'{e}}on Bottou and Yoshua Bengio and Yann {Le Cun}",
-  title =        "Document Analysis with Transducers",
-  number =       "{960701}-{01}-{TM}",
-  institution =  "AT\&T Labs Technical Memorandum",
-  month =        jun,
-  year =         "1996",
-}
-
-@InProceedings{bottou-lecun-04b,
-  author =       "Leon Bottou and Yann {LeCun}",
-  editor =       NIPS15ed,
-  booktitle =    NIPS15,
-  title =        "Large-Scale On-Line Learning",
-  publisher =    "MIT Press",
-  year =         "2004",
-  original =     "orig/bottou-lecun-04b.ps.gz",
-}
-
-@InCollection{bottou-mlss-2004,
-  author =       "L\'{e}on Bottou",
-  editor =       "Olivier Bousquet and Ulrike von Luxburg",
-  booktitle =    "Advanced Lectures on Machine Learning",
-  title =        "Stochastic Learning",
-  number =       "LNAI 3176",
-  publisher =    "Springer Verlag",
-  address =      "Berlin",
-  pages =        "146--168",
-  year =         "2004",
-  series =       "Lecture Notes in Artificial Intelligence",
-  URL =          "http://leon.bottou.org/papers/bottou-mlss-2004",
-}
-
-@Article{Bottou90,
-  author =       "L. Bottou and F. Fogelman-Souli\'e and P. Blanchet and
-                 J. S. Lienard",
-  key =          "bottou",
-  title =        "Speaker independent isolated digit recognition:
-                 multilayer perceptrons vs dynamic time warping",
-  journal =      "Neural Networks",
-  volume =       "3",
-  pages =        "453--465",
-  year =         "1990",
-}
-
-@InProceedings{Bottou91,
-  author =       "L. Bottou and P. Gallinari",
-  editor =       NIPS3ed,
-  booktitle =    NIPS3,
-  title =        "A Framework for the Cooperation of Learning
-                 Algorithms",
-  address =      "Denver, CO",
-  pages =        "781--788",
-  year =         "1991",
-}
-
-@Article{Bottou92,
-  author =       "L. Bottou and V. Vapnik",
-  key =          "Bottou92",
-  title =        "Local Learning Algorithms",
-  journal =      nc,
-  volume =       "4",
-  number =       "6",
-  pages =        "888--900",
-  year =         "1992",
-}
-
-@InProceedings{Bottou94,
-  author =       "L. Bottou and C. Cortes and J. S. Denker and H.
-                 Drucker and I. Guyon and L. D. Jackel and Y. {LeCun} and
-                 U. A. Muller and E. Sackinger and P. Simard and V.
-                 Vapnik",
-  booktitle =    "International Conference on Pattern Recognition",
-  title =        "Comparison of classifier methods: a case study in
-                 handwritten digit recognition",
-  address =      "Jerusalem, Israel",
-  year =         "1994",
-}
-
-@InProceedings{Bottou97,
-  author =       "L{\'e}on Bottou and Yoshua Bengio and Yann {LeCun}",
-  booktitle =    cvpr97,
-  title =        "Global Training of Document Processing Systems using
-                 Graph Transformer Networks",
-  publisher =    "IEEE",
-  address =      "Puerto Rico",
-  pages =        "490--494",
-  year =         "1997",
-}
-
-@InCollection{Bottou98,
-  author =       "L{\'e}on Bottou",
-  editor =       "David Saad",
-  booktitle =    "Online Learning in Neural Networks",
-  title =        "Online Algorithms and Stochastic Approximations",
-  publisher =    "Cambridge University Press",
-  address =      "Cambridge, UK",
-  pages =        "",
-  year =         "1998",
-}
-
-@PhdThesis{Bottou_these91,
-  author =       "L\'eon Bottou",
-  title =        "Une approche th\'eorique de l'apprentissage
-                 connexioniste; applications \`a la reconnaissance de la
-                 parole",
-  school =       "Universit\'e de Paris XI",
-  year =         "1991",
-}
-
-@InProceedings{BouchardG2004,
-  author =       "Guillaume Bouchard and Bill Triggs",
-  booktitle =    "IASC International Symposium on Computational
-                 Statistics (COMPSTAT)",
-  title =        "The Tradeoff Between Generative and Discriminative
-                 Classifiers",
-  address =      "Prague",
-  pages =        "721--728",
-  month =        aug,
-  year =         "2004",
-  keywords =     "LEAR, LAVA",
-}
-  %URL =          "http://lear.inrialpes.fr/pubs/2004/BT04",
-
-@inproceedings{BouchardG2007,
- author = {Guillaume Bouchard},
- title = {Bias-Variance Tradeoff in Hybrid Generative-Discriminative Models},
- booktitle = ICML07,
- editor =    ICML07ed,
- publisher = ICML07publ,
- year = {2007},
- isbn = {0-7695-3069-9},
- pages = {124--129},
- address = {Washington, DC, USA},
- }
- %doi = {http://dx.doi.org/10.1109/ICMLA.2007.23},
-
-@Article{Bourlard-cspla89,
-  author =       "H. Bourlard and C. Wellekens",
-  title =        "Speech Pattern Discrimination and Multi-Layered
-                 Perceptrons",
-  journal =      cspla,
-  volume =       "3",
-  pages =        "1--19",
-  year =         "1989",
-}
-
-@Article{Bourlard-pami90,
-  author =       "H. Bourlard and C. Wellekens",
-  title =        "Links between Hidden {Markov} Models and Multilayer
-                 Perceptrons",
-  journal =      ieeetpami,
-  volume =       "12",
-  pages =        "1167--1178",
-  year =         "1990",
-}
-
-@Article{Bourlard88,
-  author =       "H. Bourlard and Y. Kamp",
-  title =        "Auto-Association by Multilayer Perceptrons and
-                 Singular Value Decomposition",
-  journal =      biocyb,
-  volume =       "59",
-  pages =        "291--294",
-  year =         "1988",
-}
-
-@Book{Bourlard93,
-  author =       "H. Bourlard and N. Morgan",
-  title =        "Connectionist Speech Recognition. {A} Hybrid
-                 Approach",
-  volume =       "247",
-  publisher =    "Kluwer Academic Publishers",
-  address =      "Boston",
-  year =         "1993",
-  series =       "The Kluwer international series in engineering and
-                 computer science",
-}
-
-@Article{Bourlard_cspla89,
-  author =       "H Bourlard and C. Wellekens",
-  title =        "Speech Pattern Discrimination and Multi-Layered
-                 Perceptrons",
-  journal =      cspla,
-  volume =       "3",
-  pages =        "1--19",
-  year =         "1989",
-  OPTnote =      "",
-}
-
-@InCollection{Bourrely89,
-  author =       "J. Bourrely",
-  booktitle =    "Hypercube and distributed computers",
-  title =        "Parallelization of a Neural Learning Algorithm on a
-                 Hypercube",
-  publisher =    "Elsiever Science Publishing, North Holland",
-  pages =        "219--229",
-  year =         "1989",
-}
-
-@inproceedings{Bouveyron-Chipman-2007,
- author = {C. Bouveyron and H. Chipman},
- title = {Visualization and classification of graph-structured data: the case of the {E}nron dataset}, 
- booktitle = ijcnn,
- pages = {1506--1511}, 
- year = 2007,
-}
-
-@Book{Box73,
-  author =       "G. E. P. Box and G. C. Tiao",
-  title =        "Bayesian inference in statistical analysis",
-  publisher =    "Addison-Wesley",
-  year =         "1973",
-}
-
-@Book{BoxJenkins,
-  author =       "G. E. P. Box and G. M. Jenkins",
-  title =        "Time Series Analysis: Forecasting and Control.",
-  publisher =    "Holden-Day",
-  address =      "San Francisco",
-  year =         "1970",
-}
-
-@Book{Boyd04,
-  author =       "Stephen Boyd and Lieven Vandenberghe",
-  title =        "Convex Optimization",
-  publisher =    "Cambridge University Press",
-  address =      "New York, NY, USA",
-  year =         "2004",
-  ISBN =         "0-521-83378-7",
-}
-
-@incollection{Bradley+Bagnell-2009,
- title = {Differentiable Sparse Coding},
- author = {J. Andrew Bagnell and David M. Bradley},
- editor =       NIPS21ed,
- booktitle =    NIPS21,
- pages = {},
- publisher = {NIPS Foundation},
- year = {2009}
-}
-
-@PhdThesis{Bradley-thesis,
-  author =       "David Bradley",
-  title =        "Learning in Modular Systems",
-  school =       "The Robotics Institute, Carnegie Mellon University",
-  year =         "2009",
-}
-
-@Article{Brady-ieeecas89,
-  author =       "M. L. Brady and R. Raghavan and J. Slawny",
-  title =        "Back-Propagation Fails to Separate Where Perceptrons
-                 Succeed",
-  journal =      ieeetcas,
-  volume =       "36",
-  pages =        "665--674",
-  year =         "1989",
-}
-
-@Article{Brady89,
-  author =       "M. L. Brady and R. Raghavan and J. Slawny",
-  title =        "Back-Propagation fails to Separate Where Perceptrons
-                 Succeed",
-  journal =      "IEEE Transactions on Circuits and Systems",
-  volume =       "36",
-  number =       "5",
-  pages =        "665--674",
-  year =         "1989",
-}
-
-@InProceedings{Bramson90,
-  author =       "M. J. Bramson and R. G. Hoptroff",
-  booktitle =    "Workshop on Neural Networks for Statistical and
-                 Economic Data",
-  title =        "Forecasting the Economic Cycle: a Neural Network
-                 Approach",
-  address =      "Dublin",
-  year =         "1990",
-}
-
-@InProceedings{Brand2003,
-  author =       "M. Brand",
-  editor =       NIPS15ed,
-  booktitle =    NIPS15,
-  title =        "Charting a manifold",
-  publisher =    "{MIT} Press",
-  pages =        "961--968",
-  year =         "2003",
-}
-
-@Article{Brand99,
-  author =       "Matthew Brand",
-  title =        "Structure Learning in Conditional Probability Models
-                 via an Entropic Prior and Parameter Extinction",
-  journal =      "Neural Computation",
-  volume =       "11",
-  number =       "5",
-  pages =        "1155--1182",
-  year =         "1999",
-}
-
-@InProceedings{Brandt88,
-  author =       "R. D. Brandt and Y. Wang and A. J. Laub and S. K.
-                 Mitra",
-  booktitle =    icnn,
-  title =        "Alternative Networks for Solving the Travelling
-                 Salesman Problem and the List-Matching Problem",
-  volume =       "2",
-  publisher =    "IEEE, New York",
-  address =      "San Diego 1988",
-  pages =        "333--340",
-  year =         "1988",
-}
-
-@inproceedings{BreglerC1994,
-    author = "Christoph Bregler and Stephen M. Omohundro",
-    editor =       NIPS6ed,
-    booktitle =    NIPS6,
-    title = "Surface Learning with Applications to Lipreading",
-    publisher = "Morgan Kaufmann Publishers, Inc.",
-    pages = "43--50",
-    year = "1994",
-}
-
-
-@Article{Breiman-96,
-  author =       "L. Breiman",
-  title =        "Heuristics of instability and stabilization in model
-                 selection",
-  journal =      "The Annals of Statistics",
-  volume =       "24",
-  number =       "6",
-  pages =        "2350--2383",
-  year =         "1996",
-}
-
-@Article{breiman-stability-96,
-  author =       "L. Breiman",
-  title =        "Heuristics of Instability and Stabilization in Model
-                 Selection",
-  journal =      "Annals of Statistics",
-  volume =       "24",
-  number =       "6",
-  pages =        "2350--2383",
-  year =         "1996",
-}
-
-@Article{Breiman01,
-  author =       "Leo Breiman",
-  title =        "Random Forests",
-  journal =      "Machine Learning",
-  volume =       "45",
-  number =       "1",
-  pages =        "5--32",
-  year =         "2001",
-}
-
-@Book{Breiman84,
-  author =       "L. Breiman and J. H. Friedman and R. A. Olshen and C.
-                 J. Stone",
-  title =        "Classification and Regression Trees",
-  publisher =    "Wadsworth International Group",
-  address =      "Belmont, CA",
-  year =         "1984",
-}
-
-@TechReport{Breiman96,
-  author =       "L. Breiman",
-  title =        "Bias, Variance, and Arcing Classifiers",
-  number =       "Technical Report 460",
-  institution =  "Statistics Department, University of California",
-  address =      "Berkeley, CA 94720",
-  month =        apr,
-  year =         "1996",
-}
-
-@InCollection{Bridle+Cox91,
-  author =       "J. S. Bridle and S. J. Cox",
-  editor =       NIPS3ed,
-  booktitle =    NIPS3,
-  title =        "{RECNORM}: simultaneous normalisation and
-                 classification applied to speech recognition",
-  publisher =    "Morgan Kaufmann",
-  pages =        "234--240",
-  year =         "1991",
-}
-
-@InCollection{Bridle89,
-  author =       "J. Bridle",
-  editor =       "F. Fogelman-Souli\'e and J. {H\'{e}rault}",
-  booktitle =    "Neuro-computing: Algorithms, Architectures, and
-                 Applications",
-  title =        "Probabilistic interpretation of feedforward
-                 classification network outputs, with relationships to
-                 statistical pattern recognition",
-  publisher =    "Springer-Verlag",
-  address =      "New York",
-  year =         "1989",
-}
-
-@InCollection{Bridle89-nips,
-  author =       "J. S. Bridle",
-  editor =       NIPS2ed,
-  booktitle =    NIPS2,
-  title =        "Training Stochastic Model Recognition Algorithms as
-                 Networks can lead to Maximum Mutual Information
-                 Estimation of Parameters",
-  publisher =    "Morgan Kaufmann",
-  pages =        "211--217",
-  year =         "1990",
-}
-
-@Article{Bridle90,
-  author =       "J. S. Bridle",
-  title =        "Alphanets: a Recurrent `Neural' Network Architecture
-                 with a Hidden {Markov} Model Interpretation",
-  journal =      spcomm,
-  volume =       "9",
-  number =       "1",
-  pages =        "83--92",
-  year =         "1990",
-}
-
-@InCollection{Bridle90b,
-  author =       "J. S. Bridle",
-  editor =       NIPS2ed,
-  booktitle =    NIPS2,
-  title =        "Training Stochastic Model Recognition Algorithms as
-                 Networks can lead to Maximum Mutual Information
-                 Estimation of Parameters",
-  publisher =    "Morgan Kaufmann",
-  pages =        "211--217",
-  year =         "1990",
-}
-
-@InCollection{Bromley-siamese93,
-  author =       "J. Bromley and J. Benz and L. Bottou and I. Guyon and
-                 L. Jackel and Y. {LeCun} and C. Moore and E. Sackinger
-                 and R. Shah",
-  booktitle =    "Advances in Pattern Recognition Systems using Neural
-                 Network Technologies",
-  title =        "Signature verification using a siamese time delay
-                 neural network",
-  publisher =    "World Scientific, Singapore",
-  pages =        "669--687",
-  year =         "1993",
-}
-
-@InCollection{Bromley93,
-  author =       "J. Bromley and J. Benz and L. Bottou and I. Guyon and
-                 L. Jackel and Y. {LeCun} and C. Moore and E. Sackinger
-                 and R. Shah",
-  booktitle =    "Advances in Pattern Recognition Systems using Neural
-                 Network Technologies",
-  title =        "Signature verification using a siamese time delay
-                 neural network",
-  publisher =    "Series in Machine Perception and Artificial
-                 Intelligence, World Scientific, Singapore",
-  pages =        "669--687",
-  year =         "1993",
-}
-
-@Article{broomhead-lowe-88,
-  author =       "D. Broomhead and D. Lowe",
-  key =          "Broomhead",
-  title =        "Multivariable functional interpolation and adaptive
-                 networks",
-  journal =      "Complex Systems",
-  volume =       "2",
-  pages =        "321--355",
-  year =         "1988",
-}
-
-@TechReport{Brown-Hinton-PoHMM-2000,
-  author =       "Andrew Brown and Geoffrey Hinton",
-  title =        "Products of Hidden Markov Models",
-  number =       "GCNU TR 2000-004",
-  institution =  "Gatsby Unit, University College London",
-  year =         "2000",
-}
-
-@Book{Brown86,
-  author =       "Lawrence D. Brown",
-  title =        "Fundamentals of Statistical Exponential Families",
-  volume =       "9",
-  publisher =    "Inst. of Math. Statist. Lecture Notes Monograph
-                 Series",
-  year =         "1986",
-}
-
-@Article{Brown92,
-  author =       "P. F. Brown and V. J. Della Pietra and P. V. DeSouza
-                 and J. C. Lai and R. L. Mercer",
-  title =        "Class-based {\it n}-gram models of natural language",
-  journal =      "Computational Linguistics",
-  volume =       "18",
-  pages =        "467--479",
-  year =         "1992",
-}
-
-@PhdThesis{BrownPhD,
-  author =       "P. Brown",
-  title =        "The Acoustic-Modeling problem in Automatic Speech
-                 Recognition",
-  school =       "Dept. of Computer Science, Carnegie-Mellon
-                 University",
-  year =         "1987",
-}
-
-@InProceedings{Bruce-94,
-  author =       "Rebecca Bruce and Janyce Wiebe",
-  booktitle =    "{ARPA} Workshop on Human Language Technology",
-  title =        "A new approach to sense identification",
-  address =      "Plainsboro, {NJ}",
-  year =         "1994",
-}
-
-@InProceedings{Brugnara92,
-  author =       "F. Brugnara and R. DeMori and D. Giuliani and M.
-                 Omologo",
-  booktitle =    icassp,
-  title =        "A family of parallel hidden Markov models",
-  publisher =    "IEEE",
-  address =      "New York, NY, USA",
-  pages =        "377--370",
-  year =         "1992",
-}
-
-@Article{Brunak89,
-  author =       "S. Brunak and B. Lautrup",
-  title =        "Liniedeling med et Neuralt Nev{\ae}rk",
-  journal =      SAML,
-  volume =       "14",
-  pages =        "55--74",
-  year =         "1989",
-}
-
-@Book{Brunak90,
-  author =       "S. Brunak and B. Lautrup",
-  title =        "Neural Networks: Computers with Intuition",
-  publisher =    "World Scientific",
-  address =      "Singapore",
-  year =         "1990",
-}
-
-@Article{Brunak91,
-  author =       "S. Brunak and J. Engelbrecht and S. Knudsen",
-  title =        "Prediction of human {mRNA} donor and acceptor sites
-                 from the {DNA} sequence",
-  journal =      "J. Molec. Biol.",
-  volume =       "220",
-  pages =        "49--65",
-  year =         "1991",
-}
-
-@Book{Bryson69,
-  author =       "A. E. Bryson and Y.-C. Ho",
-  title =        "Applied Optimal Control",
-  publisher =    "Blaisdell",
-  address =      "New York",
-  year =         "1969",
-}
-
-@Article{BT-the-fitting-1974,
-  author =       "A. E. Beaton and J. W. Tukey",
-  title =        "The fitting of power series, meaning polynomials,
-                 illustrted on band-spectroscopic data",
-  journal =      "Technometrics",
-  volume =       "16",
-  pages =        "147--185",
-  year =         "1974",
-}
-
-@article{Buia-Tiesinga-2006,
- author = {Calin Buia and Paul Tiesinga},
- title = {Attentional modulation of firing rate and synchrony in a model cortical network},
- journal = {J. Computational Neuroscience},
- volume = 20,
- pages = {247--264},
- year = 2006,
-}
-
-@TechReport{buhlmann97,
-  author =       "P. Buhlmann and A. J. Wyner",
-  title =        "Variable Length Markov Chains",
-  number =       "technical report 479",
-  institution =  "Statistics Department, University of California,
-                 Berkeley",
-  month =        jan,
-  year =         "1997",
-}
-
-@Article{Buhmann87,
-  author =       "J. Buhmann and K. Schulten",
-  title =        "Noise-Driven Temporal Association in Neural Networks",
-  journal =      eul,
-  volume =       "4",
-  pages =        "1205--1209",
-  year =         "1987",
-}
-
-@InProceedings{Buhmann88,
-  author =       "J. Buhmann and K. Schulten",
-  editor =       "R. Eckmiller and Ch. von der Malsburg",
-  booktitle =    "Neural Computers",
-  title =        "Storing Sequences of Biased Patterns in Neural
-                 Networks with Stochastic Dynamics",
-  publisher =    "Springer-Verlag, Berlin",
-  address =      "Neuss 1987",
-  pages =        "231--242",
-  year =         "1988",
-}
-
-@Article{Buntine94,
-  author =       "W. Buntine",
-  title =        "Operations for Learning with Graphical Models",
-  journal =      "Journal of Artificial Intelligence Research",
-  volume =       "2",
-  pages =        "159--225",
-  year =         "1994",
-}
-
-@InProceedings{Burges92,
-  author =       "C. Burges and O. Matan and Y. {LeCun} and J. Denker and
-                 L. Jackel and C. Stenard and C. Nohl and J. Ben",
-  booktitle =    ijcnn,
-  title =        "Shortest Path Segmentation: {A} Method for Training a
-                 Neural Network to Recognize character Strings",
-  volume =       "3",
-  address =      "Baltimore",
-  pages =        "165--172",
-  year =         "1992",
-}
-
-@Article{Burges93,
-  author =       "C. J. C. Burges and J. I. Ben and J. S. Denker and Y.
-                 {LeCun} and C. R. Nohl",
-  title =        "Off Line Recognition of Handwritten Postal Words Using
-                 Neural Networks",
-  journal =      "International Journal of Pattern Recognition and
-                 Artificial Intelligence",
-  volume =       "7",
-  number =       "4",
-  pages =        "689",
-  year =         "1994",
-}
-
-@Article{burges98,
-  author =       "C. J. C. Burges",
-  title =        "A Tutorial on {Support} {Vector} {Machines} for
-                 Pattern Recognition",
-  journal =      "Data Mining and Knowledge Discovery",
-  volume =       "2",
-  number =       "2",
-  pages =        "1--47",
-  year =         "1998",
-}
-
-@InCollection{Burges99Geometry,
-  author =       "C. J. C. Burges",
-  editor =       "B. {Sch\"olkopf} and C. J. C. Burges and A. J. Smola",
-  booktitle =    "Advances in Kernel Methods --- Support Vector
-                 Learning",
-  title =        "Geometry and invariance in kernel based methods",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  pages =        "89--116",
-  year =         "1999",
-}
-
-@Article{Burr83,
-  author =       "D. J. Burr",
-  title =        "Designing a handwriting reader",
-  journal =      ieeetpami,
-  volume =       "5",
-  number =       "5",
-  pages =        "554--559",
-  month =        sep,
-  year =         "1983",
-}
-
-@InProceedings{Burr88,
-  author =       "D. J. Burr",
-  booktitle =    icnn,
-  title =        "An Improved Elastic Net Method for the Travelling
-                 Salesman Problem",
-  volume =       "1",
-  publisher =    "IEEE, New York",
-  address =      "San Diego 1988",
-  pages =        "69--76",
-  year =         "1988",
-}
-
-@Article{Burrows94,
-  author =       "J. H. Burrows and J. Peck",
-  title =        "On-Line Condition Monitoring of Rotating Equipment
-                 Using Neural Networks",
-  journal =      "ISA Transactions",
-  volume =       "33",
-  pages =        "159--164",
-  year =         "1994",
-}
-
-@InProceedings{Burrows95,
-  author =       "J. H. Burrows and R. Doucet",
-  booktitle =    "Proceedings of COMADEM'95",
-  title =        "Machine Condition Monitoring Using Artificial Neural
-                 Networks to Process Vibration Data Obtained from
-                 Maintenance Monitoring Equipment",
-  address =      "Kingston, Ontario, Canada",
-  year =         "1995",
-}
-
-@Article{Byrne87,
-  author =       "J. H. Byrne",
-  title =        "Cellular analysis of associative learning",
-  journal =      "Physiological Review",
-  volume =       "67",
-  pages =        "329--439",
-  year =         "1987",
-}
-
-@InCollection{Byrne89,
-  author =       "J. H. Byrne and K. J. Gingrich and D. A. Baxter",
-  editor =       "Hawkins R. D. and Bower G. H.",
-  booktitle =    "Computational Models of Learning in Simple Neural
-                 Systems",
-  title =        "Computational capabilities of single neurons:
-                 relationship to simple forms of associative and
-                 nonassociative learning in {\it Aplysia}",
-  publisher =    "Academic Press",
-  pages =        "31--63",
-  year =         "1989",
-}
-
-@InProceedings{Cacciatore-nips94,
-  author =       "T. W. Cacciatore and Steven J. Nowlan",
-  editor =       NIPS6ed,
-  booktitle =    NIPS6,
-  title =        "Mixtures of Controllers for Jump Linear and Non-linear
-                 Plants",
-  publisher =    "Morgan Kaufmann",
-  address =      "San Mateo, CA",
-  year =         "1994",
-}
-
-@Article{Cai94,
-  author =       "J. Cai",
-  title =        "A Markov model of unconditional variance in {ARCH}",
-  journal =      "Journal of Business and Economic Statistics",
-  year =         "1994",
-}
-
-@inproceedings{Cai+al-2007,
-    author = {Cai, Deng   and He, Xiaofei   and Han, Jiawei  },
-    booktitle = ICCV07,
-    pages = {1--7},
-    title = {Semi-supervised Discriminant Analysis},
-    year = {2007}
-}
-
-@Article{Caianiello61,
-  author =       "E. R. Caianiello",
-  title =        "Outline of a Theory of Thought and Thinking Machines",
-  journal =      jtb,
-  volume =       "1",
-  pages =        "204--235",
-  year =         "1961",
-}
-
-@article{Campbell+Kulikowski-1966,
-    author = {F. W. Campbell and J. J. Kulikowski},
-    title = {Orientational selectivity of the human visual system},
-    journal = {Journal of Physiology},
-    year = 1966,
-    pages = "437--445",
-    address = "London"
-}
-
-@article{Campbell+al-1969,
-    title = {The Spatial Selectivity of the Visual Cells of the Cat},
-    author = {F. W. Campbell and G. F. Cooper and Enroth C. Cugell},
-    journal = {Journal of Physiology},
-    address = "London",
-    pages = {223--235},
-    volume = {203},
-    year = {1969},
-    biburl = {http://www.bibsonomy.org/bibtex/2cfcc4bc8437b72761251fb2b9e7eb106/schaul},
-    description = {idsia},
-}
-
-@InBook{CandelaJ2006,
-  author =       "J. Quiñonero Candela and C. E. Rasmussen and F. Sinz
-                 and O. Bousquet and B. Schölkopf",
-  booktitle =    "Machine learning challenges: Evaluating predictive
-                 uncertainty, visual object classification, and
-                 recognising textual entailment",
-  title =        "Evaluating Predictive Uncertainty Challenge",
-  publisher =    "Springer",
-  address =      "Heidelberg, Germany",
-  pages =        "1--27",
-  month =        apr,
-  year =         "2006",
-  series =       "Lecture Notes in Computer Science: 3944",
-  URL =          "http://www.springerlink.com/(yxluatzjo3gnpl45323wjs45)/app/home/contribution.asp?referrer=parent&backto=issue,1,25;journal,2,3638;linkingpublicationresults,1:105633,1",
-  abstract =     "This Chapter presents the PASCAL Evaluating
-                 Predictive Uncertainty Challenge, introduces the
-                 contributed Chapters by the participants who obtained
-                 outstanding results, and provides a discussion with
-                 some lessons to be learnt. The Challenge was set up to
-                 evaluate the ability of Machine Learning algorithms to
-                 provide good ``probabilistic predictions'', rather than
-                 just the usual ``point predictions'' with no measure of
-                 uncertainty, in regression and classification problems.
-                 Participants had to compete on a number of regression
-                 and classification tasks, and were evaluated by both
-                 traditional losses that only take into account point
-                 predictions and losses we proposed that evaluate the
-                 quality of the probabilistic predictions.",
-  OPTeditor =    "Quiñonero Candela, J., I. Dagan, B. Magnini, F.
-                 d'Alché-Buc",
-}
-
-@article{candeswakin08,
-author = "Candes, E. and Wakin, M.",
-title = "An introduction to compressive sampling",
-journal = "IEEE Signal Processing Magazine",
-volume = 21,
-year = 2008,
-}
-
-@article{Candes+Tao-2005,
- author = {E.J. Candes and T. Tao},
- title = {Decoding by linear programming},
- journal = {{IEEE} Transactions on Information Theory},
- volume = 51,
- number = 12,
- pages = {4203--4215},
- year = 2005,
-}
-
-@Article{Canning88,
-  author =       "A. Canning and E. Gardner",
-  title =        "Partially Connected Models of Neural Networks",
-  journal =      jpa,
-  volume =       "21",
-  pages =        "3275--3284",
-  year =         "1988",
-}
-
-@article{carandini:1994,
-    author = {Matteo Carandini and David J. Heeger},
-    title = {Summation and Division by Neurons in Primate Visual Cortex},
-    journal = {Science},
-    volume={264},
-    number={5163},
-    month = may,
-    year = {1994},
-    pages = {1333--1336},
-}
-
-@inproceedings{Cardie-1993,
-    author = "Claire Cardie",
-    title = "Using Decision Trees to Improve Case--Based Learning",
-    booktitle = "Proceedings of the Tenth International Conference on Machine Learning",
-    publisher = "Morgan Kaufmann",
-    pages = "25--32",
-    year = "1993",
-    url = "citeseer.ist.psu.edu/cardie93using.html"
-}
-
-@Article{Carpenter87a,
-  author =       "G. A. Carpenter and S. Grossberg",
-  title =        "A Massively Parallel Architecture for a
-                 Self-Organizing Neural Pattern Recognition Machine",
-  journal =      cvgip,
-  volume =       "37",
-  pages =        "54--115",
-  year =         "1987",
-}
-
-@Article{Carpenter87b,
-  author =       "G. A. Carpenter and S. Grossberg",
-  title =        "{ART2}: Self-Organization of Stable Category
-                 Recognition Codes for Analog Input Patterns",
-  journal =      applopt,
-  volume =       "26",
-  pages =        "4919--4930",
-  year =         "1987",
-}
-
-@Article{Carpenter88,
-  author =       "G. A. Carpenter and S. Grossberg",
-  title =        "The {ART} of Adaptive Pattern Recognition by a
-                 Self-Organizing Neural Network",
-  journal =      computer,
-  pages =        "77--88",
-  month =        mar,
-  year =         "1988",
-}
-
-@InProceedings{Carrasco94,
-  author =       "R. C. Carrasco and J. Oncina",
-  booktitle =    "Grammatical Inference and Applications Proc. of the
-                 2nd International Colloquium on Grammatical Inference
-                 ICGI94",
-  title =        "Learning regular grammars by means of a state merging
-                 method",
-  publisher =    "Lecture Notes in Artificial Intelligence 862",
-  address =      "Alicante (Spain)",
-  month =        sep,
-  year =         "1994",
-}
-
-@Article{Carter94,
-  author =       "C. K. Carter and R. Kohn",
-  title =        "On Gibbs sampling for state space models",
-  journal =      "Biometrika",
-  volume =       "81",
-  pages =        "541--553",
-  year =         "1994",
-}
-
-@InProceedings{Caruana-2001,
-  author =       "Rich Caruana",
-  booktitle =    aistats01,
-  title =        "A Non-Parametric {EM}-Style Algorithm for Imputing
-                 Missing Values",
-  publisher =    "Society for Artificial Intelligence and Statistics",
-  year =         "2001",
-}
-
-@InProceedings{caruana06:empirical,
-  author =       "R. Caruana and A. Niculescu-Mizil",
-  booktitle =    ICML06,
-  editor =       ICML06ed,
-  publisher =    ICML06publ,
-  title =        "An Empirical Comparison of Supervised Learning
-                 Algorithms",
-  year =         "2006",
-}
-
-@InProceedings{caruana93a,
-  author =       "Rich Caruana",
-  booktitle =    "Proceedings of the 1993 Connectionist Models Summer
-                 School",
-  title =        "Multitask Connectionist Learning",
-  pages =        "372--379",
-  year =         "1993",
-}
-
-@InProceedings{caruana93a-small,
-  author =       "Rich Caruana",
-  booktitle =    "Proceedings of the 1993 Connectionist Models Summer
-                 School",
-  title =        "Multitask Connectionist Learning",
-  pages =        "372--379",
-  year =         "1993",
-}
-  %url =          "http://citeseer.ist.psu.edu/32984.html",
-
-@InProceedings{caruana95,
-  author =       "Rich Caruana",
-  editor =       NIPS7ed,
-  booktitle =    NIPS7,
-  title =        "Learning Many Related Tasks at the Same Time With
-                 Backpropagation",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  pages =        "657--664",
-  year =         "1995",
-}
-
-@InProceedings{caruana96,
-  author =       "Rich Caruana and Shumeet Baluja and Tom Mitchell",
-  editor =       NIPS8ed,
-  booktitle =    NIPS8,
-  title =        "Using the Future to ``Sort Out'' the Present: Rankprop
-                 and Multitask Learning for Medical Risk Evaluation",
-  publisher =    "",
-  address =      "",
-  pages =        "",
-  year =         "1996",
-}
-
-@InProceedings{caruana96c,
-  author =       "Rich Caruana",
-  booktitle =    "International Conference on Machine Learning",
-  title =        "Algorithms and Applications for Multitask Learning",
-  pages =        "87--95",
-  year =         "1996",
-}
-
-@Article{caruana97a,
-  author =       "Rich Caruana",
-  title =        "Multitask Learning",
-  journal =      "Machine Learning",
-  volume =       "28",
-  number =       "1",
-  publisher =    "Kluwer Academic Publishers",
-  address =      "Hingham, MA, USA",
-  pages =        "41--75",
-  year =         "1997",
-}
-
-@Article{Casdagli89,
-  author =       "M. Casdagli",
-  title =        "Nonlinear Prediction of Chaotic Time Series",
-  journal =      physicaD,
-  volume =       "35",
-  pages =        "335--356",
-  year =         "1989",
-}
-
-@book{Casella+Berger-2001,
- author = {George Casella and Roger Berger},
- title = {Statistical Inference},
- publisher = {Duxbury Press},
- year = 2001,
-}
-
-
-@Article{Cashman+Pouliot90,
-  author =       "N. R. Cashman and Y. Pouliot",
-  title =        "{EBV} {Ig}-like domains",
-  journal =      "Nature",
-  volume =       "343",
-  pages =        "319",
-  year =         "1990",
-}
-
-@ARTICLE{CataltepeZ1999,
-    author = {Zehra Cataltepe and Yaser S. Abu-Mostafa and Malik Magdon-Ismail},
-    title = {No free lunch for early stopping},
-    journal = {Neural Computation},
-    year = {1999},
-    volume = {11},
-    pages = {995--1009}
-}
-
-@InProceedings{Cater87,
-  author =       "J. P. Cater",
-  editor =       "M. Caudill and C. Butler",
-  booktitle =    icnn,
-  title =        "Successfully Using Peak Learning Rates of 10 (and
-                 Greater) in Back-Propagation Networks with the
-                 Heuristic Learning Algorithm",
-  volume =       "2",
-  publisher =    "IEEE, New York",
-  address =      "San Diego 1987",
-  pages =        "645--651",
-  year =         "1987",
-}
-
-@Book{Caudill89,
-  author =       "M. Caudill",
-  title =        "Neural Networks Primer",
-  publisher =    "Miller Freeman",
-  address =      "San Francisco",
-  year =         "1989",
-}
-
-@Manual{CC01a,
-  author =       "Chih-Chung Chang and Chih-Jen Lin",
-  title =        "{LIBSVM}: a library for support vector machines",
-  year =         "2001",
-  note =         "Software available at
-                 \verb+http://www.csie.ntu.edu.tw/~cjlin/libsvm+",
-}
-
-@Article{cemgil+kappen+barber-2006,
-  author =       "A. T. Cemgil and H. J. Kappen and D. Barber",
-  title =        "A Generative Model for Music Transcription",
-  journal =      "IEEE Transactions on Audio, Speech and Language
-                 Processing",
-  volume =       "14",
-  number =       "2",
-  pages =        "679--694",
-  year =         "2006",
-}
-
-@inproceedings{Cevikalp+al-2008,
-    title = {Semi-Supervised Dimensionality Reduction Using Pairwise Equivalence Constraints},
-    author = {Hakan Cevikalp and Jakob J. Verbeek and Frédéric Jurie and Alexander Kläser},
-    booktitle = {VISAPP},
-    editor = {Alpesh Ranchordas and Helder Araújo},
-    pages = {489--496},
-    publisher = {INSTICC - Institute for Systems and Technologies of Information, Control and Communication},
-    url = {http://dblp.uni-trier.de/db/conf/visapp/visapp2008-1.html#CevikalpVJK08},
-    year = {2008},
-    biburl = {http://www.bibsonomy.org/bibtex/21afc498c02543e97ff5bd4f6b107e16e/dblp},
-    description = {dblp},
-    isbn = {978-989-8111-21-0},
-    date = {2008-04-07},
-    keywords = {dblp }
-}
-
-@InProceedings{CGY96,
-    author =       "Ingemar J. Cox and Joumana Ghosn and Peter N.
-                 Yianilos",
-  booktitle =    cvpr96,
-  title =        "Feature-Based Face Recognition Using
-                 Mixture-Distance",
-  pages =        "209--216",
-  year =         "1996",
-}
-
-@Article{CHAID-BVS-91,
-  author =       "D. Biggs and B. Ville and E. Suen",
-  title =        "A method of choosing multiway partitions for
-                 classification and decision trees",
-  journal =      "Journal of Applied Statistics",
-  volume =       "18",
-  number =       "1",
-  pages =        "49--62",
-  year =         "1991",
-}
-
-@InBook{CHAID-HK-82,
-  author =       "D. M. Hawkins and G. V. Kass",
-  booktitle =    "Topics in Applied Multivariate Analysis",
-  title =        "Automatic Interaction Detection",
-  publisher =    "Cambridge, Cambridge University Press",
-  pages =        "269--302",
-  year =         "1982",
-}
-
-@Article{CHAID-original-80,
-  author =       "G. V. Kass",
-  title =        "An Exploratory Technique for Investigating Large
-                 Quantities of Categorical Data",
-  journal =      "Applied Statistics",
-  volume =       "29",
-  number =       "2",
-  pages =        "119--127",
-  year =         "1980",
-}
-
-@InProceedings{Chapados2002,
-  author =       "N. Chapados and Y. Bengio and P. Vincent and J. Ghosn
-                 and C. Dugas and I. Takeuchi and L. Meng",
-  editor =       NIPS14ed,
-  booktitle =    NIPS14,
-  title =        "Estimating Car Insurance Premia: a Case Study in
-                 High-Dimensional Data Inference",
-  publisher =    "{MIT} Press",
-  address =      "Cambridge, MA",
-  pages =        "1369--1376",
-  year =         "2002",
-}
-
-@InProceedings{Chapados2002-short,
-  author =       "N. Chapados and Y. Bengio and P. Vincent and J. Ghosn
-                 and C. Dugas and I. Takeuchi and L. Meng",
-  booktitle =    NIPS14,
-  title =        "Estimating Car Insurance Premia: a Case Study in
-                 High-Dimensional Data Inference",
-  publisher =    "{MIT} Press",
-  year =         "2002",
-}
-
-@InProceedings{Chapelle+al-2003,
-  author =       "O. Chapelle and J. Weston and B. Sch{\"o}lkopf",
-  editor =       NIPS15ed,
-  booktitle =    NIPS15,
-  title =        "Cluster kernels for semi-supervised learning",
-  publisher =    "{MIT} Press",
-  address =      "Cambridge, MA",
-  pages =         {585--592},
-  year =         "2003",
-}
-
-@InProceedings{Chapelle-nips2003,
-  author =       "O. Chapelle and B. Sch{\"o}lkopf and J. Weston",
-  editor =       NIPS15ed,
-  booktitle =    NIPS15,
-  title =        "Semi-supervised learning through principal directions
-                 estimation",
-  publisher =    "{MIT} Press",
-  year =         "2003",
-}
-
-@InProceedings{Chapelle2001,
-  author =       "Olivier Chapelle and Jason Weston and L\'eon Bottou
-                 and Vladimir Vapnik",
-  editor =       NIPS13ed,
-  booktitle =    NIPS13,
-  title =        "Vicinal Risk Minimization",
-  pages =        "416--422",
-  year =         "2001",
-}
-
-@InProceedings{chapelle2001iin,
-  author =       "O. Chapelle and B. Sch{\"o}lkopf",
-  title =        "{Incorporating invariances in nonlinear support vector
-                 machines}",
-  editor =       NIPS14ed,
-  booktitle =    NIPS14,
-  volume =       "14",
-  year =         "2001",
-}
-
-@Article{Chapelle99,
-  author =       "O. Chapelle and P. Haffner and V. Vapnik",
-  title =        "{SVM}s for Histogram-Based Image Classification",
-  journal =      "IEEE Transactions on Neural Networks",
-  year =         "1999",
-  note =         "accepted, special issue on Support Vectors",
-}
-
-@Article{ChapelleVapnikBengio2001,
-  author =       "O. Chapelle and V. Vapnik and Y. Bengio",
-  title =        "Model Selection for Small-Sample Regression",
-  journal =      "Machine Learning Journal",
-  volume =       "48",
-  number =       "1",
-  pages =        "9--23",
-  year =         "2002",
-}
-
-@inproceedings{Willski-2002,
-  author =       "A.S. Willsky",
-  title =        "Multiresolution {Markov} models for signal and image processing",
-  booktitle =    "Proceedings of the IEEE",
-  volume =       "90",
-  number =       "8",
-  pages =        "1396--1458",
-  year =         "2002",
-}
-
-@Article{Felzenszwalb+Huttenlocher-2004,
-  author =       "Pedro F. Felzenszwalb and Daniel P. Huttenlocher",
-  title =        "Efficient Graph-Based Image Segmentation",
-  journal =      "Intl. Journal of Computer Vision",
-  volume =       "59",
-  number =       "2",
-  pages =        "167--181",
-  year =         "2004",
-}
-
-@inproceedings{Lombaert-2005,
-  author =       "Herve Lombaert and Yiyong Sun and Leo Grady and Chenyang Xu",
-  title =        "A Multilevel Banded Graph Cuts Method for Fast Image Segmentation",
-  booktitle =    ICCV05,
-  volume =       "1",
-  pages =        "259--265",
-  year =         "2005",
-}
-
-@Article{Boykov+Kolmogorov-2004,
-  author =       "Y. Boykov and V. Kolmogorov",
-  title =        "An experimental comparison of min-cut/max-flow algorithms for energy minimization in vision",
-  journal =      ieeetpami,
-  volume =       "26",
-  number =       "9",
-  pages =        "1124--1137",
-  year =         "2004",
-}
-
-@inproceedings{chapelleetal06,
-author = "Chapelle, O. and Chi, M. and Zien, A.",
-title = "A continuation method for semi-supervised {SVMs}",
-booktitle = ICML06,
-editor =    ICML06ed,
-publisher = ICML06publ,
-year = 2006,
-}
-
-@inproceedings{ChapelleO2005,
-   author = {Olivier Chapelle and Alexander Zien},
-   title = {Semi-Supervised Classification by Low Density Separation},
-   year = {2005},
-   pages = {57--64},
-   month = jan,
-   journal = {Proceedings of the Tenth International Workshop on Artificial Intelligence and Statistics (AISTATS 2005)},
-   editor = {Cowell, R. and Ghahramani, Z.},
-   booktitle = {Tenth International Workshop on Artificial Intelligence and Statistics},
-   location = {Barbados},
-}
-   %URL = {http://www.gatsby.ucl.ac.uk/aistats/aistats2005_eproc.pdf}
-
-@book{Chapelle-2006,
- author = {Olivier Chapelle and Bernhard Sch{\"o}lkopf and Alexander Zien},
- title =    "Semi-Supervised Learning",
- publisher =    "{MIT} Press",
- year =         "2006",
-}
-
-@TechReport{Charniak99,
-  author =       "Eugene Charniak",
-  title =        "A Maximum-Entropy-Inspired Parser",
-  number =       "CS-99-12",
-  institution =  "Brown University",
-  year =         "1999",
-  URL =          "citeseer.nj.nec.com/charniak99maximumentropyinspired.html",
-}
-
-@misc{Chatpatanasiri-2008,
-    author = {Ratthachat Chatpatanasiri},
-    title = {Spectral Methods for Linear and Non-Linear Semi-Supervised Dimensionality Reduction},
-    url = {http://www.citebase.org/abstract?id=oai:arXiv.org:0804.0924},
-    year = {2008},
-    note = {Submitted for publication},
-}
-
-@InProceedings{Chauvin89,
-  author =       "Y. Chauvin",
-  editor =       NIPS1ed,
-  booktitle =    NIPS1,
-  title =        "A Back-Propagation Algorithm with Optimal Use of
-                 Hidden Units",
-  publisher =    "Morgan Kaufmann, San Mateo",
-  address =      "Denver, CO",
-  pages =        "519--526",
-  year =         "1989",
-}
-
-@InProceedings{Chauvin90,
-  author =       "Y. Chauvin",
-  editor =       NIPS2ed,
-  booktitle =    NIPS2,
-  title =        "Dynamic behavior of constrained back-propagation
-                 networks",
-  publisher =    "Morgan Kaufmann",
-  address =      "Denver, CO",
-  pages =        "642--649",
-  year =         "1990",
-}
-
-@InProceedings{Cheeseman88,
-  author =       "P. Cheeseman and J. Kelly and M. Self and J. Stutz and
-                 W. Taylor and D. Freeman",
-  booktitle =    "Proceedings of the Fifth International Conference on
-                 Machine Learning",
-  title =        "{AutoClass}: {A} {Bayesian} Classification System",
-  address =      "The University of Michigan, Ann Arbor",
-  month =        jun,
-  year =         "1988",
-}
-
-@Article{Chelba-Jelinek-2000,
-  author =       "Ciprian Chelba and Frederick Jelinek",
-  title =        "Structured Language Modeling",
-  journal =      "Computer Speech and Language",
-  volume =       "14",
-  number =       "4",
-  pages =        "282--332",
-  year =         "2000",
-}
-
-@Article{Chen+Goodman99,
-  author =       "Stanley F. Chen and Joshua T. Goodman",
-  title =        "An Empirical Study of Smoothing Techniques for
-                 Language Modeling",
-  journal =      "Computer Speech and Language",
-  volume =       "13",
-  number =       "4",
-  pages =        "359--393",
-  year =         "1999",
-}
-
-@Article{Chen+Murray2003,
-  author =       "Hsin Chen and Alan F. Murray",
-  title =        "A Continuous Restricted {Boltzmann} Machine with an
-                 Implementable Training Algorithm",
-  journal =      "IEE Proceedings of Vision, Image and Signal
-                 Processing",
-  volume =       "150",
-  number =       "3",
-  pages =        "153--158",
-  year =         "2003",
-}
-
-@PhdThesis{chen95basispursuit,
-  author =       "S. Chen",
-  title =        "Basis Pursuit",
-  school =       "Department of Statistics, Stanford University",
-  year =         "1995",
-}
-
-@TechReport{Chen98,
-  author =       "Stanley F. Chen and Joshua T. Goodman",
-  title =        "An Empirical Study of Smoothing Techniques for
-                 Language Modeling",
-  number =       "TR-10-98",
-  institution =  "Computer Science Group, Harvard University",
-  year =         "1998",
-}
-
-@Article{ChenS2000,
-  author =       "Stanley F. Chen and Ronald Rosenfeld",
-  title =        "A Survey of Smoothing Techniques for {ME} Models",
-  journal =      "IEEE Transactions on Speech and Audio Processing",
-  volume =       "8",
-  number =       "1",
-  month =        jan,
-  year =         "2000",
-}
-
-@techreport{Chen+Kotani-2005,
-  author =      "Chen, Fan and Kotani, Kazunori",
-  title =       "Facial Expression Recognition by Supervised {ICA} with Selective Prior",
-  ISSN =        "09135685",
-  institution = "The Institute of Electronics, Information and Communication Engineers",
-  year =        "2005",
-  number =      "462",
-  pages =       "27--32",
-  URL =         "http://ci.nii.ac.jp/naid/110004064718/en/",
-}
-
-@Article{ChenX1989,
-  author={Chen, X. R. and Krishnaiah, P. R. and Liang, W. W.},
-  title={Estimation of multivariate binary density using orthogonal functions},
-  journal={Journal of Multivariate Analysis},
-  year=1989,
-  volume={31},
-  number={2},
-  pages={178--186},
-  month=nov,
-}
-
-@InProceedings{Chigier88,
-  author =       "B. Chigier and R. A. Brennan",
-  booktitle =    icassp,
-  title =        "Broad Class Network Generation Using a Combination of
-                 Rules and Statistics for Speaker Independent Continuous
-                 Speech",
-  address =      "New York, NY",
-  pages =        "449--452",
-  year =         "1988",
-}
-
-@InCollection{Chipman-NIPS2006,
-  author =       "H. A. Chipman and E. I. George and R. E. McCulloch",
-  editor =       NIPS19ed,
-  booktitle =    NIPS19,
-  title =        "Bayesian Ensemble Learning",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  year =         "2007",
-}
-
-@article{Chipman-2008,
-  author =       "H. A. Chipman and E. I. George and R. E. McCulloch",
-  title =        "Bayesian Ensemble Learning",
-  journal = "Annals of Applied Statistics",
-  year =         "2008",
-  editors =      "under revision",
-}
-
-@InProceedings{ChopraS2005,
-  author =       "Sumit Chopra and Raia Hadsell and Yann {LeCun}",
-  booktitle =    cvpr05,
-  title =        "Learning a Similarity Metric Discriminatively, with
-                 Application to Face Verification",
-  publisher =    "IEEE Press",
-  year =         "2005",
-  original =     "orig/chopra-05.ps.gz",
-}
-
-@InProceedings{Choueka-1998,
-  author =       "Y. Choueka",
-  booktitle =    "RIAO 88, User-oriented Content-based Text and Image
-                 Handling",
-  title =        "Looking for needles in a haystack",
-  volume =       "1",
-  pages =        "609--623",
-  year =         "1988",
-}
-
-@Article{Chow62,
-  author =       "C. K. Chow",
-  title =        "A recognition method using neighbor dependence",
-  journal =      "IRE Trans. Elec. Comp.",
-  volume =       "EC-11",
-  pages =        "683--690",
-  month =        oct,
-  year =         "1962",
-}
-
-@InProceedings{Chrisman92AAAI,
-  author =       "Lonnie Chrisman",
-  booktitle =    AAAI-92,
-  title =        "Reinforcement Learning with Perceptual Aliasing: The
-                 Perceptual Distinctions Approach",
-  pages =        "183--188",
-  year =         "1992",
-}
-
-@InProceedings{Chung+al-1998,
-  author =       "Yi-Ming Chung and William M. Pottenger and Bruce R.
-                 Schatz",
-  booktitle =    "DL '98: Proceedings of the third ACM conference on
-                 Digital libraries",
-  title =        "Automatic subject indexing using an associative neural
-                 network",
-  publisher =    "ACM Press",
-  address =      "New York, NY, USA",
-  pages =        "59--68",
-  year =         "1998",
-  ISBN =         "0-89791-965-3",
-  location =     "Pittsburgh, Pennsylvania, United States",
-}
-
-@InProceedings{Chung-97,
-  author =       "F. Chung",
-  booktitle =    "{CBMS} Regional Conference Series",
-  title =        "Spectral graph theory",
-  volume =       "92",
-  publisher =    "American Mathematical Society",
-  year =         "1997",
-}
-
-@Article{Churchill89,
-  author =       "G. A. Churchill",
-  title =        "A stochastic model for heterogeneous {DNA} sequences",
-  journal =      "Bull. Mathematical Biology",
-  volume =       "51",
-  pages =        "79--94",
-  year =         "1989",
-}
-
-@Book{Chvatal83,
-  author =       "V. Chv\'atal",
-  title =        "Linear Programming",
-  publisher =    "W. H. Freeman",
-  address =      "",
-  year =         "1983",
-}
-
-@Article{Cleeremans89,
-  author =       "A. Cleeremans and D. Servan-Schreiber and J. L.
-                 McClelland",
-  title =        "Finite State Automata and Simple Recurrent Networks",
-  journal =      nc,
-  volume =       "1",
-  pages =        "372--381",
-  year =         "1989",
-}
-
-@InCollection{Clifford-1990,
-  author = {Peter Clifford},
-  title = {Markov random Fields in statistics}, 
-  editor = {Geoffrey Grimmett and Dominic Welsh}, 
-  booktitle = {Disorder in Physical Systems: A Volume in Honour
-of John M. Hammersley}, 
-  pages = {19--32}, 
-  publisher = {Oxford University Press}, 
-  year = 1990,
-}
-
-@Book{CLM,
-  author =       "J. Y. Campbell and A. W. Lo and A. C. MacKinlay",
-  title =        "The Econometrics of Financial Markets",
-  publisher =    "Princeton University Press",
-  address =      "Princeton",
-  year =         "1997",
-}
-
-@Book{CND04,
-  author =       "{Congr\'egation de Notre-Dame}",
-  title =        "La cuisine raisonnée",
-  publisher =    "Fides",
-  year =         "2004",
-  ISBN =         "2-7621-2083-7",
-}
-
-@InProceedings{Cloutier96,
-  author =       "J. Cloutier and E. Cosatto and S. Pigeon and F. R.
-                 Boyer and P. Y. Simard",
-  booktitle =    "Fifth International Conference on Microelectronics for
-                 Neural Networks and Fuzzy Systems",
-  title =        "{VIP}: an {FPGA}-based processor for image processing
-                 and neural networks",
-  year =         "1996",
-  note =         "submitted",
-}
-
-@Manual{CMFortran,
-  author =       "",
-  key =          "TMC",
-  title =        "{CM} Fortran. Programming Guide",
-  organization = "Thinking Machines Corporation",
-  address =      "Cambridge, MA",
-  edition =      "1.1",
-  month =        jan,
-  year =         "1991",
-}
-
-@Article{Cohen83,
-  author =       "M. A. Cohen and S. Grossberg",
-  title =        "Absolute Stability of Global Pattern Formation and
-                 Parallel Memory Storage by Competitive Neural
-                 Networks",
-  journal =      ieeesmc,
-  volume =       "13",
-  pages =        "815--826",
-  year =         "1983",
-}
-
-@Article{Cohen86,
-  author =       "M. S. Cohen",
-  title =        "Design of a New Medium for Volume Holographic
-                 Information Processing",
-  journal =      applopt,
-  volume =       "25",
-  pages =        "2228--2294",
-  year =         "1986",
-}
-
-@Article{Cohen89,
-  author =       "J. R. Cohen",
-  title =        "Application of an auditory model to speech
-                 recognition",
-  journal =      "Journal of the Acoustical Society of America",
-  volume =       "85",
-  number =       "6",
-  pages =        "2623--2629",
-  year =         "1989",
-}
-
-@PhdThesis{Cohn-PhD,
-  author =       "D. Cohn",
-  title =        "Separating Formal Bounds from Practical Performance in
-                 Learning Systems",
-  school =       "University of Washington",
-  year =         "1992",
-}
-
-@InProceedings{Cohn95,
-  author =       "David Cohn and Zoubin Ghahramani and Michael I.
-                 Jordan",
-  editor =       NIPS7ed,
-  booktitle =    NIPS7,
-  title =        "Active learning with statistical models",
-  publisher =    "Cambridge MA: MIT Press",
-  year =         "1995",
-  pages = {705--712}
-}
-
-@InProceedings{Cohn95-small,
-  author =       "David Cohn and Zoubin Ghahramani and Michael I.
-                 Jordan",
-  editor =       NIPS7ed,
-  booktitle =    "Advances in NIPS 7",
-  title =        "Active learning with statistical models",
-  publisher =    "Cambridge MA: MIT Press",
-  year =         "1995",
-}
-
-@InProceedings{Cohn95-short,
-  author =       "D. Cohn and Z. Ghahramani and M.I.
-                 Jordan",
-  booktitle =    "Adv. Neural Inf. Proc. Sys. 7",
-  title =        "Active learning with statistical models",
-  year =         "1995",
-  pages = {705--712}
-}
-
-@InProceedings{Cole+Hou88,
-  author =       "R. A. Cole and L. Hou",
-  booktitle =    icassp,
-  title =        "Segmentation and Broad Classification of Continuous
-                 Speech",
-  address =      "New York, NY",
-  pages =        "453--452",
-  year =         "1988",
-}
-
-@Book{Cole96,
-  author =       "R. A. Cole and J. Mariani and H. Uszkoreit and A.
-                 Zaenen and V. Zue",
-  title =        "Survey of the State of the Art in Human Language
-                 Technology",
-  publisher =    "Cambridge University Press",
-  address =      "http://www.cse.ogi.edu/CSLU/HLTsurvey/HLTsurvey.html",
-  year =         "1996",
-}
-
-@TechReport{Coleman+Wu-1994,
-  author =       "Thomas F. Coleman and Zhijun Wu",
-  title =        "Parallel continuation-based global optimization for
-                 molecular conformation and protein folding",
-  institution =  "Cornell University, Dept. of Computer Science",
-  year =         "1994",
-}
-
-@TechReport{Coleman+Wu-1994-short,
-  author =       "T.F. Coleman and Z. Wu",
-  title =        "Parallel continuation-based global optimization for
-                 molecular conformation and protein folding",
-  institution =  "Cornell University, Dept. of Computer Science",
-  year =         "1994",
-}
-
-@TechReport{Collins89,
-  author =       "E. Collins and S. Ghosh and C. Scofield",
-  title =        "An application of a multiple neural network learning
-                 system to emulation of mortgage underwriting
-                 judgements",
-  institution =  "Nestor Inc.",
-  address =      "Providence, RI",
-  year =         "1989",
-}
-
-@InProceedings{Collins96,
-  author =       "M. Collins",
-  booktitle =    "34th Annual Meeting of the {ACL}",
-  title =        "A new statistical parser based on bigram lexical
-                 dependencies",
-  pages =        "184--191",
-  year =         "1996",
-}
-
-@InProceedings{Collins97,
-  author =       "M. Collins",
-  booktitle =    "35th Annual Meeting of the {ACL}",
-  title =        "Three generative, lexicalized models for statistical
-                 parsing",
-  address =      "Madrid, Spain",
-  pages =        "16--23",
-  year =         "1997",
-}
-
-@PhdThesis{Collins99,
-  author =       "M. Collins",
-  title =        "Head-driven statistical models for natural language
-                 parsing",
-  school =       "University of Pennsylvania",
-  year =         "1999",
-}
-
-@InProceedings{Collobert-2006,
-  author =       "R. Collobert and F. Sinz and J. Weston and L. Bottou",
-  booktitle =    "Proceedings of the 23rd International Conference on
-                 Machine Learning",
-  title =        "Trading Convexity for Scalability",
-  pages =        "",
-  year =         "2006",
-}
-
-@PhdThesis{Collobert04,
-  author =       "R. Collobert",
-  title =        "Large Scale Machine Learning",
-  school =       "Universit\'e de Paris VI, LIP6",
-  year =         "2004",
-}
-
-@Article{Collobert2002,
-  author =       "R. Collobert and S. Bengio and Y. Bengio",
-  title =        "Parallel Mixture of {SVM}s for Very Large Scale
-                 Problems",
-  journal =      "Neural Computation",
-  volume =       "14",
-  number =       "5",
-  pages =        "1105--1114",
-  year =         "2002",
-}
-
-@InProceedings{Collobert2004,
-  author =       "Ronan Collobert and Samy Bengio",
-  booktitle =    ICML04,
-  editor =       ICML04ed,
-  publisher =    ICML04publ,
-  title =        "Links between perceptrons, {MLP}s and {SVM}s",
-  address =      "New York, NY, USA",
-  year =         "2004",
-  location =     "Banff, Alberta, Canada",
-  isbn =         "1-58113-828-5",
-  pages =        "23",
-  location =     "Banff, Alberta, Canada",
-  doi =          "http://doi.acm.org/10.1145/1015330.1015415",
-}
-
-@InProceedings{CollobertR2008,
-  author =       "Ronan Collobert and Jason Weston",
-  booktitle =    ICML08,
-  editor =       ICML08ed,
-  publisher =    ICML08publ,
-  title =        "A Unified Architecture for Natural Language
-                 Processing: Deep Neural Networks with Multitask
-                 Learning",
-  year =         "2008",
-  pages =       "160-167",
-}
-  %url =          "http://www.kyb.tuebingen.mpg.de/bs/people/weston/papers/unified\-nlp.pdf",
-
-@InProceedings{CollobertR2008-small,
-  author =       "R. Collobert and J. Weston",
-  booktitle =    "ICML 2008",
-  title =        "A Unified Architecture for Natural Language
-                 Processing: Deep Neural Networks with Multitask
-                 Learning",
-  year =         "2008",
-}
-
-@InProceedings{CollobertR2008-short,
-  author =       "R. Collobert and J. Weston",
-  booktitle =    "Int. Conf. Mach. Learn. 2008",
-  title =        "A Unified Architecture for Natural Language
-                 Processing: Deep Neural Networks with Multitask
-                 Learning",
-  pages =       "160-167",
-  year =         "2008",
-}
-
-@Article{Comon94,
-  author =       "Pierre Comon",
-  title =        "Independent component analysis - a new concept?",
-  journal =      "Signal Processing",
-  volume =       "36",
-  pages =        "287--314",
-  year =         "1994",
-}
-
-@InProceedings{ConfAI:Grove:linprog,
-  author =       "Adam J. Grove and Dale Schuurmans",
-  booktitle =    "Proceedings of the Fifteenth National Conference on
-                 Artificial Intelligence",
-  title =        "Boosting in the limit: Maximizing the margin of
-                 learned ensembles",
-  year =         "1998",
-}
-
-@InProceedings{ConfAI:Maclin:adaboost,
-  author =       "Richard Maclin and David Opitz",
-  booktitle =    "Proceedings of the Fourteenth National Conference on
-                 Artificial Intelligenc",
-  title =        "An empirical evaluation of Bagging and Boosting",
-  pages =        "546--551",
-  year =         "1997",
-}
-
-@InProceedings{ConfLT:Freund:gametheorie,
-  author =       "Yoav Freund and Robert E. Schapire",
-  booktitle =    "Proceedings of the Ninth Annual Conference on
-                 Computational Learning Theory",
-  title =        "Game theory, on-line prediction and Boosting",
-  pages =        "325--332",
-  year =         "1996",
-}
-
-@InProceedings{ConfML:Dietterich:adaboost+prun,
-  author =       "D. Margineantu and Thomas G. Dietterich",
-  booktitle =    "Machine Learning: Proceedings of Fourteenth
-                 International Conference",
-  title =        "Pruning Adaptive Boosting",
-  publisher =    "ACM",
-  pages =        "211--218",
-  year =         "1997",
-}
-
-@InProceedings{ConfML:Freund:AdaBoostCompar,
-  author =       "Yoav Freund and Robert E. Schapire",
-  booktitle =    "Machine Learning: Proceedings of Thirteenth
-                 International Conference",
-  title =        "Experiments with a new Boosting algorithm",
-  publisher =    "ACM",
-  address =      "USA",
-  pages =        "148--156",
-  year =         "1996",
-}
-
-@InProceedings{ConfML:Freund:margins,
-  author =       "Robert E. Schapire and Yoav Freund and Peter Bartlett
-                 and Wee Sun Lee",
-  booktitle =    "Machine Learning: Proceedings of Fourteenth
-                 International Conference",
-  title =        "Boosting the margin: {A} new explanation for the
-                 effectiveness of voting methods",
-  pages =        "322--330",
-  year =         "1997",
-}
-
-@InProceedings{ConfML:Quinlan:AdaBoost-C45,
-  author =       "J. Ross Quinlan",
-  booktitle =    "Machine Learning: Proceedings of the fourteenth
-                 International Conference",
-  title =        "Bagging, Boosting and {C4.5}",
-  pages =        "725--730",
-  year =         "1996",
-}
-
-@InProceedings{ConfML:Schapire:outputcodes,
-  author =       "Robert E. Schapire",
-  booktitle =    "Machine Learning: Proceedings of the Fourteenth
-                 International Conference",
-  title =        "Using output codes to boost multiclass learning
-                 problems",
-  year =         "1997",
-}
-
-@Article{Coolen88,
-  author =       "A. C. C. Coolen and C. C. A. M. Gielen",
-  title =        "Delays in Neural Networks",
-  journal =      eul,
-  volume =       "7",
-  pages =        "281--285",
-  year =         "1988",
-}
-
-@Book{cooper+meyer-1960,
-  author =       "Grosvenor Cooper And Leonard B. Meyer",
-  title =        "{The Rhythmic Structure of Music}",
-  publisher =    "The Univ. of Chicago Press",
-  address =      "Chicago",
-  year =         "1960",
-  keywords =     "describe, music",
-  origin =       "Kielian-Gilbert",
-  own =          "IU Library",
-}
-
-@InCollection{Cooper73,
-  author =       "L. N. Cooper",
-  editor =       "B. Lundqvist and S. Lundqvist",
-  booktitle =    "Collective Properties of Physical Systems",
-  title =        "A Possible Organization of Animal Memory and
-                 Learning",
-  publisher =    "Academic Press",
-  address =      "New York",
-  pages =        "252--264",
-  year =         "1973",
-}
-
-@InCollection{Cooper87,
-  author =       "C. L. Scofield and D. L. Reilly and C. Elbaum and L.
-                 N. Cooper",
-  booktitle =    "Conference on Neural Information Processing Systems -
-                 Natural and Synthetic",
-  title =        "Pattern class degeneracy in an unrestricted storage
-                 density memory",
-  publisher =    "IEEE",
-  year =         "1987",
-}
-
-@Article{Corana87,
-  author =       "A. Corana and M. Marchesi and C. Martini and S.
-                 Ridella",
-  title =        "Minimizing Multimodal Functions of Continuous
-                 Variables with the Simulated Annealing Algorithm",
-  journal =      acmtms,
-  volume =       "13",
-  number =       "13",
-  pages =        "262--280",
-  month =        sep,
-  year =         "1987",
-  OPTnote =      "",
-}
-
-@Article{Corana87a,
-  author =       "A. Corana and M. Marchesi and C. Martini and S.
-                 Ridella",
-  title =        "Minimizing Multimodal Functions of Continuous
-                 Variables with the Simulated Annealing Algorithm",
-  journal =      acmtms,
-  volume =       "13",
-  number =       "13",
-  pages =        "262--280",
-  month =        sep,
-  year =         "1987",
-}
-
-@Article{Cortes04,
-  author =       "C. Cortes and P. Haffner and M. Mohri",
-  title =        "Rational Kernels: Theory and Algorithms",
-  journal =      jmlr,
-  volume =       "5",
-  pages =        "1035--1062",
-  year =         "2004",
-  OPTnumber =    "",
-}
-
-@Article{Cortes87,
-  author =       "C. Cortes and A. Krogh and J. A. Hertz",
-  title =        "Hierarchical Associative Networks",
-  journal =      jpa,
-  volume =       "20",
-  pages =        "4449--4455",
-  year =         "1987",
-}
-
-@InProceedings{Cortes89,
-  author =       "C. Cortes and J. A. Hertz",
-  booktitle =    ijcnn,
-  title =        "A Network System for Image Segmentation",
-  volume =       "1",
-  publisher =    "IEEE, New York",
-  address =      "Washington 1989",
-  pages =        "121--127",
-  year =         "1989",
-}
-
-@Article{Cortes95,
-  author =       "Corinna Cortes and Vladimir Vapnik",
-  title =        "Support Vector Networks",
-  journal =      "Machine Learning",
-  volume =       "20",
-  pages =        "273--297",
-  year =         "1995",
-}
-
-@InProceedings{Cortesetal95a,
-  author =       "C. Cortes and H. Drucker and D. Hoover and V. Vapnik",
-  booktitle =    "Proc. 1st Intl. Conf. on Knowledge Discovery and Data
-                 Mining",
-  title =        "Capacity and Complexity Control in Predicting the
-                 Spread Between Borrowing and Lending Interest Rates",
-  address =      "Montreal (Canada)",
-  pages =        "51--56",
-  year =         "1995",
-}
-
-@InProceedings{Cortesetal95b,
-  author =       "C. Cortes and L. D. Jackel and W. P. Chiang",
-  booktitle =    "Proc. 1st Intl. Conf. on Knowledge Discovery and Data
-                 Mining",
-  title =        "Limits on Learning Machine Accuracy Imposed by Data
-                 Quality",
-  address =      "Montreal (Canada)",
-  pages =        "57--62",
-  year =         "1995",
-}
-
-@InProceedings{Cosi-92,
-  author =       "P. Cosi and P. Frasconi and M. Gori and N. Griggio",
-  booktitle =    "Proc. of the International Conference on Spoken
-                 Language",
-  title =        "Phonetic Recognition Experiments with Recurrent Neural
-                 Networks",
-  address =      "Banff (Canada)",
-  pages =        "1335--1338",
-  month =        oct,
-  year =         "1992",
-}
-
-@InProceedings{Cosnard+al-1991,
-  author =       "M. Cosnard and J. C. Mignot and H. Paugam-Moisy",
-  booktitle =    "Proceedings of the Second International Specialist
-                 Seminar on the Design and Application of Parallel
-                 Digital Processors, 1991",
-  title =        "Implementations of Multilayer Neural Networks on
-                 Parallel Architectures",
-  address =      "Lisbon",
-  pages =        "43--47",
-  month =        apr,
-  year =         "1991",
-}
-
-@Article{Cosslett85,
-  author =       "S. R. Cosslett and L-F. Lee",
-  title =        "Serial correlation in discrete variable models",
-  journal =      "Journal of Econometrics",
-  volume =       "27",
-  pages =        "79--97",
-  year =         "1985",
-}
-
-@Article{Cottrell86,
-  author =       "M. Cottrell and J. C. Fort",
-  title =        "A Stochastic Model of Retinotopy: {A} Self Organizing
-                 Process",
-  journal =      biocyb,
-  volume =       "53",
-  pages =        "405--411",
-  year =         "1986",
-}
-
-@InProceedings{Cottrell87,
-  author =       "Garrison W. Cottrell and Paul Munro and David Zipser",
-  booktitle =    "Ninth Annual Conference of the Cognitive Science
-                 Society",
-  title =        "Learning Internal Representations from Gray-Scale
-                 Images: An Example of Extensional Programming",
-  publisher =    "Lawrence Erlbaum, Hillsdale",
-  address =      "Seattle 1987",
-  pages =        "462--473",
-  year =         "1987",
-}
-
-@Book{Courant51,
-  author =       "A. Courant and D. Hilbert",
-  title =        "Methods of Mathematical Physics",
-  publisher =    "Wiley Interscience, New York",
-  year =         "1951",
-}
-
-@Article{Cover65,
-  author =       "T. M. Cover",
-  title =        "Geometrical and Statistical Properties of Systems of
-                 Linear Inequalities with Applications in Pattern
-                 Recognition",
-  journal =      ieeetec,
-  volume =       "14",
-  pages =        "326--334",
-  year =         "1965",
-}
-
-@Article{CoverHart67,
-  author =       "T. M. Cover and P. E. Hart",
-  title =        "Nearest Neighbor Pattern Classification",
-  journal =      "IEEE Transactions on Information Theory",
-  volume =       "13",
-  number =       "1",
-  pages =        "21--27",
-  year =         "1967",
-}
-
-@Article{Cowan88a,
-  author =       "J. D. Cowan and D. H. Sharp",
-  title =        "Neural Nets and Artificial Intelligence",
-  journal =      daed,
-  volume =       "117",
-  pages =        "85--121",
-  year =         "1988",
-}
-
-@Article{Cowan88b,
-  author =       "J. D. Cowan and D. H. Sharp",
-  title =        "Neural Nets",
-  journal =      qrb,
-  volume =       "21",
-  pages =        "365--427",
-  year =         "1988",
-}
-
-@InProceedings{Cox+Bridle89,
-  author =       "S. Cox and J. S. Bridle",
-  booktitle =    "Proc. IEEE Conf. on Acoustics, Speech and Signal
-                 Processing",
-  title =        "Unsupervised speaker adaptation by probabilistic
-                 spectrum fitting",
-  organization = "British Telecom and RSRE",
-  year =         "1989",
-}
-
-@InProceedings{Cox+Bridle90,
-  author =       "S. Cox and J. S. Bridle",
-  booktitle =    "Proc. IEEE Conf. on Acoustics, Speech and Signal
-                 Processing",
-  title =        "Simultaneous Speaker Normalisation and Utterance
-                 labelling Using {Bayesian}/Neural Net Techniques",
-  organization = "British Telecom and RSRE",
-  year =         "1990",
-}
-
-@Book{CoxCox94,
-  author =       "Trevor F. Cox and Micheal {A. A}. Cox",
-  title =        "Multidimensional Scaling",
-  publisher =    "Chapman \& Hall",
-  address =      "London",
-  year =         "1994",
-}
-
-@Book{Cox+Cox-2000,
-  author =       "T. Cox and M. Cox",
-  title =        "Multidimensional Scaling",
-  publisher =    "Chapman \& Hall",
-  edition =      2,
-  address =      "London",
-  year =         "2000",
-}
-
-@InProceedings{Cozman2003,
-  author =       "F. Cozman and I. Cohen and M. Cirelo",
-  booktitle =    ICML03,
-  editor =       ICML03ed,
-  publisher =    ICML03publ,
-  title =        "Semi-Supervised Learning of Mixture Models",
-  year =         "2003",
-}
-
-@Article{Cragg54,
-  author =       "B. G. Cragg and H. N. V. Temperley",
-  title =        "The Organization of Neurones: {A} Cooperative
-                 Analogy",
-  journal =      EEGCN,
-  volume =       "6",
-  pages =        "85--92",
-  year =         "1954",
-}
-
-@Article{Cragg55,
-  author =       "B. G. Cragg and H. N. V. Temperley",
-  title =        "Memory: The Analogy with Ferromagnetic Hysteresis",
-  journal =      brain,
-  volume =       "78 II",
-  pages =        "304--316",
-  year =         "1955",
-}
-
-@Article{Craven+Wahba79,
-  author =       "P. Craven and G. Wahba",
-  title =        "Smoothing noisy data with spline functions",
-  journal =      "Numerical Mathematics",
-  volume =       "31",
-  pages =        "377--403",
-  year =         "1979",
-}
-
-@Article{Crick89,
-  author =       "F. Crick",
-  title =        "The Recent Excitement About Neural Networks",
-  journal =      nature,
-  volume =       "337",
-  pages =        "129--132",
-  year =         "1989",
-}
-
-@Article{Crisanti86,
-  author =       "A. Crisanti and D. J. Amit and H. Gutfreund",
-  title =        "Saturation Level of the Hopfield Model for Neural
-                 Network",
-  journal =      eul,
-  volume =       "2",
-  pages =        "337--341",
-  year =         "1986",
-}
-
-@Article{Crisanti87,
-  author =       "A. Crisanti and H. Sompolinsky",
-  title =        "Dynamics of Spin Systems with Randomly Asymmetric
-                 Bonds: Langevin Dynamics and a Spherical Model",
-  journal =      prA,
-  volume =       "36",
-  pages =        "4922--4939",
-  year =         "1987",
-}
-
-@Book{Cristianini+Shawe-Taylor-2000,
-  author =       "Nello Cristianini and John Shawe-Taylor",
-  title =        "An Introduction to Support Vector Machines and other
-                 kernel-based learning methods",
-  publisher =    "Cambridge University Press",
-  address =      "Cambridge, UK",
-  year =         "2000",
-}
-
-@InProceedings{Cristianini-2002,
-  author =       "N. Cristianini and J. Shawe-Taylor and J. Kandola",
-  editor =       NIPS14ed,
-  booktitle =    NIPS14,
-  title =        "Spectral Kernel Methods for Clustering",
-  publisher =    "{MIT} Press",
-  address =      "Cambridge, MA",
-  year =         "2002",
-}
-
-@InProceedings{Cristianini02,
-  author =       "N. Cristianini and J. Shawe-Taylor and A. Elisseeff
-                 and J. Kandola",
-  title =        "On Kernel-Target Alignment",
-  editor =       NIPS14ed,
-  booktitle =    NIPS14,
-  volume =       "14",
-  pages =        "367--373",
-  year =         "2002",
-}
-
-@InProceedings{Cristianini2002,
-  author =       "N. Cristianini and J. Shawe-Taylor and J. Kandola",
-  editor =       NIPS14ed,
-  booktitle =    NIPS14,
-  title =        "Spectral Kernel Methods for Clustering",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  year =         "2002",
-  original =     "orig/AA16.ps",
-}
-
-@Article{cucker+grigoriev99,
-  author =       "Felipe Cucker and Dima Grigoriev",
-  title =        "Complexity Lower Bounds for Approximation Algebraic
-                 Computation Trees",
-  journal =      "Journal of Complexity",
-  volume =       "15",
-  number =       "4",
-  pages =        "499--512",
-  year =         "1999",
-}
-
-@TechReport{Cybenko88,
-  author =       "G. Cybenko",
-  title =        "Continuous Valued Neural Networks with Two Hidden
-                 Layers Are Sufficient",
-  institution =  "Department of Computer Science, Tufts University",
-  address =      "Medford, MA",
-  year =         "1988",
-}
-
-@Article{Cybenko89,
-  author =       "G. Cybenko",
-  title =        "Approximation by Superpositions of a Sigmoidal
-                 Function",
-  journal =      mcss,
-  volume =       "2",
-  pages =        "303--314",
-  year =         "1989",
-}
-
-@InProceedings{Dahmen2000,
-  author =       "J. Dahmen and D. Keysers and M. Pitz and H. Ney",
-  booktitle =    "22nd Symposium of the German Association for Pattern
-                 Recognition",
-  title =        "Structured covariance matrices for statistical image
-                 object recognition",
-  address =      "Kiel, Germany",
-  year =         "2000",
-}
-
-@InProceedings{Dai95,
-  author =       "H. Dai and J. M. Lina and B. Goulard and J. W. Thomson
-                 and C. K. Scott",
-  booktitle =    "1995 Robotic and Knowledge Based Sytems Workshop",
-  title =        "An Expert Diagnostic System Introducing Wavelets
-                 Analysis and Neural Network",
-  address =      "St. Hubert, Canada",
-  pages =        "",
-  year =         "1995",
-}
-
-@InProceedings{darken-moody91,
-  author =       "Christian Darken and John Moody",
-  editor =       NIPS3ed,
-  booktitle =    NIPS3,
-  title =        "Note on learning rate schedules for stochastic
-                 optimization",
-  publisher =    "Morgan Kaufmann, Palo Alto",
-  address =      "Denver, CO",
-  pages =        "832--838",
-  year =         "1991",
-}
-
-@Article{DarrochJ1972,
-  author =       "J. N. Darroch and D. Ratcliff",
-  title =        "Generalized iterative scaling for log-linear models",
-  journal =      "Annals of Mathematical Statistics",
-  number =       "43",
-  pages =        "1470--1480",
-  year =         "1972",
-}
-
-@InProceedings{Das-nips93,
-  author =       "S. Das and C. L. Giles and G. Z. Sun",
-  editor =       NIPS5ed,
-  booktitle =    NIPS5,
-  title =        "Using Prior Knowledge in an {NNPDA} to Learn
-                 Context-Free Languages",
-  publisher =    "Morgan Kaufmann",
-  address =      "San Mateo CA",
-  year =         "1993",
-}
-
-@InProceedings{Das-nips94,
-  author =       "S. Das and M. C. Mozer",
-  editor =       NIPS6ed,
-  booktitle =    NIPS6,
-  title =        "A Unified Gradient-Descent/Clustering Architecture for
-                 Finite State Machine Induction",
-  publisher =    "Morgan Kaufmann",
-  year =         "1994",
-}
-
-@Article{daubechies90,
-  author =       "Ingrid Daubechies",
-  title =        "The Wavelet Transform, Time-Frequency Localization and
-                 Signal Analysis",
-  journal =      "IEEE Transaction on Information Theory",
-  volume =       "36",
-  number =       "5",
-  pages =        "961--1005",
-  month =        sep,
-  year =         "1990",
-}
-
-@article{daume09searn,
-  author =       {Hal {Daum\'e III} and John Langford and Daniel Marcu},
-  title =        {Search-based Structured Prediction},
-  year =         {2009},
-  booktitle =    {Machine Learning Journal},
-}
-
-@InProceedings{Davis89,
-  author =       "L. Davis",
-  editor =       "J. D. Schaffer",
-  booktitle =    "Proceedings of the Third International Conference on
-                 Genetic Algorithms",
-  title =        "Mapping neural networks into classifier systems",
-  publisher =    "Morgan Kaufmann, San Mateo",
-  address =      "Arlington 1989",
-  pages =        "375--378",
-  year =         "1989",
-}
-
-@Article{davis94adaptive,
-  author =       "G. Davis and S. Mallat and Z. Zhang",
-  title =        "Adaptive time-frequency decompositions",
-  journal =      "Optical Engineering",
-  volume =       "33",
-  number =       "7",
-  pages =        "2183--2191",
-  month =        jul,
-  year =         "1994",
-}
-
-@InProceedings{Dayan93,
-  author =       "P. Dayan and G. E. Hinton",
-  editor =       NIPS5ed,
-  booktitle =    NIPS5,
-  title =        "Feudal Reinforcement Learning",
-  publisher =    "Morgan Kaufmann",
-  address =      "San Mateo, CA",
-  year =         "1993",
-}
-
-@Article{Dayan95,
-  author =       "Peter Dayan and Geoffrey E. Hinton and Radford Neal and
-                 Rich Zemel",
-  title =        "The {Helmholtz} machine",
-  journal =      "Neural Computation",
-  volume =       "7",
-  pages =        "889--904",
-  year =         "1995",
-}
-
-@inproceedings{debiecristianini03,
-author = "{de Bie}, T. and Cristianini, N.",
-title = "Convex methods for transduction",
-editor = NIPS16ed,
-booktitle = NIPS16,
-year = 2003,
-}
-
-@article{debiecristianini06,
-author = "{de Bie}, T. and Cristianini, N.",
-title = "Fast {SDP} relaxations of graph cut 
-clustering, transduction, and other combinatorial problems",
-journal = jmlr,
-volume = 7,
-year = 2006,
-}
-
-
-@TechReport{deRidder+Duin-2002,
-    author =       {Dick {de Ridder} and Robert P. W. Duin},
-    title =        {Locally linear embedding for classification},
-    number =       {PH-2002-01},
-    institution =  {Pattern Recognition Group, Dept. of Imaging Science and Technology,
-        Delft University of Technology},
-    address =      {Delft, The Netherlands},
-    year =         2002,
-}
-
-@inproceedings{deRidder+al-2003,
-    author    = {Dick {de Ridder} and Olga Kouropteva and Oleg Okun and Matti Pietik{\"a}inen and Robert P. W. Duin},
-    title     = {Supervised Locally Linear Embedding},
-    booktitle = {ICANN},
-    year      = {2003},
-    pages     = {333-341},
-    ee        = {http://springerlink.metapress.com/openurl.asp?genre=article&issn=0302-9743&volume=2714&spage=333},
-    bibsource = {DBLP, http://dblp.uni-trier.de}
-}
-
-@InProceedings{debollivier-gallinari-thiria-90,
-  author =       "M. deBollivier and P. Gallinari and S. Thiria",
-  booktitle =    "Proc. of the International Neural Network Conference
-                 90",
-  title =        "Multi-module neural networks for classification",
-  address =      "Paris",
-  pages =        "777--780",
-  year =         "1990",
-}
-
-@Article{Decoste-2002,
-  author =       "Dennis Decoste and Bernhard Sch{\"o}lkopf",
-  title =        "Training invariant support vector machines",
-  journal =      "Machine Learning",
-  volume =       "46",
-  pages =        "161--190",
-  year =         "2002",
-}
-
-@Article{Deerwester90,
-  author =       "S. Deerwester and S. T. Dumais and G. W. Furnas and T.
-                 K. Landauer and R. Harshman",
-  title =        "Indexing by latent semantic analysis",
-  journal =      "Journal of the American Society for Information
-                 Science",
-  volume =       "41",
-  number =       "6",
-  pages =        "391--407",
-  year =         "1990",
-}
-
-@Article{Dehaene87,
-  author =       "S. Dehaene and J.-P. Changeux and J.-P. Nadal",
-  title =        "Neural Networks That Learn Temporal Sequences by
-                 Selection",
-  journal =      PNAS,
-  volume =       "84",
-  pages =        "2727--2731",
-  year =         "1987",
-}
-
-@InProceedings{Delalleau+al-2005-short,
-  author =       "Olivier Delalleau and Yoshua Bengio and Nicolas {Le Roux}",
-  editor =       aistats05ed,
-  booktitle =    aistats05,
-  title =        "Efficient Non-Parametric Function Induction in
-                 Semi-Supervised Learning",
-  pages =        "96--103",
-  year =         "2005",
-}
-
-@InProceedings{DeLaTorreF2006,
-  author =       "Fernando De la Torre Frade and Takeo Kanade",
-  booktitle =    "International Conference on Machine Learning",
-  title =        "Discriminative Cluster Analysis",
-  volume =       "148",
-  publisher =    "ACM Press",
-  address =      "New York, NY, USA",
-  pages =        "241--248",
-  month =        jun,
-  year =         "2006",
-}
-
-@Article{Delgutte+Kiang84,
-  author =       "B. Delgutte and N. Y. S. Kiang",
-  title =        "Speech coding in the auditory nerve",
-  journal =      jasa,
-  volume =       "75",
-  number =       "3",
-  pages =        "866--907",
-  year =         "1984",
-}
-
-@Article{Delgutte80,
-  author =       "B. Delgutte",
-  title =        "Representation of speech-like sounds in the discharge
-                 patterns of auditory nerve fibers",
-  journal =      jasa,
-  volume =       "68",
-  number =       "3",
-  pages =        "843--857",
-  year =         "1980",
-}
-
-@Misc{delve,
-  author =       "C. Rasmussen and R. Neal and G. E. Hinton and D. van
-                 Camp and Z. Ghahramani and R. Kustra and R.
-                 Tibshirani",
-  title =        "The {DELVE} Manual",
-  year =         "1996",
-  note =         "{DELVE} can be found at
-                 http://www.cs.toronto.edu/\%7Edelve",
-}
-
-@InProceedings{DeMers+Cottrell93,
-  author =       "David DeMers and Garrison W. Cottrell",
-  editor =       NIPS5ed,
-  booktitle =    NIPS5,
-  title =        "Non-linear dimensionality reduction",
-  publisher =    "Morgan Kaufmann",
-  address =      "San Mateo CA",
-  pages =        "580--587",
-  year =         "1993",
-}
-
-@InProceedings{Demichelis89,
-  author =       "P. DeMichelis and L. Fissore and P. Laface and G.
-                 Micca and E. Piccolo",
-  booktitle =    icassp,
-  title =        "On the Use of Neural Networks for Speaker Independent
-                 Isolated Word Recognition",
-  address =      "Glaskow (Scotland)",
-  year =         "1989",
-}
-
-@InProceedings{DeMori+Palakal85,
-  author =       "R. De Mori and M. Palakal",
-  booktitle =    "Proc. Ninth International Joint Conference on
-                 Artificial Intelligence",
-  title =        "On the use of taxonomy of time-frequency morphologies
-                 for automatic speech recognition",
-  address =      "Los Angeles, CA",
-  pages =        "877--879",
-  year =         "1985",
-}
-
-@Article{DeMori85,
-  author =       "R. De Mori and P. Laface and Y. Mong",
-  title =        "Parallel algorithms for syllable recognition in
-                 continuous speech",
-  journal =      ieeetpami,
-  volume =       "7",
-  pages =        "56--69",
-  year =         "1985",
-}
-
-@Article{DeMori87,
-  author =       "R. De Mori and L. Lam and M. Gilloux",
-  title =        "Learning and plan refinement in a knowledge-based
-                 system for automatic speech recognition",
-  journal =      ieeetpami,
-  volume =       "2",
-  pages =        "289--305",
-  year =         "1987",
-}
-
-@InCollection{DeMori96,
-  author =       "R. {De Mori} and F. Brugnara",
-  editor =       "R. A. Cole and J. Mariani and H. Uszkoriet and A.
-                 Zaenen and V. Zue",
-  booktitle =    "Survey of the State of the Art in Human Language
-                 Technology",
-  title =        "{HMM} Methods in Speech Recognition",
-  publisher =    "Cambridge University Press",
-  address =      "http://www.cse.ogi.edu/CSLU/HLTsurvey/HLTsurvey.html",
-  pages =        "24--34",
-  year =         "1996",
-}
-
-@Article{Dempster77,
-  author =       "A. P. Dempster and N. M. Laird and D. B. Rubin",
-  title =        "Maximum-likelihood from incomplete data via the {EM}
-                 algorithm",
-  journal =      "Journal of Royal Statistical Society B",
-  volume =       "39",
-  pages =        "1--38",
-  year =         "1977",
-}
-
-@InProceedings{denker-lecun-93,
-  author =       "Yann {LeCun} and John S. Denker",
-  booktitle =    "IEEE Workshop on the Physics of Computation",
-  title =        "Natural versus Universal Probability Complexity, and
-                 Entropy",
-  publisher =    "IEEE",
-  pages =        "122--127",
-  year =         "1992",
-}
-
-@InProceedings{Denker86,
-  author =       "J. Denker",
-  editor =       "J. S. Denker",
-  booktitle =    snowbird,
-  title =        "Neural Network Refinements and Extensions",
-  publisher =    "American Institute of Physics, New York",
-  address =      "Snowbird 1986",
-  pages =        "121--128",
-  year =         "1986",
-}
-
-@Article{Denker87,
-  author =       "J. Denker and D. Schwartz and B. Wittner and S. Solla
-                 and R. Howard and L. Jackel and J. Hopfield",
-  title =        "Large Automatic Learning, Rule Extraction, and
-                 Generalization",
-  journal =      cs,
-  volume =       "1",
-  pages =        "877--922",
-  year =         "1987",
-}
-
-@InProceedings{Denker91,
-  author =       "J. S. Denker and Y. {LeCun}",
-  editor =       NIPS3ed,
-  booktitle =    NIPS3,
-  title =        "Transforming neural-net output levels to probability
-                 distributions",
-  publisher =    "Morgan Kaufmann",
-  address =      "San Mateo CA",
-  pages =        "853--859",
-  year =         "1991",
-}
-
-@InProceedings{Denker94,
-  author =       "J. Denker and C. J. C. Burges",
-  booktitle =    "The Mathematics of Generalization: Proceedings of the
-                 SFI/CNLS Workshop on Formal Approaches to Supervised
-                 Learning",
-  title =        "Image Segmentation and Recognition",
-  publisher =    "Addison Wesley, ISBN 0-201-40985-2",
-  year =         "1994",
-}
-
-@Article{Deprit89,
-  author =       "E. Deprit",
-  title =        "Implementing Recurrent Back-Propagation on the
-                 Connection Machine",
-  journal =      "Neural Networks",
-  volume =       "2",
-  number =       "4",
-  pages =        "295--314",
-  year =         "1989",
-}
-
-@ARTICLE{Derenyi94,
-   author = {{Der{\'e}nyi}, I. and {Geszti}, T. and {Gy{\"o}rgyi}, G.},
-    title = "{Generalization in the programed teaching of a perceptron}",
-  journal = {Physical Review {E}},
-     year = 1994,
-    month = "October",
-   volume = 50,
-    pages = {3192-3200},
-      doi = {10.1103/PhysRevE.50.3192},
-   adsurl = {http://adsabs.harvard.edu/abs/1994PhRvE..50.3192D},
-  adsnote = {Provided by the SAO/NASA Astrophysics Data System}
-}
-
-@Article{Derrida87,
-  author =       "B. Derrida and E. Gardner and A. Zippelius",
-  title =        "An Exactly Soluble Asymmetric Neural Network Model",
-  journal =      eul,
-  volume =       "4",
-  pages =        "167--173",
-  year =         "1987",
-}
-
-@TechReport{Derthick84,
-  author =       "M. Derthick",
-  title =        "Variations on the {Boltzmann} Machine",
-  number =       "CMU--CS--84--120",
-  institution =  "Department of Computer Science, Carnegie Mellon
-                 University",
-  address =      "Pittsburgh, PA",
-  year =         "1984",
-}
-
-@inproceedings{deSaV93,
-	address = {San Francisco, CA},
-	author = {de Sa, Virginia  R. },
-        editor = NIPS5ed,
-        booktitle = NIPS5,
-	citeulike-article-id = {350518},
-	keywords = {multiview, semisupervised},
-	pages = {112--119},
-	posted-at = {2008-08-12 16:46:39},
-	priority = {2},
-	publisher = {Morgan Kaufmann Publishers},
-	title = {Learning Classification with Unlabeled Data},
-	year = {1993}
-}	
-	%url = {http://citeseer.ist.psu.edu/desa94learning.html},
-
-@InProceedings{DeSieno88,
-  author =       "D. DeSieno",
-  booktitle =    icnn,
-  title =        "Adding a Conscience to Competitive Learning",
-  volume =       "1",
-  publisher =    "IEEE, New York",
-  address =      "San Diego 1988",
-  pages =        "117--124",
-  year =         "1988",
-}
-
-@InProceedings{DeSilva+Tenenbaum-2003,
-  author =       "V. {de Silva} and J. B. Tenenbaum",
-  editor =       NIPS15ed,
-  booktitle =    NIPS15,
-  title =        "Global Versus Local Methods in Nonlinear
-                 Dimensionality Reduction",
-  publisher =    "{MIT} Press",
-  address =      "Cambridge, MA",
-  pages =        "705--712",
-  year =         "2003",
-}
-
-@Book{Devaney89,
-  author =       "R. L. Devaney",
-  title =        "An Introduction to Chaotic Dynamical Systems",
-  publisher =    "Addison-Wesley",
-  year =         "1989",
-}
-
-@Article{Devereux84,
-  author =       "J. Devereux and P. Haeberli and O. Smithies",
-  title =        "A comprehensive set of sequence analysis programs for
-                 the {VAX}",
-  journal =      "Nucleic Acids Research",
-  volume =       "12",
-  pages =        "387--395",
-  year =         "1984",
-}
-
-@Book{Devijver82,
-  author =       "P. A. Devijver and J. Kittler",
-  title =        "Pattern Recognition: {A} Statistical Approach",
-  publisher =    "Prentice-Hall",
-  address =      "London",
-  year =         "1982",
-}
-
-@Article{Devijver87,
-  author =       "J. Voisin and P. A. Devijver",
-  title =        "An application of the multiedit-condensing technique
-                 to the reference selection problem in a print
-                 recognition system",
-  journal =      "Pattern Recognition",
-  volume =       "20",
-  number =       "5",
-  pages =        "465--474",
-  year =         "1987",
-}
-
-@Article{deVries92,
-  author =       "B. \mbox{de Vries} and J. C. Principe",
-  title =        "The gamma model -- {A} new neural net model for
-                 temporal processing",
-  journal =      nn,
-  volume =       "5",
-  pages =        "565--576",
-  year =         "1992",
-}
-
-@Book{Devroye-book96,
-  author =       "L. Devroye and L. Gyröfi and G. Lugosi",
-  title =        "A Probabilistic Theory of Pattern Recognition",
-  publisher =    "Springer-Verlag",
-  year =         "1996",
-}
-
-@Article{Devroye88,
-  author =       "Luc Devroye",
-  title =        "Automatic Pattern Recognition: {A} Study of the
-                 Probability of Error",
-  journal =      "IEEE Transactions on Pattern Analysis and Machine
-                 Intelligence",
-  volume =       "10",
-  number =       "4",
-  pages =        "530--543",
-  month =        jul,
-  year =         "1988",
-}
-
-@Book{Diamantras-96,
-  author =       "K. I. Diamantras and S. Y. Kung",
-  title =        "Principal Component Neural Networks: theory and applications",
-  publisher =    "Wiley",
-  year =         "1996",
-}
-
-@Article{Diebold+Mariano95,
-  author =       "F. X. Diebold and R. S. Mariano",
-  title =        "Comparing Predictive Accuracy",
-  journal =      "Journal of Business and Economic Statistics",
-  volume =       "13",
-  number =       "3",
-  pages =        "253--263",
-  year =         "1995",
-}
-
-@InCollection{Diebold93,
-  author =       "F. X. Diebold and J. H. Lee and G. C. Weinbach",
-  editor =       "C. Hargreaves",
-  booktitle =    "Nonstationary Time Series Analysis and Cointegration",
-  title =        "Regime switching with time-varying transition
-                 probabilities",
-  publisher =    "Oxford University Press",
-  address =      "Oxford",
-  year =         "1993",
-}
-
-@InCollection{Diebold93b,
-  author =       "F. X. Diebold and G. Rudebusch and E. Sichel",
-  editor =       "J. H. Stock and M. W. Watson",
-  booktitle =    "Business Cycles, Indicators, and Forecasting",
-  title =        "Further evidence on business-cycle duration
-                 dependence",
-  publisher =    "University of Chicago Press",
-  address =      "Chicago",
-  year =         "1993",
-}
-
-@Article{DieboldKilian,
-  author =       "F. X. Diebold and L. Kilian",
-  title =        "Measuring Predictability:Theory and Macroeconomics
-                 Applications",
-  journal =      "NBER technical working paper",
-  volume =       "213",
-  year =         "1997",
-}
-
-@InCollection{DieboldLopez,
-  author =       "F. X. Diebold and J. A. Lopez",
-  editor =       "G. S. Maddala and C. R. Rao",
-  booktitle =    "Handbook of Statistics, Vol. 14",
-  title =        "Forecast Evaluation and Combination",
-  publisher =    "Elsevier Science",
-  pages =        "241--268",
-  year =         "1996",
-}
-
-@Article{Diederich87,
-  author =       "S. Diederich and M. Opper",
-  title =        "Learning of Correlated Patterns in Spin-Glass Networks
-                 by Local Learning Rules",
-  journal =      prl,
-  volume =       "58",
-  pages =        "949--952",
-  year =         "1987",
-}
-
-@InProceedings{Diegert90,
-  author =       "C. Diegert",
-  booktitle =    "Proceedings of IEEE-IJCNN90",
-  title =        "Out-of-core Backpropagation",
-  volume =       "II",
-  address =      "San Diego, CA",
-  pages =        "97--103",
-  year =         "1990",
-}
-
-@Article{dietterich,
-  author =       "T. G. Dietterich",
-  title =        "Approximate Statistical Tests for Comparing Supervised
-                 Classification Learning Algorithms",
-  journal =      "Neural Computation",
-  volume =       "10",
-  number =       "7",
-  pages =        "1895--1924",
-  year =         "1998",
-}
-
-@Article{Dietterich1998,
-  author =       "Thomas G. Dietterich",
-  title =        "Approximate Statistical Test For Comparing Supervised
-                 Classification Learning Algorithms",
-  journal =      "Neural Computation",
-  volume =       "10",
-  number =       "7",
-  pages =        "1895--1923",
-  year =         "1998",
-  URL =          "citeseer.ist.psu.edu/dietterich98approximate.html",
-}
-
-@Article{dietterich97,
-  author =       "Thomas G. Dietterich and Richard H. Lathrop and Tomas
-                 Lozano-Perez",
-  title =        "Solving the Multiple Instance Problem with
-                 Axis-Parallel Rectangles",
-  journal =      "Artificial Intelligence",
-  volume =       "89",
-  number =       "1-2",
-  pages =        "31--71",
-  year =         "1997",
-}
-
-
-@Article{Diggle+Gratton-1984,
-  author =       "P. Diggle and R. Gratton",
-  title =        "Monte Carlo Methods of Inference for Implicit Statistical Models",
-  journal =      "Journal of the Royal Statistical Society. Series B (Methodological)",
-  volume =       "46",
-  number =       "2",
-  pages =        "193--227",
-  year =         "1984",
-  publisher =    "Blackwell Publishing for the Royal Statistical Society",
-}
-
-
-@InCollection{Doi-2006,
-  author =       "Eizaburo Doi and Doru C. Balcan and Michael S.
-                 Lewicki",
-  editor =       NIPS18ed,
-  booktitle =    NIPS18,
-  title =        "A Theoretical Analysis of Robust Coding over Noisy
-                 Overcomplete Channels",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  pages =        "307--314",
-  year =         "2006",
-}
-
-@InProceedings{DoiE2007,
-  author =       "Eizaburo Doi and Michael S. Lewicki",
-  editor =       NIPS19ed,
-  booktitle =    NIPS19,
-  title =        "A Theory of Retinal Population Coding.",
-  publisher =    "MIT Press",
-  pages =        "353--360",
-  year =         "2007",
-}
-
-@book{Doidge-2007,
-    author = {Doidge, Norman},
-    howpublished = {Paperback},
-    isbn = {0143113100},
-    month = {December},
-    publisher = {Penguin Group},
-    title = {The Brain That Changes Itself: Stories of Personal Triumph from the Frontiers of Brain Science},
-    year = {2007}
-}
-
-@InCollection{DollarP2007,
-  author =       "Piotr Doll\'ar and Serge Belongie and Vincent Rabaud",
-  editor =       NIPS19ed,
-  booktitle =    NIPS19,
-  title =        "Learning to Traverse Image Manifolds",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  pages =        "361--368",
-  year =         "2007",
-}
-
-@inproceedings{ DollarP2007b,
-       author = "P. Doll\'ar and V. Rabaud and S. Belongie",
-       title = "Non-Isometric Manifold Learning: Analysis and an Algorithm",
-       booktitle =    ICML07,
-       editor =       ICML07ed,
-       publisher =    ICML07publ,
-       month = "June",
-       year = "2007"
-}
-
-@TechReport{Donoho+Carrie-03,
-  author =       "D. L. Donoho and C. Grimes",
-  title =        "Hessian Eigenmaps: new locally linear embedding
-                 techniques for high-dimensional data",
-  number =       "2003-08",
-  institution =  "Dept. Statistics, Stanford University",
-  year =         "2003",
-}
-
-@article{Donoho-2006,
- author = {David Donoho},
- title = {Compressed sensing},
- journal = {{IEEE} Transactions on Information Theory},
- volume = 52,
- number = 4,
- pages = {1289--1306},
- year = 2006,
-}
-
-@Book{Dorigo98,
-  author =       "M. Dorigo and M. Colombetti",
-  title =        "Robot shaping: {An} experiment in behavior
-                 engineering",
-  publisher =    "MIT Press/Bradford Books",
-  year =         "1998",
-}
-
-@book{Doucet+al-2001,
-  editor =       "A. Doucet and  N. {de Freitas} and N. Gordon",
-  title =        "Sequential Monte Carlo Methods in Practice",
-  publisher =    "Springer-Verlag",
-  year =         "2001",
-}
-
-@TechReport{Doya93bif,
-  author =       "K. Doya",
-  title =        "Bifurcations of Recurrent Neural Networks in Gradient
-                 Learning",
-  institution =  "Department of Biology, University of California",
-  address =      "La Jolla, CA",
-  year =         "1993",
-  note =         "Submitted",
-}
-
-@TechReport{Doya93un,
-  author =       "K. Doya",
-  title =        "Universality of Fully-Connected Recurrent Neural
-                 Networks",
-  institution =  "Department of Biology, University of California",
-  address =      "La Jolla, CA",
-  year =         "1993",
-  note =         "Submitted",
-}
-
-@Article{Doyle+Snell-1984,
-  author =       "Peter G. Doyle and J. Laurie Snell",
-  title =        "Random Walks and Electric Networks",
-  journal =      "Mathematical Association of America",
-  year =         "1984",
-}
-
-@Book{Draper81,
-  author =       "N. R. Draper and H. Smith",
-  title =        "Applied Regression Analysis",
-  publisher =    "John Wiley and Sons",
-  year =         "1981",
-}
-
-@InProceedings{Driancourt91,
-  author =       "X. Driancourt and L. Bottou and P. Gallinari",
-  booktitle =    ijcnn,
-  title =        "Learning Vector Quantization, Multi-Layer Perceptron
-                 and Dynamic Programming: Comparison and Cooperation",
-  volume =       "2",
-  pages =        "815--819",
-  year =         "1991",
-  OPTaddress =   "Seattle WA",
-}
-
-@InProceedings{Drucker93,
-  author =       "H. Drucker and R. Schapire and R. Simard",
-  editor =       NIPS5ed,
-  booktitle =    NIPS5,
-  title =        "Improving performance in neural networks using a
-                 boosting algorithm",
-  publisher =    "Morgan Kaufmann",
-  address =      "San Mateo, CA",
-  pages =        "42--49",
-  year =         "1993",
-}
-
-@Article{Drucker93b,
-  author =       "H. Drucker and R. Schapire and R. Simard",
-  title =        "Boosting performance in neural networks",
-  journal =      "International Journal of Pattern Recognition and
-                 Artificial Intelligence",
-  pages =        "61--76",
-  year =         "1993",
-  note =         "Special Issue on Applications of Neural Networks to
-                 Pattern Recognition (I. Guyon Ed.)",
-}
-
-@article{Duane-1987,
- author = {S. Duane and A.D. Kennedy and B. Pendleton and D. Roweth},
- title = {Hybrid {M}onte {C}arlo},
- journal = {Phys. Lett. {B}},
- volume = 195,
- pages = {216--222},
- year = 1987,
-}
-
-@Book{Duda-Hart,
-  author =       "R. O. Duda and P. E. Hart",
-  title =        "Pattern Classification and Scene Analysis",
-  publisher =    "Wiley",
-  address =      "New York",
-  year =         "1973",
-}
-
-@Book{Duda-Hart-2000,
-  author =       "R. O. Duda and P. E. Hart and D. G. Stork",
-  title =        "Pattern Classification, Second Edition",
-  publisher =    "Wiley and Sons",
-  address =      "New York",
-  year =         "2001",
-}
-
-@Book{Duda73,
-  author =       "R. O. Duda and P. E. Hart",
-  title =        "Pattern Classification and Scene Analysis",
-  publisher =    "Wiley",
-  address =      "New York",
-  year =         "1973",
-}
-
-@Article{Dugas+al-2003,
-  author =       "C. Dugas and Y. Bengio and N. Chapados and P. Vincent
-                 and G. Denoncourt and C. Fournier",
-  title =        "Statistical Learning Algorithms Applied to Automobile
-                 Insurance Ratemaking",
-  journal =      "CAS Forum",
-  volume =       "1",
-  number =       "1",
-  pages =        "179--214",
-  month =        "Winter",
-  year =         "2003",
-}
-
-@TechReport{Dugas00,
-  author =       "C. Dugas and O. Bardou and Y. Bengio",
-  title =        "Analyses Empiriques sur des Transactions d'options",
-  number =       "1176",
-  institution =  "D\'epartment d'informatique et de Recherche
-                 Op\'erationnelle, Universit\'e de Montr\'eal",
-  address =      "Montr\'eal, Qu\'ebec, Canada",
-  year =         "2000",
-}
-
-@InProceedings{Dugas01,
-  author =       "C. Dugas and Y. Bengio and F. B\'elisle and C.
-                 Nadeau",
-  editor =       NIPS13ed,
-  booktitle =    NIPS13,
-  title =        "Incorporating Second-Order Functional Knowledge for Better Option Pricing",
-  publisher =    "{MIT} Press",
-  pages =        "472--478",
-  year =         "2001",
-}
-
-%%InProceedings{Bengio2000,
-%%  author =       "Y. Bengio",
-%%  booktitle =    icjnn
-%%  title =        "Incorporating Second-Order Functional Knowledge for Better Option Pricing",
-%%  volume =       "V",
-%%  pages =        "79--84",
-%%  year =         "2000",
-%%}
-
-@inproceedings{Bengio2000,
-  title={Probabilistic neural network models for sequential data},
-  author={Bengio, Y.},
-  booktitle=ijcnn,
-  year={2000},
-  volume={5},
-  pages={79-84},
-  abstract={Artificial neural networks (ANN) can be incorporated into probabilistic models. In this paper we review some of the approaches which have been proposed to incorporate them into probabilistic models of sequential data, such as hidden Markov models (HMM). We also discuss new developments and new ideas in this area, in particular how ANN can be used to model high-dimensional discrete and continuous data to deal with the curse of dimensionality and how the ideas proposed in these models could be applied to statistical language modeling to represent longer-term context than allowed by trigram models, while keeping word-order information},
-  keywords={computational linguistics, hidden Markov models, neural nets, probabilityANN, HMM, hidden Markov models, longer-term context, probabilistic models, probabilistic neural network models, sequential data, statistical language modeling, trigram models, word-order information},
-  doi={10.1109/IJCNN.2000.861438},
-}
-
-@InProceedings{Bengio-hyper-2000,
-  author =       "Yoshua Bengio",
-  booktitle =    ijcnn,
-  title =        "Continuous Optimization of Hyper-Parameters",
-  volume =       "V",
-  pages =        "305--310",
-  year =         "2000",
-}
-
-@InProceedings{Ghosn2000,
-  author =       "J. Ghosn and Y. Bengio",
-  booktitle =    ijcnn,
-  title =        "Bias Learning, Knowledge Sharing",
-  volume =       "I",
-  pages =        "9--14",
-  year =         "2000",
-}
-
-@Article{Durbin87,
-  author =       "R. Durbin and D. Willshaw",
-  title =        "An Analogue Approach to the Travelling Salesman
-                 Problem Using an Elastic Net Method",
-  journal =      nature,
-  volume =       "326",
-  pages =        "689--691",
-  year =         "1987",
-}
-
-@MastersThesis{Dzwonczyk91,
-  author =       "M. Dzwonczyk",
-  title =        "Quantitative failure models of feed-forward neural
-                 networks",
-  school =       "MIT",
-  year =         "1991",
-}
-
-@Book{econometric-G-97,
-  author =       "W. H. Greene",
-  title =        "Econometric Analysis 3rd edition",
-  publisher =    "Prentice Hall, Inc.",
-  year =         "1997",
-}
-
-@Article{efficient-KW-82,
-  author =       "W. W. Krasker and R. R. Welsch",
-  title =        "Efficient Bounded-Influence Regression Estimation",
-  journal =      "J. Am. Stat. Asso.",
-  volume =       "77",
-  pages =        "595--604",
-  year =         "1982",
-}
-
-@Book{Efron+Tibs93,
-  author =       "Bradley Efron and Robert J. Tibshirani",
-  title =        "An introduction to the Bootstrap",
-  publisher =    "Chapman and Hall",
-  address =      "New York",
-  year =         "1993",
-}
-
-@TechReport{eigen-TR2,
-  author =       "Yoshua Bengio and Pascal Vincent and Jean-Fran{\cc}ois
-                 Paiement and Olivier Delalleau and Marie Ouimet and
-                 Nicolas {Le Roux}",
-  title =        "Spectral Clustering and Kernel {PCA} are Learning
-                 Eigenfunctions",
-  number =       "1239",
-  institution =  "D\'epartement d'informatique et recherche
-                 op\'erationnelle, Universit\'e de Montr\'eal",
-  year =         "2003",
-}
-
-@InProceedings{Eisner96,
-  author =       "J. Eisner",
-  booktitle =    "COLING-96",
-  title =        "Three new probabilistic models for dependency parsing:
-                 an exploration",
-  address =      "Copenhagen, Denmark",
-  pages =        "340--345",
-  year =         "1996",
-}
-
-@Article{EladAharon2006,
-  author =       "Michael Elad and Michal Aharon",
-  title =        "Image Denoising Via Sparse and Redundant
-                 Representations Over Learned Dictionaries",
-  journal =      "IEEE Transactions on Image Processing",
-  volume =       "15",
-  number =       "12",
-  pages =        "3736--3745",
-  month =        dec,
-  year =         "2006",
-  bibsource =    "http://www.visionbib.com/bibliography/image-proc131.html#TT8737",
-}
-
-@InProceedings{ElHihi+Bengio-nips8-small,
-  author =       "S. ElHihi and Y. Bengio",
-  editor =       NIPS8ed,
-  booktitle =    NIPS8,
-  title =        "Hierarchical Recurrent Neural Networks for Long-Term
-                 Dependencies",
-  publisher =    "MIT Press, Cambridge, MA",
-  pages =        "493--499",
-  year =         "1996",
-}
-
-@InProceedings{ellis+poliner-icassp2007,
-  author =       "D. Ellis and G. Poliner",
-  editor =       "",
-  booktitle =    "{Proceedings of the 2007 International Conference on
-                 Acoustics, Speech and Signal Processing (ICASSP)}",
-  title =        "Identifying Cover Songs with Chroma Features and
-                 Dynamic Programming",
-  publisher =    "IEEE Signal Processing Society",
-  pages =        "",
-  year =         "2007",
-}
-
-@Article{Elman88,
-  author =       "J. L. Elman and D. Zipser",
-  title =        "Learning the Hidden Structure of Speech",
-  journal =      jasa,
-  volume =       "83",
-  pages =        "1615--1626",
-  year =         "1988",
-}
-
-@Article{Elman88Jasa88,
-  author =       "J. L. Elman and D. Zipser",
-  title =        "Learning the Hidden Structure of Speech",
-  journal =      "Journal of the Acoustical Society of America",
-  volume =       "83",
-  year =         "1988",
-}
-
-@Article{Elman90,
-  author =       "J. L. Elman",
-  title =        "Finding Structure in Time",
-  journal =      "Cognitive Science",
-  volume =       "14",
-  pages =        "179--211",
-  year =         "1990",
-}
-
-@Article{Elman93,
-  author =       "Jeffrey L. Elman",
-  title =        "Learning and development in neural networks: {The}
-                 importance of starting small.",
-  journal =      "Cognition",
-  volume =       "48",
-  pages =        "781--799",
-  year =         "1993",
-  url =          "http://www3.isrl.uiuc.edu/~junwang4/langev/localcopy/pdf/elman93cognition.pdf"
-}
-
-@TechReport{ElmanTR88,
-  author =       "J. L. Elman",
-  title =        "Finding Structure in Time",
-  number =       "CRL TR 8801",
-  institution =  "Center for Research in Language, University of
-                 California at San Diego",
-  year =         "1988",
-}
-
-@TechReport{EM-tech-rep,
-  author =       "Y. Bengio and P. Frasconi",
-  title =        "Learning Sequential Behavior: an {EM} Approach",
-  institution =  "Universit\`a di Firenze",
-  year =         "1994",
-  note =         "(in preparation)",
-}
-
-@Article{Engel-Mannor-Meir-2003,
-  author =       "Y. Engel and S. Mannor and R. Meir",
-  title =        "The kernel recursive least squares algorithm",
-  journal =      "IEEE Trans. Sig. Proc.",
-  volume =       "52",
-  number =       "8",
-  pages =        "2275--2285",
-  year =         "2004",
-}
-
-@Article{erhan06qsar,
-  author =       "Dumitru Erhan and Pierre-Jean L'Heureux and Shi Yi Yue
-                 and Yoshua Bengio",
-  title =        "Collaborative Filtering on a Family of Biological
-                 Targets.",
-  journal =      "Journal of Chemical Information and Modeling",
-  volume =       "46",
-  number =       "2",
-  pages =        "626--635",
-  year =         "2006",
-}
-
-@techreport{Erhan-09-visualization-tr,
-  author = {Dumitru Erhan and Yoshua Bengio and Aaron Courville and Pascal Vincent},
-  title = "Visualizing Higher-Layer Features of a Deep Network",
-  institution = "Universit\'{e} de Montr\'{e}al",
-  number = "1341",
-  year = 2009,
-}
-
-@inproceedings{Erhan2009-small,
- author = {Dumitru Erhan and Pierre-Antoine Manzagol and Yoshua Bengio and Samy Bengio and Pascal Vincent},
-  booktitle =    "Proceedings of AISTATS'2009",
-  title =        "The Difficulty of Training Deep Architectures and the
-Effect of Unsupervised Pre-Training",
-  year = 2009,
-}
-
-@inproceedings{Erhan2009-short,
- author = {D. Erhan and P.-A. Manzagol and Y. Bengio and S. Bengio and P. Vincent},
-  booktitle =    "AI \& Stat.'2009",
-  title =        "The Difficulty of Training Deep Architectures and the
-Effect of Unsupervised Pre-Training",
-  year = 2009,
-}
-
-@Book{EverittB1981,
-  author = 	 {B. S. Everitt and D. J. Hand},
-  title = 	 {Finite Mixture Distributions},
-  publisher =    {Chapman and Hall},
-  address =      {London},
-  year = 	 {1981},
-  series = 	 {Monographs on Statistics and Applied Probability},
-}
-
-@InProceedings{evgeniou04,
-  author =       "Theodoros Evgeniou and Massimiliano Pontil",
-  booktitle =    "KDD '04: Proceedings of the 2004 ACM SIGKDD
-                 international conference on Knowledge discovery and
-                 data mining",
-  title =        "Regularized multi--task learning",
-  publisher =    "ACM Press",
-  address =      "New York, NY, USA",
-  pages =        "109--117",
-  year =         "2004",
-  location =     "Seattle, WA, USA",
-}
-
-@Article{evgeniou05,
-  author =       "Theodoros Evgeniou and Charles A. Micchelli and
-                 Massimiliano Pontil",
-  title =        "Learning Multiple Tasks with Kernel Methods",
-  journal =      jmlr,
-  volume =       "6",
-  pages =        "615--637",
-  month =        apr,
-  year =         "2005",
-}
-
-@InProceedings{Fahlman83,
-  author =       "S. E. Fahlman and G. E. Hinton and T. J. Sejnowski",
-  booktitle =    "Proceedings of the National Conference on Artificial
-                 Intelligence AAAI-83",
-  title =        "Massively parallel architectures for {AI}: {NETL},
-                 Thistle, and {Boltzmann} machines",
-  year =         "1983",
-}
-
-@InProceedings{Fahlman89,
-  author =       "S. E. Fahlman",
-  editor =       "D. Touretzky and G. Hinton and T. Sejnowski",
-  booktitle =    cmss88,
-  title =        "Fast-Learning Variations on Back-Propagation: An
-                 Empirical Study",
-  publisher =    "Morgan Kaufmann, San Mateo",
-  address =      "Pittsburg 1988",
-  pages =        "38--51",
-  year =         "1989",
-}
-
-@InProceedings{Fahlman90,
-  author =       "Scott E. Fahlman and Christian Lebiere",
-  editor =       NIPS2ed,
-  booktitle =    NIPS2,
-  title =        "The Cascade-Correlation Learning Architecture",
-  publisher =    "Morgan Kaufmann, San Mateo",
-  address =      "Denver, CO",
-  pages =        "524--532",
-  year =         "1990",
-}
-
-@InProceedings{Fahlman90-small,
-  author =       "S. E. Fahlman and C. Lebiere",
-  booktitle =    "NIPS 2",
-  title =        "The Cascade-Correlation Learning Architecture",
-  publisher =    "Morgan Kaufmann, San Mateo",
-  address =      "Denver, CO",
-  pages =        "524--532",
-  year =         "1990",
-}
-
-@Article{Fama+French,
-  author =       "E. F. Fama and K. R. French",
-  title =        "Permanent and Temporary Components of Stock Prices",
-  journal =      "Journal of Political Economy",
-  volume =       "96",
-  number =       "2",
-  pages =        "246--273",
-  year =         "1988",
-}
-
-@Book{Fant60,
-  author =       "G. Fant",
-  title =        "Acoustic Theory of Speech Production",
-  publisher =    "Mouton and Co.",
-  year =         "1960",
-}
-
-@Book{Fant73,
-  author =       "G. Fant",
-  title =        "Speech Sounds and Features",
-  publisher =    "MIT Press, Cambridge, MA",
-  year =         "1973",
-}
-
-@Article{Farhat85,
-  author =       "N. H. Farhat and D. Psaltis and A. Prata and E. Paek",
-  title =        "Optical Implementation of the Hopfield Model",
-  journal =      applopt,
-  volume =       "24",
-  year =         "1985",
-}
-
-@Article{Farhat87,
-  author =       "N. H. Farhat",
-  title =        "Optoelectronic Analogs of Self-Programming Neural
-                 Nets: Architectures and Methods for Implementing Fast
-                 Stochastic Learning by Simulated Annealing",
-  journal =      applopt,
-  volume =       "26",
-  pages =        "5093--5103",
-  year =         "1987",
-}
-
-@Article{Farmer87,
-  author =       "D. Farmer and J. Sidorowich",
-  title =        "Predicting Chaotic Time Series",
-  journal =      prl,
-  volume =       "59",
-  pages =        "845--848",
-  year =         "1987",
-}
-
-@InCollection{Farmer88,
-  author =       "D. Farmer and J. Sidorowich",
-  editor =       "W. C. Lee",
-  booktitle =    "Evolution, Learning, and Cognition",
-  title =        "Exploiting Chaos to Predict the Future and Reduce
-                 Noise",
-  publisher =    "World Scientific",
-  address =      "Singapore",
-  pages =        "277--330",
-  year =         "1988",
-}
-
-@inproceedings{Fei-Fei.2004,
-        author = {Fei-Fei, Li and Fergus, Rod and Perona, Pietro},
-        doi = {10.1109/CVPR.2004.109},
-        journal = {Computer Vision and Pattern Recognition Workshop, 2004 Conference on},
-        keywords = {categorization, computer-vision, generative-models},
-        pages = {178},
-        posted-at = {2007-08-10 12:20:22},
-        priority = {3},
-        title = {Learning Generative Visual Models from Few Training Examples: An Incremental Bayesian Approach Tested on 101 Object Categories},
-        url = {http://dx.doi.org/10.1109/CVPR.2004.109},
-        year = {2004}
-}
-
-@Article{Feldman82,
-  author =       "J. A. Feldman and D. H. Ballard",
-  title =        "Connectionist Models and Their Properties",
-  journal =      cogsci,
-  volume =       "6",
-  year =         "1982",
-}
-
-@Article{feldman96,
-  author =       "Jerome A. Feldman and George Lakoff and David Bailey
-                 and Srini Narayanan and Terry Regier and Andreas
-                 Stolcke",
-  title =        "{L0} - The First Five Years of an Automated Language
-                 Acquisition Project",
-  journal =      "Artificial Intelligence Review",
-  volume =       "10",
-  number =       "1-2",
-  pages =        "103--129",
-  year =         "1996",
-  URL =          "citeseer.ist.psu.edu/feldman96first.html",
-}
-
-@Book{Fellbaum1996,
-  author =       "Christine Fellbaum",
-  title =        "{WordNet}: An Electronic Lexical Database and Some of
-                 its Application",
-  publisher =    "MIT Press",
-  year =         "1996",
-}
-
-@Misc{Fellbaum1998,
-  author =       "Christiane Fellbaum Editor",
-  title =        "{WordNet}: An Electronic Lexical Database",
-  URL =          "citeseer.nj.nec.com/fellbaum98wordnet.html",
-}
-
-@Book{Feller68,
-  author =       "W. Feller",
-  title =        "An Introduction to Probability Theory and Its
-                 Applications",
-  volume =       "1",
-  publisher =    "Wiley",
-  address =      "New York",
-  year =         "1968",
-}
-
-@InProceedings{Feng-Statlog,
-  author =       "C. Feng and A. Sutherland and R. King and S. Muggleton
-                 and R. Henery",
-  booktitle =    "Proceedings of the Fourth International Workshop on
-                 Artificial Intelligence and Statistics",
-  title =        "Comparison of machine learning classifiers to
-                 statistics and neural networks",
-  pages =        "41--52",
-  year =         "1993",
-}
-
-@article{Field-1994,
-    author = {David J. Field},
-    title = {What is the goal of sensory coding?},
-    journal = {Neural Computation},
-    volume = {6},
-    number = {4},
-    year = {1994},
-    issn = {0899-7667},
-    pages = {559--601},
-    doi = {http://dx.doi.org/10.1162/neco.1994.6.4.559},
-    publisher = {MIT Press},
-    address = {Cambridge, MA, USA},
-}
-
-@article{Fisher-1936,
-    author = {Ronald  A. Fisher},
-    journal = {Annals of Eugenics},
-    pages = {179--188},
-    title = {The use of multiple measurements in taxonomic problems},
-    volume = {7},
-    year = {1936}
-}
-
-@Book{Fischer90,
-  author =       "K. H. Fischer and J. A. Hertz",
-  title =        "Spin Glasses",
-  publisher =    "Cambridge University Press",
-  address =      "Cambridge",
-  year =         "1990",
-}
-
-@TechReport{Fix+Hodges-51,
-  author =       "E. Fix and J. L. Hodges",
-  title =        "Discriminatory analysis, non-parametric
-                 discrimination, consistency properties",
-  number =       "Report 21-49-004",
-  institution =  "{USAF} School of Aviation Medicine, Randolph Field,
-                 Texas",
-  year =         "1951",
-}
-
-@Article{FixHodges51,
-  author =       "Evelyn Fix and Joseph L. Hodges Jr.",
-  title =        "Discriminatory Analysis: Nonparametric discrimination:
-                 Consistency properties",
-  journal =      "USAF School of Aviation Medecine",
-  volume =       "4",
-  pages =        "261--279",
-  year =         "1951",
-}
-
-@Article{FixHodges52,
-  author =       "Evelyn Fix and Joseph L. Hodges Jr.",
-  title =        "Discriminatory Analysis: Nonparametric discrimination:
-                 Small sample performance",
-  journal =      "USAF School of Aviation Medecine",
-  volume =       "11",
-  pages =        "280--322",
-  year =         "1952",
-}
-
-@MastersThesis{Flammia91,
-  author =       "G. Flammia",
-  title =        "Speaker Independent Consonant Recognition in
-                 Continuous Speech with Distinctive Phonetic Features",
-  school =       "McGill University, School of Computer Science",
-  year =         "1991",
-}
-
-@Book{Flanagan72,
-  author =       "J. L. Flanagan",
-  title =        "Speech Analysis, Synthesis, and Perception",
-  publisher =    "Springer--Verlag",
-  address =      "Berlin",
-  edition =      "2nd",
-  year =         "1972",
-}
-
-@Book{Fletcher87,
-  author =       "Roger Fletcher",
-  title =        "Practical Methods of Optimization",
-  publisher =    "Wiley",
-  address =      "New York",
-  edition =      "Second",
-  year =         "1987",
-}
-
-@InCollection{FleuretF2006,
-  author =       "Francois Fleuret and Gilles Blanchard",
-  editor =       NIPS18ed,
-  booktitle =    NIPS18,
-  title =        "Pattern Recognition from One Example by Chopping",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  pages =        "371--378",
-  year =         "2006",
-}
-
-@InProceedings{Foldiak89,
-  author =       "P. F{\"o}ldi\'ak",
-  booktitle =    ijcnn,
-  title =        "Adaptive Network for Optimal Linear Feature
-                 Extraction",
-  volume =       "1",
-  publisher =    "IEEE, New York",
-  address =      "Washington 1989",
-  pages =        "401--405",
-  year =         "1989",
-}
-
-@Article{Foldiak91,
-  author =       "P. F{\"o}ldi\'ak",
-  title =        "Learning Invariance from Transformation Sequences",
-  journal =      "Neural Computation",
-  volume =       "3",
-  number =       "2",
-  pages =        "194--200",
-  year =         "1991",
-}
-
-@TechReport{Fontaine,
-  author =       "T. Fontaine",
-  title =        "{GRAD}-{CM2}: {A} Data-parallel Connectionist Network
-                 Simulator",
-  number =       "MS-CIS-92-55/LINC LAB 232",
-  institution =  "University of Pennsylvania",
-  month =        jul,
-  year =         "1992",
-  OPTnote =      "",
-}
-
-@Article{Foster+George94,
-  author =       "D. Foster and E. George",
-  title =        "The risk inflation criterion for multiple regression",
-  journal =      "Annals of Statistics",
-  volume =       "22",
-  pages =        "1947--1975",
-  year =         "1994",
-}
-
-@PhdThesis{Foster2002,
-  author =       "George Foster",
-  title =        "Text Prediction for Translators",
-  school =       "Dept. IRO, Université de Montréal",
-  year =         "2002",
-}
-
-@incollection{Fox-2009,
- title = {Nonparametric Bayesian Learning of Switching Linear Dynamical Systems},
- author = {Emily Fox and Erik Sudderth and Michael Jordan and Alan Willsky},
- booktitle = NIPS21,
- editor = NIPS21ed,
- pages = {457--464},
- year = {2009}
-}
-
-@Article{Fralick67,
-  author = 	 {Stanley C. Fralick},
-  title = 	 {Learning to Recognize Patterns without a Teacher},
-  journal = 	 {IEEE Transactions on Information Theory},
-  year = 	 1967,
-  volume =	 13,
-  pages =	 {57-64}
-}
-
-@InProceedings{Franzini87,
-  author =       "M. A. Franzini",
-  booktitle =    "Proceedings of the Ninth Annual Conference of the IEEE
-                 Engineering in Medicine and Biology Society",
-  title =        "Speech Recognition with Back Propagation",
-  publisher =    "IEEE, New York",
-  address =      "Boston 1987",
-  pages =        "1702--1703",
-  year =         "1987",
-}
-
-@InProceedings{Franzini90,
-  author =       "M. A. Franzini and K. F. Lee and A. Waibel",
-  booktitle =    icassp,
-  title =        "Connectionist {Viterbi} Training: a New Hybrid Method
-                 for Continuous Speech Recognition",
-  address =      "Albuquerque, NM",
-  pages =        "425--428",
-  year =         "1990",
-}
-
-@InProceedings{Frasconi-icnn93,
-  author =       "P. Frasconi and M. Gori and A. Tesi",
-  booktitle =    icnn,
-  title =        "Backpropagation for Linearly Separable Patterns: a
-                 Detailed Analysis",
-  publisher =    "IEEE Press",
-  address =      "S. Francisco CA",
-  pages =        "1818--1822",
-  year =         "1993",
-}
-
-@InProceedings{Frasconi-ijcnn91,
-  author =       "P. Frasconi and M. Gori and M. Maggini and G. Soda",
-  booktitle =    ijcnn,
-  title =        "A Unified Approach for Integrating Explicit Knowledge
-                 and Learning by Example in Recurrent Networks",
-  pages =        "811--816",
-  year =         "1991",
-  OPTaddress =   "Seattle WA",
-}
-
-@Article{Frasconi-ijmpC93,
-  author =       "P. Frasconi and M. Gori and G. Soda",
-  title =        "Daphne: Data Parallelism Neural Network Simulator",
-  journal =      "Int. Journal of Modern Physics C",
-  volume =       "4",
-  number =       "1",
-  pages =        "17--28",
-  year =         "1993",
-  note =         "Special Issue: ``Science on the Connection Machine''",
-}
-
-@InProceedings{Frasconi-milano,
-  author =       "P. Frasconi and M. Gori and G. Soda",
-  booktitle =    "Computational Intelligence 90",
-  title =        "Recurrent Networks for Continuous Speech Recognition",
-  publisher =    "Elsevier",
-  address =      "Milano (Italy)",
-  year =         "1990",
-}
-
-@MastersThesis{Frasconi-msthesis,
-  author =       "P. Frasconi",
-  title =        "Progetto e realizzazione di un simulatore per reti
-                 neurali ricorrenti e implementazione di prototipi per
-                 il riconoscimento vocale in tempo reale",
-  school =       "Universit\`a di Firenze",
-  year =         "1990",
-  note =         "(in Italian)",
-}
-
-@Article{Frasconi-nc92,
-  author =       "P. Frasconi and M. Gori and G. Soda",
-  title =        "Local Feedback Multi-Layered Networks",
-  journal =      nc,
-  volume =       "4",
-  number =       "1",
-  pages =        "120--130",
-  year =         "1992",
-}
-
-@PhdThesis{Frasconi-PhD,
-  author =       "Paolo Frasconi",
-  title =        "Reti Ricorrenti ed Elaborazione Adattiva di Sequenze",
-  school =       "Universit\`a di Firenze",
-  address =      "Italy",
-  year =         "1994",
-  note =         "(in Italian)",
-}
-
-@InCollection{Frasconi-pinn93,
-  author =       "P. Frasconi and M. Gori and A. Tesi",
-  editor =       "Omid Omidvar",
-  booktitle =    "Progress in Neural Networks",
-  title =        "Successes and Failures of Backpropagation: a
-                 Theoretical Investigation",
-  publisher =    "Ablex Publishing",
-  year =         "1993",
-}
-
-@InProceedings{Frasconi-spie93,
-  author =       "Paolo Frasconi and Marco Gori",
-  editor =       "D. Ruck",
-  booktitle =    "Proc. Conf. Science of Artificial Neural Networks II",
-  title =        "Multilayered networks and the {C}-{G} uncertainty
-                 principle",
-  volume =       "SPIE-1966",
-  organization = "International Society for Optical Engineering (SPIE)",
-  address =      "Orlando, FL",
-  year =         "1993",
-}
-
-@TechReport{Frasconi-TR92,
-  author =       "P. Frasconi and M. Gori and G. Soda",
-  title =        "Injecting Nondeterministic Finite State Automata into
-                 Recurrent Neural Networks",
-  number =       "DSI-RT15/92",
-  institution =  "Universit\`a di Firenze (Italy)",
-  month =        aug,
-  year =         "1992",
-}
-
-@Unpublished{Frasconi-unp94,
-  author =       "P. Frasconi and Y. Bengio",
-  title =        "An {EM} Approach to Grammatical Inference",
-  year =         "1994",
-  note =         "Submitted to the 12-th {\em International Conference
-                 on Pattern Recognition}",
-  OPTannote =    "",
-}
-
-@InProceedings{Frasconi-v91,
-  author =       "P. Frasconi and M. Gori and M. Maggini and G. Soda",
-  editor =       "E. Caianiello",
-  booktitle =    "Proc. of the 4th Italian Workshop on Parallel
-                 Architectures and Neural Networks",
-  title =        "Learning Automata with Sigmoidal Networks",
-  publisher =    "World Scientific Pub",
-  address =      "Vietri (Italy)",
-  pages =        "69--77",
-  year =         "1991",
-}
-
-@InProceedings{Frasconi90,
-  author =       "P. Frasconi and M. Gori and G. Soda",
-  editor =       "E. Caianiello",
-  booktitle =    "Proc. of the 3rd Italian Workshop on Parallel
-                 Architectures and Neural Networks",
-  title =        "Recurrent Networks with Activation Feedback",
-  publisher =    "World Scientific Pub",
-  address =      "Vietri (Italy)",
-  pages =        "329--335",
-  year =         "1990",
-}
-
-@InProceedings{Frasconi97,
-  author =       "P. Frasconi and M. Gori and A. Sperduti",
-  booktitle =    "Proc. Int. Joint Conf. on Artificial Intelligence",
-  title =        "On the Efficient Classification of Data Structures by
-                 Neural Networks",
-  year =         "1997",
-}
-
-@Article{Frasconi-kde93,
-  author =       "P. Frasconi and M. Gori and M. Maggini and G. Soda",
-  title =        "Unified Integration of Explicit Rules and Learning by
-                 Example in Recurrent Networks",
-  journal =      ieeetrkde,
-  year =         "1993",
-  note =         "(in press)",
-}
-
-@Article{Frean90,
-  author =       "M Frean",
-  title =        "The Upstart Algorithm: {A} Method for Constructing and
-                 Training Feedforward Neural Networks",
-  journal =      nc,
-  volume =       "2",
-  pages =        "198--209",
-  year =         "1990",
-}
-
-@TechReport{Freund+Haussler-94,
-  author =       "Yoav Freund and David Haussler",
-  title =        "Unsupervised learning of distributions on binary
-                 vectors using two layer networks",
-  number =       "UCSC-CRL-94-25",
-  institution =  "University of California, Santa Cruz",
-  year =         "1994",
-}
-
-@InProceedings{Freund+Haussler92,
-  author =       "Yoav Freund and David Haussler",
-  editor =       NIPS4ed,
-  booktitle =    NIPS4,
-  title =        "A fast and exact learning rule for a restricted class
-                 of {Boltzmann} machines",
-  publisher =    "Morgan Kaufmann, San Mateo",
-  address =      "Denver, CO",
-  pages =        "912--919",
-  year =         "1992",
-}
-
-@Article{Freund-Schapire-98,
-  author =       "Yoav Freund and Robert E. Schapire",
-  title =        "Adaptive Game Playing using Multiplicative Weights",
-  journal =      "Games and Economic Behavior",
-  year =         "1998",
-}
-
-@InProceedings{Freund1995,
-  author =       "Yoav Freund and Robert E. Schapire",
-  booktitle =    "Proceedings of the Second European Conference on
-                 Computational Learning Theory",
-  title =        "A decision-theoretic generalization of on-line
-                 learning and an application to boosting",
-  publisher =    "Springer-Verlag",
-  pages =        "23--37",
-  year =         "1995",
-  ISBN =         "3-540-59119-2",
-}
-
-@TechReport{freund94,
-  author =       "Y. Freund and D. Haussler",
-  title =        "Unsupervised learning of distributions of binary
-                 vectors using two layer networks",
-  number =       "CRL-94-25",
-  institution =  "UCSC",
-  year =         "1994",
-}
-
-@Unpublished{Freund97,
-  author =       "Y. Freund and R. E. Schapire and P. Bartlett and W. S.
-                 Lee",
-  title =        "Boosting the margin: {A} new explanation for the
-                 effectiveness of voting methods",
-  year =         "1997",
-  note =         "Presented at the Machines that Learn Conference,
-                 Snowbird, Utah",
-}
-
-@InProceedings{Frey96,
-  author =       "Brendan J. Frey and Geoffrey E. Hinton and Peter Dayan",
-  editor =       NIPS8ed,
-  booktitle =    NIPS8,
-  title =        "Does the wake-sleep algorithm learn good density estimators?",
-  publisher =    "MIT Press, Cambridge, MA",
-  pages =        "661--670",
-  year =         "1996",
-}
-
-@InProceedings{Frey-Hinton96,
-  author =       "B. J. Frey and G. E. Hinton",
-  booktitle =    "Proceedings of the Data Compression Conference",
-  title =        "Free Energy Coding",
-  publisher =    "IEEE Computer Society Press",
-  address =      "Los Alamitos, CA",
-  pages =        "",
-  year =         "1997",
-}
-
-@Book{Frey98,
-  author =       "Brendan J. Frey",
-  title =        "Graphical models for machine learning and digital
-                 communication",
-  publisher =    "{MIT} Press",
-  year =         "1998",
-}
-
-@InProceedings{frey99estimating,
-  author =       "B. J. Frey and N. Jojic",
-  booktitle =    cvpr99,
-  title =        "Estimating Mixture Models of Images and Inferring
-                 Spatial Transformations Using the {EM} Algorithm",
-  pages =        "416--422",
-  year =         "1999",
-  URL =          "citeseer.ist.psu.edu/frey99estimating.html",
-}
-
-@InProceedings{FreyUAI00,
-  author =       "Brendan Frey and Nebojsa Jojic",
-  booktitle =    UAI00,
-  title =        "Learning Graphical Models of Images, Videos and Their
-                 Spatial Transformations",
-  publisher =    "Morgan Kaufmann",
-  address =      "San Francisco, CA",
-  pages =        "184--1",
-  year =         "2000",
-}
-
-@Article{Friedman+Fisher-99,
-  author =       "J. H. Friedman and N. I. Fisher",
-  title =        "Bump hunting in high-dimensional data",
-  journal =      "Statistics and Computing",
-  volume =       "9",
-  number =       "2",
-  pages =        "123--143",
-}
-
-@Article{Friedman+Hastie+Tibshirani:AdaBoost-theory,
-  author =       "J. Friedman and T. Hastie and R. Tibshirani",
-  title =        "Additive Logistic Regression: a Statistical View of
-                 Boosting",
-  journal =      "The Annals of Statistics",
-  volume =       "28",
-  pages =        "307--337",
-  year =         "2000",
-}
-
-@Article{Friedman-2001,
-  author =       "J. Friedman",
-  title =        "Greedy function approximation: a gradient boosting
-                 machine",
-  journal =      "Annals of Statistics",
-  volume =       "29",
-  pages =        "1180",
-  year =         "2001",
-}
-
-@Book{Friedman71,
-  author =       "A. Friedman",
-  title =        "Advanced Calculus",
-  publisher =    "Holt, Rinehart and Winston",
-  address =      "New York, NY",
-  year =         "1971",
-}
-
-@article{Friedman+Tukey-1974,
-    author = {J. H. Friedman and J. W. Tukey},
-    title = {A Projection Pursuit Algorithm for Exploratory Data Analysis},
-    journal = {IEEE Transactions on Computers},
-    volume = {23},
-    number = {9},
-    year = {1974},
-    issn = {0018-9340},
-    pages = {881--890},
-    doi = {http://dx.doi.org/10.1109/T-C.1974.224051},
-    publisher = {IEEE Computer Society},
-    address = {Washington, DC, USA},
-}
-
-@Article{Friedman87,
-  author =       "J. H. Friedman",
-  title =        "Exploratory projection pursuit",
-  journal =      "Journal of the American Statistical Association",
-  volume =       "92",
-  pages =        "249--266",
-  year =         "1987",
-}
-
-@Article{Friedman91,
-  author =       "J. H. Friedman",
-  title =        "Multivariate adaptive regression splines",
-  journal =      "The Annals of Statistics",
-  volume =       "19",
-  pages =        "1--141",
-  year =         "1991",
-}
-
-@TechReport{friedman94flexible,
-  author =       "J. Friedman",
-  title =        "Flexible metric nearest neighbor classification",
-  number =       "113",
-  institution =  "Stanford University Statistics Department",
-  year =         "1994",
-}
-
-@TechReport{Friedman98,
-  author =       "J. Friedman and T. Hastie and R. Tibshirani",
-  title =        "Additive logistic regression: {A} statistical view of
-                 boosting",
-  institution =  "Stanford University",
-  address =      "CA, USA",
-  year =         "1998",
-}
-
-@Misc{friedman99greedy,
-  author =       "J. Friedman",
-  title =        "Greedy Function Approximation: a Gradient Boosting
-                 Machine",
-  year =         "1999",
-  note =         "IMS 1999 Reitz Lecture, February 24, 1999, Dept. of
-                 Statistics, Stanford University",
-}
-
-@InProceedings{Friess98,
-  author =       "T. Friess and N. Cristianini and C. Campbel",
-  booktitle =    "Proceedings of the Fifteenth International Conference
-                 on Machine Learning",
-  title =        "The Kernel-Adatron: a Fast and Simple Learning
-                 Procedure for Support Vector Machines",
-  pages =        "188--196",
-  year =         "1998",
-}
-
-@InProceedings{Fritzke94,
-  author =       "B. Fritzke",
-  editor =       NIPS6ed,
-  booktitle =    NIPS6,
-  title =        "Supervised learning with growing cell structures",
-  publisher =    "Morgan Kaufmann",
-  year =         "1994",
-}
-
-@InProceedings{fs-lmcpa-98,
-  author =       "Yoav Freund and Robert E. Schapire",
-  booktitle =    "Proc. 11th Annu. Conf. on Comput. Learning Theory",
-  title =        "Large margin classification using the perceptron
-                 algorithm",
-  publisher =    "ACM Press, New York, NY",
-  pages =        "209--217",
-  year =         "1998",
-}
-
-@Article{fs-ppr-81,
-  author =       "J. H. Friedman and W. Stuetzle",
-  title =        "Projection Pursuit Regression",
-  journal =      "J. American Statistical Association",
-  volume =       "76",
-  number =       "376",
-  pages =        "817--823",
-  month =        dec,
-  year =         "1981",
-  comment =      "Good description of projection pursuit",
-}
-
-@Article{Fu86,
-  author =       "Y. Fu and P. W. Anderson",
-  title =        "Application of Statistical Mechanics to {NP}-Complete
-                 Problems in Combinatorial Optimization",
-  journal =      jpa,
-  volume =       "19",
-  pages =        "1605--1620",
-  year =         "1986",
-}
-
-@InProceedings{Fukumizu96,
-  author =       "K. Fukumizu",
-  editor =       NIPS8ed,
-  booktitle =    NIPS8,
-  title =        "Active Learning in Multilayer Perceptrons",
-  publisher =    "MIT Press, Cambridge, MA",
-  year =         "1996",
-}
-
-@Article{Fukumizu+Amari-2000,
-  author =      "Kenji Fukumizu and {Shun-ichi} Amari",
-  title =       "Local Minima and Plateaus in Hierarchical Structures of Multilayer Perceptrons",
-  journal =     "Neural Networks",
-  volume =      "13",
-  number =      "3",
-  pages =       "317--327",
-  year =        "2000",
-}
-
-@Article{Fukushima75,
-  author =       "K. Fukushima",
-  title =        "Cognitron: {A} Self-Organizing Multilayered Neural
-                 Network",
-  journal =      biocyb,
-  volume =       "20",
-  pages =        "121--136",
-  year =         "1975",
-}
-
-@Article{Fukushima80,
-  author =       "K. Fukushima",
-  title =        "Neocognitron: {A} Self-Organizing Neural Network Model
-                 for a Mechanism of Pattern Recognition Unaffected by
-                 Shift in Position",
-  journal =      biocyb,
-  volume =       "36",
-  pages =        "193--202",
-  year =         "1980",
-}
-
-@Article{Fukushima82,
-  author =       "K. Fukushima and S. Miyake",
-  key =          "Fukushima",
-  title =        "Neocognitron: {A} new algorithm for pattern
-                 recognition tolerant of deformations and shifts in
-                 position",
-  journal =      "Pattern Recognition",
-  volume =       "15",
-  pages =        "455--469",
-  year =         "1982",
-}
-
-@Article{Fukushima83,
-  author =       "K. Fukushima and S. Miyake and T. Ito",
-  title =        "Neocognitron: {A} Neural Network Model for a Mechanism
-                 of Visual Pattern Recognition",
-  journal =      ieeesmc,
-  volume =       "13",
-  year =         "1983",
-}
-
-@Article{Funahashi89,
-  author =       "K. Funahashi",
-  title =        "On the approximate realization of continuous mappings
-                 by neural networks",
-  journal =      "Neural Networks",
-  volume =       "2",
-  pages =        "183--192",
-  year =         "1989",
-}
-
-@Article{Funahashi93,
-  author =       "Ken-Ichi Funahashi and Yuichi Nakamura",
-  title =        "Approximation of Dynamical Systems by Continuous Time
-                 Recurrent Neural Networks",
-  journal =      nn,
-  volume =       "6",
-  pages =        "801--806",
-  year =         "1993",
-}
-
-@InProceedings{Fung-Crawford90,
-  author =       "R. M. Fung and S. L. Crawford",
-  booktitle =    "Eighth National Conference on Artificial Intelligence,
-                 Boston, Massachusetts, American Association for
-                 Artificial Intelligence",
-  title =        "A system for induction of probabilistic models",
-  pages =        "762--779",
-  year =         "1990",
-}
-
-@TechReport{Galland+Hinton89,
-  author =       "C. C. Galland and G. E. Hinton",
-  title =        "Deterministic learning in networks with asymmetric
-                 connectivity",
-  number =       "CRG-TR-89-6",
-  institution =  "Department of Computer Science, University of
-                 Toronto",
-  address =      "Toronto, Ontario",
-  year =         "1989",
-}
-
-@InProceedings{Gallant86,
-  author =       "S. I. Gallant",
-  booktitle =    "Eighth International Conference on Pattern
-                 Recognition",
-  title =        "Optimal Linear Discriminants",
-  publisher =    "IEEE, New York",
-  address =      "Paris 1986",
-  pages =        "849--852",
-  year =         "1986",
-}
-
-@Misc{gallant90perceptron-based,
-  author =       "S. Gallant",
-  title =        "Perceptron-based learning algorithms",
-  year =         "1990",
-  text =         "S. Gallant, Perceptron-based learning algorithms, IEEE
-                 Trans. Neural Networks 1, 179 (1990).",
-}
-
-@InProceedings{Gallinari87,
-  author =       "Patrick Gallinari and Yann {LeCun} and Sylvie Thiria and
-                 Francoise Fogelman-Soulie",
-  booktitle =    "Proceedings of COGNITIVA 87",
-  title =        "Memoires associatives distribuees",
-  address =      "Paris, La Villette",
-  year =         "1987",
-}
-
-@InProceedings{Gallinari88,
-  author =       "P. Gallinari and S. Thiria and F. Fogelman-Souli\'e",
-  booktitle =    "Proc. International Conference on Neural Networks
-                 '88",
-  title =        "Multilayer perceptrons and data analysis",
-  publisher =    "IEEE",
-  pages =        "391--399",
-  year =         "1988",
-}
-
-@InCollection{Gao-Goodman-Miao-2001,
-  author =       "J. Gao and J. Goodman and J. Miao",
-  booktitle =    "Computational Linguistics and Chinese Language
-                 Processing",
-  title =        "The Use of Clustering Techniques for Asian Language
-                 Modeling",
-  volume =       "6",
-  number =       "1",
-  pages =        "27--60",
-  year =         "2001",
-}
-
-@TechReport{Garcia-Perron95,
-  author =       "R. Garcia and P. Perron",
-  title =        "An analysis of the real interest rate under regime
-                 shift",
-  number =       "95s-5",
-  institution =  "CIRANO",
-  address =      "Montreal, Quebec, Canada",
-  year =         "1995",
-}
-
-@Article{Garcia-Perron96,
-  author =       "R. Garcia and P. Perron",
-  title =        "An analysis of the real interest rate under regime
-                 shift",
-  journal =      "The Review of Economics and Statistics",
-  year =         "1996",
-}
-
-@TechReport{Garcia-Schaller95,
-  author =       "R. Garcia and H. Schaller",
-  title =        "Are the effects of monetary policy asymmetric",
-  number =       "95s-6",
-  institution =  "CIRANO",
-  address =      "Montreal, Quebec, Canada",
-  year =         "1995",
-}
-
-@TechReport{Garcia95,
-  author =       "R. Garcia",
-  title =        "Asymptotic null distribution of the likelihood ratio
-                 test in Markov switching models",
-  number =       "95s-7",
-  institution =  "CIRANO",
-  address =      "Montreal, Quebec, Canada",
-  year =         "1995",
-}
-
-@TechReport{Garcia98,
-  author =       "R. Garcia and R. Gen\c{c}ay",
-  title =        "{Pricing and Hedging Derivative Securities with Neural
-                 Networks and a Homogeneity Hint}",
-  number =       "98s-35",
-  institution =  "CIRANO",
-  address =      "Montr\'eal, Qu\'ebec, Canada",
-  year =         "1998",
-}
-
-@Article{Gardner87,
-  author =       "E. Gardner",
-  title =        "Maximum Storage Capacity in Neural Networks",
-  journal =      eul,
-  volume =       "4",
-  pages =        "481--485",
-  year =         "1987",
-}
-
-@Article{Gardner88a,
-  author =       "E. Gardner",
-  title =        "The Space of Interactions in Neural Network Models",
-  journal =      jpa,
-  volume =       "21",
-  pages =        "257--270",
-  year =         "1988",
-}
-
-@Article{Gardner88b,
-  author =       "E. Gardner and B. Derrida",
-  title =        "Optimal Storage Properties of Neural Network Models",
-  journal =      jpa,
-  volume =       "21",
-  pages =        "271--284",
-  year =         "1988",
-}
-
-@Article{Gardner89a,
-  author =       "E. Gardner and B. Derrida",
-  title =        "Three Unfinished Works on the Optimal Storage Capacity
-                 of Networks",
-  journal =      jpa,
-  volume =       "22",
-  pages =        "1983--1994",
-  year =         "1989",
-}
-
-@Article{Gardner89b,
-  author =       "E. Gardner and H. Gutfreund and I. Yekutieli",
-  title =        "The Phase Space of Interactions in Neural Networks
-                 with Definite Symmetry",
-  journal =      jpa,
-  volume =       "22",
-  pages =        "1995--2008",
-  year =         "1989",
-}
-
-@Book{Garey79,
-  author =       "M. R. Garey and D. S. Johnson",
-  title =        "Computers and Intractability: {A} Guide to the Theory
-                 of {NP}-Completeness",
-  publisher =    "Freeman",
-  address =      "New York",
-  year =         "1979",
-}
-
-@InCollection{GarriguesP2008,
-  author =       "Pierre Garrigues and Bruno Olshausen",
-  editor =       NIPS20ed,
-  booktitle =    NIPS20,
-  title =        "Learning Horizontal Connections in a Sparse Coding
-                 Model of Natural Images",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  pages =        "505--512",
-  year =         "2008",
-}
-
-@InCollection{GarriguesP2008-small,
-  author =       "Pierre Garrigues and Bruno Olshausen",
-  booktitle =    "NIPS'20",
-  title =        "Learning Horizontal Connections in a Sparse Coding
-                 Model of Natural Images",
-  year =         "2008",
-}
-
-@Article{Gartner03,
-  author =       "T. G{\"a}rtner",
-  title =        "A survey of kernels for structured data",
-  journal =      "ACM SIGKDD Explorations Newsletter",
-  volume =       "5",
-  number =       "1",
-  pages =        "49--58",
-  year =         "2003",
-}
-
-@InProceedings{Gauvain:2003:icassp,
-  author =       "Jean-Luc Gauvain and L. Lamel and Holger Schwenk and
-                 G. Adda and L. Chen and F.\ Lef\`evre",
-  booktitle =    icassp,
-  title =        "Conversational Telephone Speech Recognition",
-  volume =       "1",
-  pages =        "212--215",
-  year =         "2003",
-}
-
-@InProceedings{Gaynier93,
-  author =       "R. J. Gaynier and T. Downs",
-  booktitle =    "IEEE International Conference on Neural Networks",
-  title =        "A Method of Training Multi-layer Networks with
-                 Heaviside Characteristics Using Internal
-                 Representations",
-  address =      "San Francisco, CA",
-  pages =        "1812--1817",
-  year =         "1993",
-}
-
-@InProceedings{GehlerP2006,
-  author =       "Peter V. Gehler and Alex D. Holub and Max Welling",
-  booktitle =    ICML06,
-  editor =       ICML06ed,
-  publisher =    ICML06publ,
-  title =        "The rate adapting poisson model for information
-                 retrieval and object recognition",
-  address =      "New York, NY, USA",
-  pages =        "337--344",
-  year =         "2006",
-  ISBN =         "1-59593-383-2",
-  doi =          "http://doi.acm.org/10.1145/1143844.1143887",
-  location =     "Pittsburgh, Pennsylvania",
-}
-
-@Article{Geman84,
-  author =       {Geman, Stuart and Geman, Donald},
-  title =        "Stochastic Relaxation, Gibbs Distributions, and the
-                 {Bayesian} Restoration of Images",
-  doi =          {10.1080/02664769300000058},
-  journal =      ieeetpami,
-  volume =       "6",
-  keywords =     {annealing, mrf, simulated},
-  month =        {November},
-  pages =        {721--741},
-  url =          {http://dx.doi.org/10.1080/02664769300000058},
-  year =         "1984",
-}
-
-@Article{Geman92,
-  author =       "S. Geman and E. Bienenstock and R. Doursat",
-  title =        "Neural Networks and the Bias/Variance Dilemma",
-  journal =      nc,
-  volume =       "4",
-  number =       "1",
-  pages =        "1--58",
-  year =         "1992",
-}
-
-@Article{Genest-Zideck-86,
-  author =       "C. Genest and J. V. Zideck",
-  title =        "Combining probability distributions: {A} critique and
-                 an annotated bibliography",
-  journal =      "Statistical Science",
-  volume =       "1",
-  pages =        "114--148",
-  year =         "1986",
-}
-
-@article{Geng+al-2005,
-    author    = {Xin Geng and De-Chuan Zhan and Zhi-Hua Zhou},
-    title     = {Supervised nonlinear dimensionality reduction for visualization and classification},
-    journal   = {IEEE Transactions on Systems, Man, and Cybernetics, Part B},
-    volume    = {35},
-    number    = {6},
-    year      = {2005},
-    pages     = {1098-1107},
-    ee        = {http://dx.doi.org/10.1109/TSMCB.2005.850151},
-    bibsource = {DBLP, http://dblp.uni-trier.de}
-}
-
-@Article{Geszti87,
-  author =       "T. Geszti and F. P\'azm\'andi",
-  title =        "Learning Within Bounds and Dream Sleep",
-  journal =      jpa,
-  volume =       "20",
-  pages =        "L1299--L1303",
-  year =         "1987",
-}
-
-@Book{Geszti90,
-  author =       "T. Geszti",
-  title =        "Physical Models of Neural Networks",
-  publisher =    "World Scientific",
-  address =      "Singapore",
-  year =         "1990",
-}
-
-@Article{Geweke1989,
-  author =       "J. Geweke",
-  title =        "Bayesian inference in econometric models using Monte
-                 carlo integration",
-  journal =      "Econometrica",
-  volume =       "57",
-  pages =        "1317--1339",
-  year =         "1989",
-}
-
-@InCollection{Gha94,
-  author =       "Z. Ghahramani",
-  booktitle =    "Proceedings of the 1993 Connectionist Models Summer
-                 School",
-  title =        "Solving inverse problems using an {EM} approach to
-                 density estimation",
-  publisher =    "Erlbaum",
-  address =      "Hillsdale, NJ",
-  year =         "1994",
-}
-
-@InProceedings{ghabea00,
-  author =       "Z. Ghahramani and M. J. Beal",
-  editor =       NIPS12ed,
-  booktitle =    NIPS12,
-  title =        "Variational inference for {Bayesian} mixtures of
-                 factor analysers",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  year =         "2000",
-  URL =          "citeseer.nj.nec.com/article/ghahramani00variational.html",
-}
-
-@TechReport{ghahramani96em,
-  author =       "Z. Ghahramani and G. E. Hinton",
-  title =        "The {EM} Algorithm for Mixtures of Factor Analyzers",
-  number =       "CRG-TR-96-1",
-  institution =  "Dpt. of Comp. Sci., Univ. of Toronto",
-  month =        jan,
-  year =         "1996",
-  URL =          "citeseer.nj.nec.com/ghahramani97em.html",
-}
-
-@TechReport{GhaJor93,
-  author =       "Z. Ghahramani and M. I. Jordan",
-  title =        "Function approximation via density estimation",
-  type =         "Computational Cognitive Science",
-  number =       "TR 9304",
-  institution =  "MIT",
-  address =      "Cambridge, MA",
-  year =         "1993",
-}
-
-@InProceedings{Gherrity89,
-  author =       "M. Gherrity",
-  booktitle =    ijcnn,
-  title =        "A Learning Algorithm for Analog, Fully Recurrent
-                 Neural Networks,",
-  publisher =    "IEEE Press",
-  address =      "Washington D.C.",
-  pages =        "643--644",
-  month =        jun,
-  year =         "1989",
-}
-
-@Article{Ghosh+Hwang-1989,
-  author =       "J. Ghosh and K. Hwang",
-  title =        "Mapping Neural Networks onto Message-Passing
-                 Multicomputers",
-  journal =      "Journal of Parallel and Distributed Computing",
-  volume =       "6",
-  number =       "2",
-  publisher =    "Academic Press",
-  pages =        "291--330",
-  year =         "1989",
-}
-
-@Article{Ghosn2003,
-  author =       "J. Ghosn and Y. Bengio",
-  title =        "Bias Learning, Knowledge Sharing",
-  journal =      "{IEEE} Transactions on Neural Networks",
-  volume =       "14",
-  pages =        "748--765",
-  month =        jul,
-  year =         "2003",
-  issue =        "4",
-}
-
-@TechReport{Ghysel93,
-  author =       "E. Ghysel",
-  title =        "A time series model with periodic stochastic regime
-                 switching",
-  number =       "C.R.D.E. Discussion paper 1093",
-  institution =  "C.R.D.E., Universite de Montreal",
-  address =      "Montreal, Quebec, Canada",
-  year =         "1993",
-}
-
-@book{Giarratano+Riley-2004,
-    author = {Giarratano, Joseph  C.  and Riley, Gary  D. },
-    howpublished = {Hardcover},
-    isbn = {0534384471},
-    month = {October},
-    posted-at = {2008-05-19 22:17:30},
-    priority = {2},
-    publisher = {{Course Technology}},
-    edition = {Fourth},
-    title = {Expert Systems: Principles and Programming},
-    url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&amp;path=ASIN/0534384471},
-    year = {2004}
-}
-
-
-@Article{Giles86,
-  author =       "Y. C. Lee and G. Doolen and H. H. Chen and G. Z. Sun
-                 and T. Maxwell and H. Y. Lee and C. L. Giles",
-  title =        "Machine Learning Using a Higher Order Correlation
-                 Network",
-  journal =      "Physica D",
-  volume =       "2",
-  number =       "1-3",
-  pages =        "276",
-  year =         "1986",
-}
-
-@article{giles:1987, 
-    author = {C. Lee Giles and Tom Maxwell}, 
-    journal = {Applied Optics}, 
-    keywords = {},
-    number = {23}, 
-    pages = {4972}, 
-    publisher = {OSA},
-    title = {Learning, Invariance, and Generalization in High-Order Neural Networks}, 
-    volume = {26}, 
-    year = {1987},
-    url = {http://ao.osa.org/abstract.cfm?URI=ao-26-23-4972},
-}
-
-@InProceedings{Giles90,
-  author =       "C. L. Giles and G. Z. Sun and H. H. Chen and Y. C. Lee
-                 and D. Chen",
-  editor =       NIPS2ed,
-  booktitle =    NIPS2,
-  title =        "Higher Order Recurrent Networks \& Grammatical
-                 Inference",
-  publisher =    "Morgan Kaufmann Publishers",
-  address =      "San Mateo, CA",
-  pages =        "380--387",
-  year =         "1990",
-}
-
-@InProceedings{Giles-nnsp92,
-  author =       "C. L. Giles and C. W Omlin",
-  editor =       "Kung and Fallside and Sorenson and Kamm",
-  booktitle =    "Neural Networks for Signal Processing II, Proceedings
-                 of the 1992 IEEE workshop",
-  title =        "Inserting Rules into Recurrent Neural Networks",
-  publisher =    "IEEE Press",
-  pages =        "13--22",
-  year =         "1992",
-}
-
-@Article{Giles94,
-  author =       "C. L. Giles and C. W. Omlin",
-  title =        "Extraction, Insertion and Refinement of Symbolic Rules
-                 in Dynamically-Driven Recurrent Neural Networks",
-  journal =      "Connection Science",
-  pages =        "",
-  year =         "1994",
-}
-
-@Article{Giles-nc92,
-  author =       "C. L. Giles and C. B. Miller and D. Chen and G. Z. Sun
-                 and H. H. Chen and Y. C. Lee",
-  title =        "Learning and Extracting Finite State Automata with
-                 Second-Order Recurrent Neural Networks",
-  journal =      nc,
-  volume =       "4",
-  number =       "3",
-  pages =        "393--405",
-  year =         "1992",
-}
-
-@Book{Gill81,
-  author =       "P. E. Gill and W. Murray and M. H. Wright",
-  title =        "Practical Optimization",
-  publisher =    "Academic Press",
-  year =         "1981",
-}
-
-@InProceedings{Gillman+Sipser94,
-  author =       "David Gillman and Michael Sipser",
-  booktitle =    colt94,
-  title =        "Inference and minimization of hidden Marko chains",
-  publisher =    "ACM",
-  pages =        "147--158",
-  year =         "1994",
-}
-
-@Book{Gilmore-74,
-  author =       "R. Gilmore",
-  title =        "{Lie} groups, {Lie} algebras and some of their
-                 applications",
-  publisher =    "Wiley",
-  address =      "New-York",
-  year =         "1974",
-}
-
-@InProceedings{Gingras-Bengio-Nadeau-2000,
-  author =       "F. Gingras and Y. Bengio and C. Nadeau",
-  editor =       "",
-  booktitle =    "Computational Finance 2000",
-  title =        "On Out-of-Sample Statistics for Time-Series",
-  publisher =    "",
-  location =     "London, U.K.",
-  pages =        "",
-  year =         "2000",
-}
-
-@InProceedings{chapados+bengio-2000,
-  author =       "N. Chapados and Y. Bengio",
-  editor =       "",
-  booktitle =    "Computational Finance 2000",
-  title =        "{VaR}-based Asset Allocation using Neural Networks",
-  publisher =    "",
-  pages =        "",
-  year =         "2000",
-}
-
-@InProceedings{Pigeon+Bengio-99,
-  author =       "S. Pigeon and Y. Bengio",
-  editor =       "",
-  booktitle =    "Proceedings of the Data Compression Conference, DCC'1999",
-  title =        "Binary Pseudowavelets and Application to Bilevel Image Processing",
-  publisher =    "",
-  pages =        "",
-  year =         "1999",
-}
-
-@InProceedings{Girard+Paugam-Moisy-1994,
-  author =       "D. Girard and H\'{e}l\`{e}ne Paugam-Moisy",
-  booktitle =    "Proceedings of the {IFIP} {WG10.3} Working Conference
-                 on Applications in Parallel and Distributed Computing",
-  title =        "Strategies of Weight Updating for Parallel
-                 Back-propagation",
-  publisher =    "North-Holland Publishing Co.",
-  address =      "Amsterdam, The Netherlands",
-  pages =        "335--336",
-  year =         "1994",
-  ISBN =         "0-444-81870-7",
-}
-
-@InProceedings{Girju+al-2003,
-  author =       "Roxana Girju and Adriana Badulescu and Dan Moldovan",
-  booktitle =    "NAACL '03: Proceedings of the 2003 Conference of the
-                 North American Chapter of the Association for
-                 Computational Linguistics on Human Language
-                 Technology",
-  title =        "Learning semantic constraints for the automatic
-                 discovery of part-whole relations",
-  publisher =    "Association for Computational Linguistics",
-  address =      "Morristown, NJ, USA",
-  pages =        "1--8",
-  year =         "2003",
-  location =     "Edmonton, Canada",
-}
-
-@Article{Girolami-2001,
-  author =       "M. Girolami",
-  title =        "Orthogonal series density estimation and the kernel
-                 eigenvalue problem",
-  journal =      "Neural Computation",
-  volume =       "14",
-  number =       "3",
-  pages =        "669--688",
-  year =         "2001",
-}
-
-@Misc{girosi97an,
-  author =       "F. Girosi",
-  title =        "An equivalence between sparse approximation and
-                 Support Vector Machines",
-  year =         "1997",
-  text =         "F. Girosi. An equivalence between sparse approximation
-                 and Support Vector Machines. A.I. Memo 1606, MIT
-                 Artificial Intelligence Laboratory, 1997. (available at
-                 the URL:
-                 http://www.ai.mit.edu/people/girosi/svm.html).",
-}
-
-@Article{Glauber63,
-  author =       "R. J. Glauber",
-  title =        "Time-Dependent Statistics of the Ising Model",
-  journal =      jmp,
-  volume =       "4",
-  pages =        "294--307",
-  year =         "1963",
-}
-
-@Book{GLM-book-89,
-  author =       "P. McCullagh and J. Nelder",
-  title =        "Generalized Linear Models",
-  publisher =    "Chapman and Hall",
-  address =      "London",
-  year =         "1989",
-}
-
-@InCollection{GlobersonA2006,
-  author =       "Amir Globerson and Sam Roweis",
-  editor =       NIPS18ed,
-  booktitle =    NIPS18,
-  title =        "Metric Learning by Collapsing Classes",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  pages =        "451--458",
-  year =         "2006",
-}
-
-@Book{Gluck90,
-  author =       "M. A. Gluck and D. E. Rumelhart",
-  title =        "Neuroscience and connectionist theory",
-  publisher =    "Lawrence Erlbaum, London",
-  year =         "1990",
-}
-
-@Article{Godin89,
-  author =       "C. Godin and P. Lockwood",
-  title =        "{DTW} Schemes for Continuous Speech Recognition: {A}
-                 Unified view",
-  journal =      cspla,
-  volume =       "3",
-  pages =        "169--198",
-  year =         "1989",
-}
-
-@book{Gold+Morgan-1999,
-    author = {Gold, Ben and Morgan, Nelson},
-    howpublished = {Hardcover},
-    isbn = {0471351547},
-    month = {July},
-    publisher = {Wiley},
-    title = {Speech and Audio Signal Processing: Processing and Perception of Speech and Music},
-    year = {1999}
-}
-
-@Book{Goldberg89,
-  author =       "D. E. Goldberg",
-  title =        "Genetic Algorithms in Search, Optimization, and
-                 Machine Learning",
-  publisher =    "Addison-Wesley",
-  address =      "Reading",
-  year =         "1989",
-}
-
-@Article{Goldfeld73,
-  author =       "S. M. Goldfeld and R. M. Quandt",
-  title =        "A Markov model for switching regressions",
-  journal =      "Journal of Econometrics",
-  volume =       "1",
-  pages =        "3--16",
-  year =         "1973",
-}
-
-@TechReport{Goldhor85,
-  author =       "R. S. Goldhor",
-  title =        "Representation of consonants in the peripheral
-                 auditory system: {A} modeling study of the
-                 correspondance between response properties and phonetic
-                 features",
-  number =       "505",
-  institution =  "RLE.",
-  publisher =    "MIT Press, Cambridge, MA",
-  year =         "1985",
-}
-
-@Article{Golomb90,
-  author =       "D. Golomb and N. Rubin and H. Sompolinsky",
-  title =        "Willshaw Model: Associative Memory with Sparse Coding
-                 and Low Firing Rates",
-  journal =      prA,
-  volume =       "41",
-  pages =        "1843--1854",
-  year =         "1990",
-}
-
-@Book{Golub+VanLoan-1996,
-  author =       "Gene H. Golub and Charles F. Van Loan",
-  title =        "Matrix Computations",
-  howpublished = "Paperback",
-  publisher =    "{The Johns Hopkins University Press}",
-  month =        oct,
-  year =         "1996",
-  ISBN =         "0-8018-5414-8",
-}
-
-@TechReport{Goodman-LM-2001,
-  author =       "Joshua Goodman",
-  title =        "A Bit of Progress in Language Modeling",
-  number =       "MSR-TR-2001-72",
-  institution =  "Microsoft Research",
-  address =      "Redmond, Washington",
-  year =         "2001",
-}
-
-@InProceedings{Goodman2001,
-  author =       "J. Goodman",
-  booktitle =    icassp,
-  title =        "Classes for Fast Maximum Entropy Training",
-  address =      "Utah",
-  year =         "2001",
-}
-
-@InProceedings{Gori-ijcnn89,
-  author =       "M. Gori and Y. Bengio and R. \mbox{De Mori}",
-  booktitle =    ijcnn,
-  title =        "{BPS}: {A} Learning Algorithm for Capturing the
-                 Dynamical Nature of Speech",
-  publisher =    "IEEE, New York",
-  address =      "Washington D.C.",
-  pages =        "643--644",
-  year =         "1989",
-}
-
-@InProceedings{Gori-nimes89,
-  author =       "M Gori",
-  booktitle =    "Proceedings of Neuro-Nimes",
-  title =        "An Extension of {BPS}",
-  address =      "Nimes (France)",
-  pages =        "83--93",
-  year =         "1989",
-}
-
-@Article{Gori-pami91,
-  author =       "M. Gori and A. Tesi",
-  title =        "On the problem of local minima in Backpropagation",
-  journal =      ieeetpami,
-  volume =       "PAMI-14",
-  number =       "1",
-  pages =        "76--86",
-  year =         "1992",
-}
-
-@TechReport{Gori-tr94,
-  author =       "M. Gori and M. Maggini and G. Soda",
-  title =        "Insertion of Finite State Automata into Recurrent
-                 Radial Basis Function Networks",
-  number =       "DSI-17/93",
-  institution =  "Universit\`a di Firenze (Italy)",
-  year =         "1993",
-  note =         "(submitted)",
-  OPTannote =    "",
-}
-
-@InProceedings{GoriNimes,
-  author =       "M. Gori",
-  booktitle =    "Proceedings of Neuro-Nimes",
-  title =        "An Extension of {BPS}",
-  address =      "Nimes (France)",
-  pages =        "83--93",
-  month =        nov,
-  year =         "1989",
-}
-
-@Article{Gorman88a,
-  author =       "R. P. Gorman and T. J. Sejnowski",
-  title =        "Analysis of Hidden Units in a Layered Network Trained
-                 to Classify Sonar Targets",
-  journal =      nn,
-  volume =       "1",
-  pages =        "75--89",
-  year =         "1988",
-}
-
-@Article{Gorman88b,
-  author =       "R. P. Gorman and T. J. Sejnowski",
-  title =        "Learned Classification of Sonar Targets Using a
-                 Massively-Parallel Network",
-  journal =      ieeetassp,
-  volume =       "36",
-  pages =        "1135--1140",
-  year =         "1988",
-}
-
-@Unpublished{Gorse94,
-  author =       "D. Gorse and J. G. Taylor and T. G. Clarkson",
-  title =        "A pulse-based reinforcement algorithm for learning
-                 continuous functions",
-  year =         "1994",
-  note =         "Submitted to WCNN '94 San Diego",
-}
-
-@Article{Goudreau-trnn93,
-  author =       "M. W. Goudreau and C. L. Giles and S. T. Chakradhar
-                 and D. Chen",
-  title =        "First-order vs. second-order single layer recurrent
-                 neural networks",
-  journal =      ieeetrnn,
-  year =         "1993",
-  note =         "(in press)",
-}
-
-@Article{Goudreau93tb,
-  author =       "M. W. Goudreau and C. L. Giles and S. T. Chakradhar
-                 and D. Chen",
-  title =        "First-Order Vs. Second-Order Single Layer Recurrent
-                 Neural Networks",
-  journal =      "IEEE Transactions on Neural Networks",
-  year =         "1993",
-}
-
-@inproceedings{Gould+al:NIPS09,
-  author = {S. Gould and T. Gao and D. Koller},
-  title = {Region-based Segmentation and Object Detection},
-  booktitle =    "Advances in Neural Information Processing Systems (NIPS 2009)",
-  year = 2009,
-}
-
-@Article{goutte97,
-  author =       "C. Goutte",
-  title =        "Note on free lunches and cross-validation",
-  journal =      "Neural Computation",
-  volume =       "9",
-  number =       "6",
-  pages =        "1053--1059",
-  year =         "1997",
-}
-
-@Article{Gower-68,
-  author =       "J. C. Gower",
-  title =        "Adding a point to vector diagrams in multivariate
-                 analysis",
-  journal =      "Biometrika",
-  volume =       "55",
-  number =       "3",
-  pages =        "582--585",
-  year =         "1968",
-}
-
-@InProceedings{Graepel2000,
-  author =       "Thore Graepel and Ralf Herbrich and John
-                 Shawe-Taylor",
-  booktitle =    "Thirteenth Annual Conference on Computational Learning
-                 Theory, 2000",
-  title =        "Generalization error bounds for sparse linear
-                 classifiers",
-  publisher =    "Morgan Kaufmann",
-  year =         "2000",
-  note =         "in press",
-}
-
-@InProceedings{Graepel99,
-  author =       "T. Graepel and R. Herbrich and P. Bollmann-Sdorra and
-                 K. Obermayer",
-  editor =       NIPS12ed,
-  booktitle =    NIPS12,
-  title =        "Classification on Pairwise Proximity Data",
-  year =         "1999",
-}
-
-@InProceedings{graf-90a,
-  author =       "H. P. Graf and D. Henderson",
-  booktitle =    "ISSCC Digest",
-  title =        "A Reconfigurable {CMOS} Neural Network",
-  organization = "ISSCC",
-  year =         "1990",
-}
-
-@InProceedings{Graf86,
-  author =       "H. P. Graf and L. D. Jackel and R. E. Howard and B.
-                 Straughn and J. S. Denker and W. Hubbard and D. M.
-                 Tennant and D. Schwartz",
-  editor =       "J. S. Denker",
-  booktitle =    snowbird,
-  title =        "{VLSI} Implementation of a Neural Network Memory with
-                 Several Hundreds of Neurons",
-  publisher =    "American Institute of Physics, New York",
-  address =      "Snowbird 1986",
-  pages =        "182--187",
-  year =         "1986",
-}
-
-@InProceedings{Graf88,
-  author =       "D. H. Graf and W. R. LaLonde",
-  booktitle =    icnn,
-  title =        "A Neural Controller for Collision-Free Movement of
-                 General Robot Manipulators",
-  volume =       "1",
-  publisher =    "IEEE, New York",
-  address =      "San Diego 1988",
-  pages =        "77--84",
-  year =         "1988",
-}
-
-@InProceedings{Graf92,
-  author =       "H. P. Graf and C. R. Nohl and J. Ben",
-  editor =       NIPS4ed,
-  booktitle =    NIPS4,
-  title =        "Image segmentation with networks of variable scales",
-  publisher =    "Morgan Kaufmann",
-  address =      "San Mateo CA",
-  pages =        "480--487",
-  year =         "1992",
-}
-
-@InProceedings{Grandvalet98a,
-  author =       "Y. Grandvalet",
-  editor =       "L. Niklasson and M. Boden and T. Ziemske",
-  booktitle =    "ICANN'98",
-  title =        "Least absolute shrinkage is equivalent to quadratic
-                 penalization",
-  volume =       "1",
-  publisher =    "Springer",
-  pages =        "201--206",
-  year =         "1998",
-  series =       "Perspectives in Neural Computing",
-}
-
-@InProceedings{Grandvalet98a-short,
-  author =       "Y. Grandvalet",
-  booktitle =    "ICANN'98",
-  title =        "Least absolute shrinkage is equivalent to quadratic
-                 penalization",
-  pages =        "201--206",
-  year =         "1998",
-}
-
-@InProceedings{GrandvaletY2005,
-  author =       "Yves Grandvalet and Yoshua Bengio",
-  editor =       NIPS17ed,
-  booktitle =    NIPS17,
-  title =        "{Semi-supervised Learning by Entropy
-                 Minimization}",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  month =        dec,
-  year =         "2005",
-}
-%deprecate this version as we need to put the date of publication not the date of the conference. use GrandvaletY2005 instead.
-@InProceedings{GrandvaletY2004,
-  author =       "Yves Grandvalet and Yoshua Bengio",
-  editor =       NIPS17ed,
-  booktitle =    NIPS17,
-  title =        "{Semi-supervised Learning by Entropy
-                 Minimization}",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  month =        dec,
-  year =         "2005",
-}
-
-@INCOLLECTION {GrandvaletY2006,
-title = {Entropy Regularization},
-author = {Grandvalet, Yves and Bengio, Yoshua},
-editor = {Chapelle, Olivier and {Sch\"{o}lkopf}, Bernhard and Zien, Alexander},
-booktitle = {Semi-Supervised Learning},
-year = {2006},
-pages = {151--168},
-publisher = {{MIT} Press},
-}
-
-@Article{GrangerNewbold76,
-  author =       "C. W. J. Granger and P. Newbold",
-  title =        "Forecasting transformed series",
-  journal =      "J. Roy. Statist. Soc. B",
-  volume =       "38",
-  pages =        "189--203",
-  year =         "1976",
-}
-
-@InProceedings{Gray-Moore-2003,
-  author =       "Alexander Gray and Andrew Moore",
-  booktitle =    "Artificial Iintelligence and Statistics",
-  title =        "Rapid Evaluation of Multiple Density Models",
-  year =         "2003",
-}
-
-@Article{Gray84,
-  author =       "R. M. Gray",
-  title =        "Vector Quantization",
-  journal =      ieeeassp,
-  pages =        "4--29",
-  month =        apr,
-  year =         "1984",
-}
-
-@Article{Greenwood+Durand60,
-  author =       "T. A. Greenwood and D. Durand",
-  title =        "",
-  journal =      "Technometrics",
-  volume =       "2",
-  pages =        "55--56",
-  year =         "1960",
-}
-
-@InProceedings{GregoryD2007,
-  author =       "Gregory Druck and Chris Pal and Andrew Mccallum and
-                 Xiaojin Zhu",
-  booktitle =    "KDD '07: Proceedings of the 13th ACM SIGKDD
-                 international conference on Knowledge discovery and
-                 data mining",
-  title =        "Semi-supervised classification with hybrid
-                 generative/discriminative methods",
-  publisher =    "ACM",
-  address =      "New York, NY, USA",
-  pages =        "280--289",
-  year =         "2007",
-  OPTciteulike-article-id = "2304687",
-  OPTdoi =       "10.1145/1281192.1281225",
-  OPTisbn =      "9781595936097",
-  OPTkeywords =  "classification",
-  OPTpriority =  "2",
-}
-  %url =       "http://portal.acm.org/citation.cfm?id=1281192.1281225",
-
-@Article{Gribskov87,
-  author =       "M. Gribskov and M. McLachlan and D. Eisenber",
-  title =        "Profile analysis: detection of distantly related
-                 proteins",
-  journal =      PNAS,
-  volume =       "84",
-  pages =        "4355--4358",
-  year =         "1987",
-}
-
-@TechReport{Griffin-Holub-Perona-07,
-  author =       "Gregory Griffin and Alex Holub and Pietro Perona",
-  title =        "Caltech-256 Object Category Dataset",
-  number =       "Technical Report 7694",
-  institution =  "California Institute of Technology",
-  year =         "2007",
-}
-
-@Article{grigoriev95,
-  author =       "Dima Grigoriev and Marek Karpinski and Andrew Chi-Chih
-                 Yao",
-  title =        "An Exponential Lower Bound on the Size of Algebraic
-                 Decision Trees for {MAX}",
-  journal =      "Electronic Colloquium on Computational Complexity
-                 (ECCC)",
-  volume =       "2",
-  number =       "057",
-  year =         "1995",
-}
-
-@Article{Grimes-Rao-2005,
-  author =       "D. B. Grimes and R. P. N. Rao",
-  title =        "Bilinear Sparse Coding for Invariant Vision",
-  journal =      "Neural Computation",
-  volume =       "17",
-  number =       "1",
-  pages =        "47--73",
-  year =         "2005",
-}
-
-@Article{Grossberg67,
-  author =       "S. Grossberg",
-  title =        "Nonlinear Difference-Differential Equations in
-                 Prediction and Learning Theory",
-  journal =      PNAS,
-  volume =       "58",
-  pages =        "1329--1334",
-  year =         "1967",
-}
-
-@Article{Grossberg68a,
-  author =       "S. Grossberg",
-  title =        "Some Nonlinear Networks Capable of Learning a Spatial
-                 Pattern of Arbitrary Complexity",
-  journal =      PNAS,
-  volume =       "59",
-  pages =        "368--372",
-  year =         "1968",
-}
-
-@Article{Grossberg68b,
-  author =       "S. Grossberg",
-  title =        "Some Physiological and Biochemical Consequences of
-                 Psychological Postulates",
-  journal =      PNAS,
-  volume =       "60",
-  pages =        "758--765",
-  year =         "1968",
-}
-
-@Article{Grossberg69,
-  author =       "S. Grossberg",
-  title =        "Embedding Fields: {A} Theory of Learning with
-                 Physiological Implications",
-  journal =      jmpsych,
-  volume =       "6",
-  pages =        "209--239",
-  year =         "1969",
-}
-
-@Article{Grossberg72,
-  author =       "S. Grossberg",
-  title =        "Neural Expectation: Cerebellar and Retinal Analogs of
-                 Cells Fired by Learnable or Unlearned Pattern Classes",
-  journal =      kyb,
-  volume =       "10",
-  pages =        "49--57",
-  year =         "1972",
-}
-
-@Article{Grossberg76a,
-  author =       "S. Grossberg",
-  title =        "Adaptive Pattern Classification and Universal
-                 Recoding: {I}. Parallel Development and Coding of
-                 Neural Feature Detectors",
-  journal =      biocyb,
-  volume =       "23",
-  year =         "1976",
-}
-
-@Article{Grossberg76b,
-  author =       "S. Grossberg",
-  title =        "Adaptive Pattern Classification and Universal
-                 Recoding: {II}. Feedback, Expectation, Olfaction,
-                 Illusions",
-  journal =      biocyb,
-  volume =       "23",
-  pages =        "187--202",
-  year =         "1976",
-}
-
-@Article{Grossberg80,
-  author =       "S. Grossberg",
-  title =        "How Does the Brain Build a Cognitive Code?",
-  journal =      psyrev,
-  volume =       "87",
-  year =         "1980",
-}
-
-@Book{Grossberg87a,
-  author =       "S. Grossberg",
-  title =        "The Adaptive Brain",
-  volume =       "1--2",
-  publisher =    "Elsevier",
-  address =      "Amsterdam",
-  year =         "1987",
-}
-
-@Article{Grossberg87b,
-  author =       "S. Grossberg",
-  title =        "Competitive Learning: From Interactive Activation to
-                 Adaptive Resonance",
-  journal =      cogsci,
-  volume =       "11",
-  pages =        "23--63",
-  year =         "1987",
-}
-
-@inproceedings{Grosse-2007,
- author = {Roger Grosse and Rajat Raina and Helen Kwong and Andrew Y. Ng},
- title = {Shift-Invariant Sparse Coding for Audio Classification}, 
- booktitle = UAI07,
- year = 2007,
-}
-
-@InProceedings{Grossman-nips89,
-  author =       "T. Grossman R. Meir and E. Domany",
-  editor =       NIPS1ed,
-  booktitle =    NIPS1,
-  title =        "Learning by choice of internal representation",
-  publisher =    "Morgan Kaufmann",
-  address =      "San Mateo, CA",
-  pages =        "73--80",
-  year =         "1989",
-}
-
-@Article{Grossman89,
-  author =       "T. Grossman and R. Meir and E. Domany",
-  title =        "Learning by Choice of Internal Representations",
-  journal =      cs,
-  volume =       "2",
-  pages =        "555--575",
-  year =         "1989",
-}
-
-@InProceedings{Grossman90,
-  author =       "T. Grossman",
-  editor =       NIPS2ed,
-  booktitle =    NIPS2,
-  title =        "The {CHIR} Algorithm for Feed Forward Networks with
-                 Binary Weights",
-  publisher =    "Morgan Kaufmann, San Mateo",
-  address =      "Denver, CO",
-  pages =        "516--523",
-  year =         "1990",
-}
-
-@Article{Guillery2005,
-  author =       "R. W. Guillery",
-  title =        "Is postnatal neocortical maturation hierarchical?",
-  journal =      "Trends in Neuroscience",
-  volume =       "28",
-  number =       "10",
-  pages =        "512--517",
-  month =        oct,
-  year =         "2005",
-}
-
-@InCollection{Gull88,
-  author =       "S. F. Gull",
-  editor =       "G. Erickson and C. Smith",
-  booktitle =    "Maximum Entropy and {Bayesian} Methods in Science and
-                 Engineering",
-  title =        "{Bayesian} inductive inference and maximum entropy",
-  volume =       "1",
-  publisher =    "Kluwer",
-  address =      "Dordrecht",
-  pages =        "53--74",
-  year =         "1988",
-}
-
-@Article{gullapalli:nn:1990,
-  author =       "V. Gullapalli",
-  title =        "A Stochastic Reinforcement Learning Algorithm for
-                 Learning Real-Valued Functions",
-  journal =      nn,
-  volume =       "3",
-  pages =        "671--692",
-  year =         "1990",
-}
-
-@Article{Gunn+Kandola01,
-  author =       "S. R. Gunn and J. Kandola",
-  title =        "Structural Modelling with Sparse Kernels",
-  journal =      "Machine Learning",
-  volume =       "special issue on New Methods for Model Combination and
-                 Model Selection",
-  year =         "2001",
-  note =         "to appear",
-}
-
-@inproceedings{Guo+Schuurmans-2007,
-author = "Guo, Y. and Schuurmans, D.",
-title = "Convex relaxations of latent variable training",
-editor =    NIPS20ed,
-booktitle = NIPS20,
-year = 2007,
-}
-
-@inproceedings{guoschuurmans07b,
-author = "Guo, Y. and Schuurmans, D.",
-title = "Discriminative batch mode active learning",
-editor =    NIPS20ed,
-booktitle = NIPS20,
-year = 2007,
-}
-
-@inproceedings{Guo+Schuurmans-2008,
-author = "Guo, Y. and Schuurmans, D.",
-title = "Efficient global optimization for exponential family {PCA} and 
-low-rank matrix factorization",
-booktitle = "Proceedings of the Forty-sixth Annual Allerton Conference on
-Communication, Control, and Computing (Allerton)",
-year = 2008,
-}
-
-@Article{Gutfreund88a,
-  author =       "H. Gutfreund",
-  title =        "Neural Networks with Hierarchically Correlated
-                 Patterns",
-  journal =      prA,
-  volume =       "37",
-  pages =        "570--577",
-  year =         "1988",
-}
-
-@Article{Gutfreund88b,
-  author =       "H. Gutfreund and M. M\'ezard",
-  title =        "Processing of Temporal Sequences in Neural Networks",
-  journal =      prl,
-  volume =       "61",
-  pages =        "235--238",
-  year =         "1988",
-}
-
-@InProceedings{Gutzmann87,
-  author =       "K. Gutzmann",
-  editor =       "M. Caudill and C. Butler",
-  booktitle =    icnn,
-  title =        "Combinatorial Optimization Using a Continuous State
-                 {Boltzmann} Machine",
-  volume =       "3",
-  publisher =    "IEEE, New York",
-  address =      "San Diego 1987",
-  pages =        "721--734",
-  year =         "1987",
-}
-
-@Article{guyon-91,
-  author =       "I. Guyon and P. Albrecht and Y. {Le Cun} and J. S.
-                 Denker and W. Hubbard",
-  title =        "design of a neural network character recognizer for a
-                 touch termin al",
-  journal =      "Pattern Recognition",
-  volume =       "24",
-  number =       "2",
-  pages =        "105--119",
-  year =         "1991",
-}
-
-@InProceedings{Guyon92,
-  author =       "I. Guyon and V. Vapnik and B. Boser and L. Bottou and
-                 S. A. Solla",
-  editor =       NIPS4ed,
-  booktitle =    NIPS4,
-  title =        "Structural Risk Minimization for Character
-                 Recognition",
-  publisher =    "Morgan Kaufmann",
-  address =      "San Mateo CA",
-  pages =        "471--479",
-  year =         "1992",
-}
-
-@InCollection{Guyon92b,
-  author =       "I. Guyon",
-  editor =       "S. Impedovo",
-  booktitle =    "From Pixels to Features III",
-  title =        "Writer independent and writer adaptive neural network
-                 for on-line character recognition",
-  publisher =    "Elsevier",
-  address =      "Amsterdam",
-  pages =        "493--506",
-  year =         "1992",
-}
-
-@InProceedings{Guyon93,
-  author =       "I. Guyon and B. Boser and V. Vapnik",
-  editor =       NIPS5ed,
-  booktitle =    NIPS5,
-  title =        "Automatic Capacity Tuning of Very Large {VC}-dimension
-                 Classifiers",
-  publisher =    "Morgan Kaufmann",
-  address =      "Denver, CO",
-  pages =        "147--155",
-  year =         "1993",
-}
-
-@InProceedings{Guyon95,
-  author =       "I. Guyon and F. Pereira",
-  booktitle =    ICDAR95,
-  title =        "Design of a linguistic postprocessor using variable
-                 memory length {Markov} models",
-  publisher =    "IEEE Computer Society Press",
-  address =      "Montreal, Canada",
-  pages =        "454--457",
-  month =        aug,
-  year =         "1995",
-}
-
-@InCollection{Guyon96,
-  author =       "I. Guyon and M. Schenkel and J. Denker",
-  editor =       "P. S. P. Wang and H. Bunke",
-  booktitle =    "Handbook on Optical Character Recognition and Document
-                 Image Analysis",
-  title =        "Overview and synthesis of on-line cursive handwriting
-                 recognition techniques",
-  publisher =    "World Scientific",
-  year =         "1996",
-}
-
-@article{Guyon+Elisseeff-2003,
-    address = {Cambridge, MA},
-    author = {Guyon, Isabelle   and Elisseeff, Andre},
-    issn = {1533-7928},
-    journal = jmlr,
-    pages = {1157--1182},
-    publisher = {MIT Press},
-    title = {An introduction to variable and feature selection},
-    volume = {3},
-    year = {2003}
-}
-    %url = {http://portal.acm.org/citation.cfm?id=944968},
-
-@book{Guyon+al-2006,
-        editor = "Isabelle Guyon and Steve Gunn and Masoud Nikravesh and Lofti Zadeh",
-        title =    "Feature Extraction, Foundations and Applications",
-        publisher =    "Springer",
-        year =         "2006",
-}
-
-
-@Article{Gyorgyi90a,
-  author =       "G. Gy{\"o}rgyi",
-  title =        "Inference of a Rule by a Neural Network with Thermal
-                 Noise",
-  journal =      prl,
-  volume =       "64",
-  pages =        "2957--2960",
-  year =         "1990",
-}
-
-@InCollection{Gyorgyi90b,
-  author =       "G. Gyorgyi and N. Tishby",
-  editor =       "W. K. Theumann and R. Koeberle",
-  booktitle =    "Neural Networks and Spin Glasses",
-  title =        "Statistical Theory of Learning a Rule",
-  publisher =    "World Scientific",
-  address =      "Singapore",
-  year =         "1990",
-}
-
-@InProceedings{ha93,
-  author =       "J. Y. Ha and S. C. Oh and J. H. Kim and Y. B. Kwon",
-  booktitle =    "Third International Workshop on Frontiers in
-                 Handwriting Recognition",
-  title =        "Unconstrained handwritten word recognition with
-                 interconnected hidden {Markov} models",
-  publisher =    "IAPR",
-  address =      "Buffalo",
-  pages =        "455--460",
-  month =        may,
-  year =         "1993",
-}
-
-@Article{haasdonk2002tdk,
-  author =       "B. Haasdonk and D. Keysers",
-  title =        "{Tangent distance kernels for support vector
-                 machines}",
-  journal =      "Proc. of the 16th ICPR",
-  volume =       "2",
-  pages =        "864--868",
-  year =         "2002",
-}
-
-@inproceedings{hadsell-chopra-lecun-06,
-  author    = {Hadsell, Raia and Chopra, Sumit and {LeCun}, Yann},
-  title     = {Dimensionality Reduction by Learning an Invariant Mapping},
-  booktitle = cvpr06,
-  publisher = {IEEE Press},
-  pages     = {1735--1742},
-  year      = 2006,
-  original  = {orig/hadsell-chopra-lecun-06.pdf},
-}
-
-@inproceedings{hadsell-chopra-lecun-06-small,
-  author    = {Hadsell, Raia and Chopra, Sumit and {LeCun}, Yann},
-  title     = {Dimensionality Reduction by Learning an Invariant Mapping},
-  booktitle = {CVPR'2006},
-  publisher = {IEEE Press},
-  year      = 2006,
-  original  = {orig/hadsell-chopra-lecun-06.pdf},
-}
-
-@inproceedings{hadsell-iros-08,
-  author    = {Hadsell, Raia and Erkan, Ayse and Sermanet, Pierre and Scoffier, Marco and Muller, Urs and {LeCun}, Yann},
-  title     = {Deep Belief Net Learning in a Long-Range Vision System for Autonomous Off-Road Driving},
-  booktitle = {Proc. Intelligent Robots and Systems (IROS'08)},
-  pages     = {628--633},
-  year      = {2008},
-  original  = {orig/hadsell-iros-08.pdf},
-}
- %url = "http://www.cs.nyu.edu/~raia/docs/iros08-farod.pdf",
-
-@techreport{Haffner+96,
-  author      = {P. Haffner and L. Bottou and J. Bromley and C. J. C. Burges and T. Cauble and Y. {Le Cun} and C. Nohl and C. Stanton and C. Stenard and P. Vincent},
-  title       = {The {HCAR50} check amount reading system},
-  institution = {Lucent Technologies, Bell Labs Innovation},
-  address     = {Holmdel, New-Jersey},
-  year        = {1996},
-  note        = {Forthcoming publication},
-}
-
-@inproceedings{Haffner89,
-  author    = {P. Haffner and A. Waibel and K. Shikano},
-  title     = {Fast back-propagation learning methods for large phonemic neural networks},
-  booktitle = {Proceedings of Eurospeech'89},
-  year      = {1989},
-}
-
-@inproceedings{Haffner91,
-  author    = {P. Haffner and M. Franzini and A. Waibel},
-  title     = {Integrating Time Alignment and Neural Networks for High Performance Continuous Speech Recognition},
-  booktitle = icassp,
-  address   = {Toronto},
-  pages     = {105--108},
-  year      = {1991},
-}
-
-@book{HAJ90,
-  author    = {X. D. Huang and Y. Ariki and M. Jack},
-  title     = {Hidden Markov Models for Speech Recognition},
-  publisher = {Edinburgh University Press},
-  address   = {Edinburgh},
-  year      = {1990},
-}
-
-@inproceedings{HagiwaraK2000,
-  author    = {Hagiwara, Katsuyuki and Kuno, Kazuhiro},
-  title     = {Regularization Learning and Early Stopping in Linear Networks},
-  booktitle = ijcnn,
-  publisher = {IEEE Computer Society},
-  address   = {Washington, DC, USA},
-  pages     = {4511},
-  year      = {2000},
-  isbn      = {0-7695-0619-4},
-}
-
-@techreport{Ham2003,
-  author      = {J. Ham and D. D. Lee and S. Mika and B. Sch{\"o}lkopf},
-  title       = {A kernel view of the dimensionality reduction of manifolds},
-  number      = {TR-110},
-  institution = {Max Planck Institute for Biological Cybernetics},
-  address     = {Germany},
-  year        = {2003},
-}
-
-@article{Hamilton88,
-  author  = {J. D. Hamilton},
-  title   = {Rational-Expectations Econometric Analysis of Changes in Regime},
-  journal = {Journal of Economic Dynamics and Control},
-  volume  = {12},
-  pages   = {385--423},
-  year    = {1988},
-}
-
-@article{hamilton89,
-  author  = {J. D. Hamilton},
-  title   = {A new approach to the economic analysis of non-stationary time series and the business cycle},
-  journal = {Econometrica},
-  volume  = {57},
-  number  = {2},
-  pages   = {357--384},
-  month   = mar,
-  year    = {1989},
-}
-
-@article{Hamilton90,
-  author  = {J. D. Hamilton},
-  title   = {Analysis of time series subject to changes in regime},
-  journal = {Journal of Econometrics},
-  volume  = {45},
-  pages   = {39--70},
-  year    = {1990},
-}
-
-@incollection{Hamilton93,
-  author    = {J. D. Hamilton},
-  title     = {State-Space Models},
-  editor    = {R. Engle and D. {McFadden}},
-  booktitle = {Handbook of Econometrics},
-  publisher = {North Holland},
-  address   = {New York},
-  year      = {1993},
-}
-
-@article{Hamilton94,
-  author  = {J. D. Hamilton and R. Susmel},
-  title   = {Autoregressive conditional heteroskedasticity and changes in regime},
-  journal = {Journal of Econometrics},
-  volume  = {64},
-  number  = {1--2},
-  pages   = {307--333},
-  year    = {1994},
-}
-
-@article{Hamilton96,
-  author  = {J. D. Hamilton},
-  title   = {Specification testing in {Markov}-switching time-series models},
-  journal = {Journal of Econometrics},
-  volume  = {70},
-  pages   = {127--157},
-  year    = {1996},
-}
-
-@misc{Hammersley+Clifford-1971,
-  author       = {John M. Hammersley and Peter Clifford},
-  title        = {Markov fields on finite graphs and lattices},
-  howpublished = {Unpublished manuscript},
-  year         = 1971,
-}
-
-@inproceedings{HammondSimoncelli07,
-  author    = {David K. Hammond and Eero P. Simoncelli},
-  title     = {A Machine Learning Framework for Adaptive Combination of Signal Denoising Methods},
-  booktitle = ICIP07,
-  volume    = {6},
-  pages     = {29--32},
-  year      = {2007},
-}
-
-@article{hampshire90,
-  author  = {John B. Hampshire and Alexander H. Waibel},
-  title   = {A Novel Objective Function for Improved Phoneme Recognition Using Time-Delay Neural Networks},
-  journal = {IEEE Transactions on Neural Networks},
-  volume  = {1},
-  number  = {2},
-  pages   = {216--228},
-  month   = jun,
-  year    = {1990},
-}
-
-@inproceedings{HAMPSHIRE92A,
-  author    = {J. B. Hampshire and B. V. K. Vijaya Kumar},
-  title     = {Shooting Craps in Search of an Optimal Strategy for Training Connectionist Pattern Classifiers},
-  editor    = NIPS4ed,
-  booktitle = NIPS4,
-  publisher = {Morgan Kaufmann},
-  address   = {Denver, CO},
-  pages     = {1125--1132},
-  year      = {1992},
-}
-
-@inproceedings{Han96,
-  author    = {H-H. Han and H-C. Jung and Y-R. Lee and S-C. Jeong},
-  title     = {Application of Neural Network for {PWR} Steam Generator Water Level Control at Low Power Operation},
-  booktitle = nipc-hmit96,
-  volume    = {1},
-  publisher = ans,
-  pages     = {49--52},
-  year      = {1996},
-}
-
-@inproceedings{Hanson89,
-  author    = {S. J. Hanson and L. Pratt},
-  title     = {A Comparison of Different Biases for Minimal Network Construction with Back-Propagation},
-  editor    = NIPS1ed,
-  booktitle = NIPS1,
-  publisher = {Morgan Kaufmann, San Mateo},
-  address   = {Denver, CO},
-  pages     = {177--185},
-  year      = {1989},
-}
-
-% The e-book link is a URL, not a publisher address, so it lives in url.
-@book{Hardle2004,
-  author    = {Wolfgang H{\"a}rdle and Marlene M{\"u}ller and Stefan Sperlich and Axel Werwatz},
-  title     = {Nonparametric and Semiparametric Models},
-  publisher = {Springer},
-  year      = {2004},
-  url       = {http://www.xplore-stat.de/ebooks/ebooks.html},
-}
-
-@article{Hardoon+al-2004,
-  author    = {Hardoon, David R. and Szedmak, Sandor R. and Shawe-Taylor, John R.},
-  title     = {Canonical Correlation Analysis: An Overview with Application to Learning Methods},
-  journal   = {Neural Computation},
-  volume    = {16},
-  number    = {12},
-  pages     = {2639--2664},
-  month     = dec,
-  year      = {2004},
-  publisher = {MIT Press},
-  address   = {Cambridge, MA, USA},
-  issn      = {0899-7667},
-  doi       = {10.1162/0899766042321814},
-  url       = {http://portal.acm.org/citation.cfm?id=1119696.1119703},
-}
-
-@inproceedings{HardoonD2007,
-  author    = {David R. Hardoon and John Shawe-Taylor and Antti Ajanki and Kai Puolam{\"a}ki and Samuel Kaski},
-  title     = {Information Retrieval by Inferring Implicit Queries from Eye Movements},
-  booktitle = {Proceedings of AISTATS 2007},
-  year      = {2007},
-}
-
-@inproceedings{Harmeling02,
-  author    = {S. Harmeling and A. Ziehe and M. Kawanabe and K.-R. M{\"u}ller},
-  title     = {Kernel Feature Spaces and Nonlinear Blind Source Separation},
-  editor    = NIPS14ed,
-  booktitle = NIPS14,
-  publisher = {MIT Press},
-  address   = {Cambridge, MA},
-  year      = {2002},
-  original  = {orig/AA34.ps},
-}
-
-@inproceedings{Harp90,
-  author    = {S. A. Harp and T. Samad and A. Guha},
-  title     = {Designing Application-Specific Neural Networks Using the Genetic Algorithm},
-  editor    = NIPS2ed,
-  booktitle = NIPS2,
-  publisher = {Morgan Kaufmann, San Mateo},
-  address   = {Denver, CO},
-  pages     = {447--454},
-  year      = {1990},
-}
-
-@article{Hartman90,
-  author  = {E. J. Hartman and J. D. Keeler and J. M. Kowalski},
-  title   = {Layered Neural Networks with {G}aussian Hidden Units As Universal Approximations},
-  journal = nc,
-  volume  = {2},
-  pages   = {210--215},
-  year    = {1990},
-}
-
-@article{Haruno01,
-  author  = {M. Haruno and D. M. Wolpert and M. Kawato},
-  title   = {{MOSAIC} model for sensorimotor learning and control},
-  journal = {Neural Computation},
-  volume  = {13},
-  number  = {10},
-  pages   = {2201--2220},
-  year    = {2001},
-}
-
-@inproceedings{Hassibi-nips93,
-  author    = {B. Hassibi and D. G. Stork},
-  title     = {Second Order Derivatives for Network Pruning: Optimal Brain Surgeon},
-  editor    = NIPS5ed,
-  booktitle = NIPS5,
-  publisher = {Morgan Kaufmann},
-  address   = {San Mateo, CA},
-  pages     = {164--171},
-  year      = {1993},
-}
-
-@inproceedings{Hastad86,
-  author    = {Johan H{\aa}stad},
-  title     = {Almost optimal lower bounds for small depth circuits},
-  booktitle = {Proceedings of the 18th annual ACM Symposium on Theory of Computing},
-  publisher = {ACM Press},
-  address   = {Berkeley, California},
-  pages     = {6--20},
-  year      = {1986},
-}
-
-@book{Hastad87,
-  author    = {Johan T. H{\aa}stad},
-  title     = {Computational Limitations for Small Depth Circuits},
-  publisher = {{MIT} Press},
-  year      = {1987},
-}
-
-@article{Hastad91,
-  author  = {Johan H{\aa}stad and Mikael Goldmann},
-  title   = {On the power of small-depth threshold circuits},
-  journal = {Computational Complexity},
-  volume  = {1},
-  pages   = {113--129},
-  year    = {1991},
-}
-
-@article{Hastie-Stuetzle-1989,
-  author  = {T. Hastie and W. Stuetzle},
-  title   = {Principal Curves},
-  journal = {Journal of the American Statistical Association},
-  volume  = {84},
-  pages   = {502--516},
-  year    = {1989},
-}
-
-@book{Hastie2001,
-  author    = {T. Hastie and R. Tibshirani and J. Friedman},
-  title     = {The elements of statistical learning: data mining, inference and prediction},
-  series    = {Springer Series in Statistics},
-  publisher = {Springer Verlag},
-  year      = {2001},
-  annote    = {ISBN: 0387952845},
-}
-
-@article{Hastie2004,
-  author  = {Trevor Hastie and Saharon Rosset and Robert Tibshirani and Ji Zhu},
-  title   = {The entire regularization path for the support vector machine},
-  journal = jmlr,
-  volume  = {5},
-  pages   = {1391--1415},
-  year    = {2004},
-}
-
-@inproceedings{hastie96discriminant,
-  author    = {T. Hastie and R. Tibshirani},
-  title     = {Discriminant Adaptive Nearest Neighbor Classification and Regression},
-  editor    = NIPS8ed,
-  booktitle = NIPS8,
-  volume    = {8},
-  publisher = {{MIT} Press},
-  pages     = {409--415},
-  year      = {1996},
-  URL       = {citeseer.nj.nec.com/hastie94discriminant.html},
-}
-
-@article{Hathaway85,
-  author  = {R. J. Hathaway},
-  title   = {A constrained formulation of Maximum-Likelihood estimation for normal mixture distributions},
-  journal = {The Annals of Statistics},
-  volume  = {13},
-  number  = {2},
-  pages   = {795--800},
-  year    = {1985},
-}
-
-@article{hausser:2003,
-  author  = {Michael H{\"a}usser and Bartlett Mel},
-  title   = {Dendrites: Bug or Feature?},
-  journal = {Current Opinion in Neurobiology},
-  volume  = {13},
-  pages   = {372--383},
-  year    = {2003},
-}
-
-@inproceedings{Haussler89,
-  author    = {D. Haussler},
-  title     = {Generalizing the {PAC} model: sample size bounds from metric dimension-based uniform convergence results},
-  booktitle = {Proc. of the 30th Annual Symposium on the Foundations of Computer Science},
-  publisher = {IEEE},
-  year      = {1989},
-}
-
-@inproceedings{haussler95,
-  author    = {D. Haussler and J. Kivinen and M. K. Warmuth},
-  title     = {Sequential prediction of individual sequences under general loss functions},
-  booktitle = {Computational Learning Theory, 2nd European Conference, EuroCOLT'95},
-  publisher = {Springer},
-  pages     = {69--83},
-  year      = {1995},
-}
-
-% The edition is recorded once, in the edition field (an ordinal word for
-% classic BibTeX styles), rather than also being baked into the title.
-@book{hay01nnn,
-  author       = {Haykin, Simon},
-  title        = {Neural Networks: A Comprehensive Foundation},
-  edition      = {Second},
-  publisher    = {Prentice Hall},
-  month        = jul,
-  year         = {1998},
-  isbn         = {0132733501},
-  howpublished = {Hardcover},
-  keywords     = {network, neural},
-  posted-at    = {2009-07-04 21:37:33},
-  priority     = {2},
-  url          = {http://www.amazon.com/exec/obidos/redirect?tag=citeulike07-20&path=ASIN/0132733501},
-}
-
-
-@techreport{He+Niyogi-2002,
-  author      = {X. He and P. Niyogi},
-  title       = {Locality Preserving Projections ({LPP})},
-  number      = {TR-2002-09},
-  institution = {University of Chicago, Computer Science},
-  year        = {2002},
-}
-
-@incollection{He+Niyogi-2004,
-  author    = {Xiaofei He and Partha Niyogi},
-  title     = {Locality Preserving Projections},
-  editor    = NIPS16ed,
-  booktitle = NIPS16,
-  publisher = {MIT Press},
-  address   = {Cambridge, MA},
-  year      = {2004},
-}
-
-@book{Hebb49,
-  author    = {D. O. Hebb},
-  title     = {The Organization of Behavior},
-  publisher = {Wiley},
-  address   = {New York},
-  year      = {1949},
-}
-
-@inproceedings{Hecht-Nielsen87a,
-  author    = {R. Hecht-Nielsen},
-  title     = {Combinatorial Hypercompression},
-  editor    = {M. Caudill and C. Butler},
-  booktitle = icnn,
-  volume    = {2},
-  publisher = {IEEE, New York},
-  address   = {San Diego 1987},
-  pages     = {455--461},
-  year      = {1987},
-}
-
-@article{Hecht-Nielsen87b,
-  author  = {R. Hecht-Nielsen},
-  title   = {Counterpropagation Networks},
-  journal = applopt,
-  volume  = {26},
-  pages   = {4979--4984},
-  year    = {1987},
-}
-
-@article{Hecht-Nielsen88,
-  author  = {R. Hecht-Nielsen},
-  title   = {Applications of Counterpropagation Networks},
-  journal = nn,
-  volume  = {1},
-  pages   = {131--139},
-  year    = {1988},
-}
-
-@inproceedings{Hecht-Nielsen89,
-  author    = {R. Hecht-Nielsen},
-  title     = {Theory of the Backpropagation Neural Network},
-  booktitle = ijcnn,
-  volume    = {1},
-  publisher = {IEEE, New York},
-  address   = {Washington 1989},
-  pages     = {593--605},
-  year      = {1989},
-}
-
-@article{Hecht-Nielsen-1995,
-  author  = {R. Hecht-Nielsen},
-  title   = {Replicator neural networks for universal optimal source coding},
-  journal = {Science},
-  volume  = {269},
-  pages   = {1860--1863},
-  year    = {1995},
-}
-
-% The FTP location is a URL, not a publisher address, so it lives in url.
-@techreport{Heckerman96,
-  author      = {D. Heckerman},
-  title       = {A tutorial on learning with {Bayesian} networks},
-  number      = {TR-95-06},
-  institution = {Microsoft Research},
-  month       = jan,
-  year        = {1996},
-  url         = {ftp://ftp.research.microsoft.com/pub/Tech-Reports/Winter94-95/TR-95-06.PS},
-}
-
-@article{HeckermanD2000,
-  author  = {David Heckerman and David Maxwell Chickering and Christopher Meek and Robert Rounthwaite and Carl Kadie},
-  title   = {Dependency networks for inference, collaborative filtering, and data visualization},
-  journal = jmlr,
-  volume  = {1},
-  pages   = {49--75},
-  year    = {2000},
-}
-
-@article{heeger:1992a,
-  author  = {David J. Heeger},
-  title   = {Normalization of Cell Responses in Cat Striate Cortex},
-  journal = {Visual Neuroscience},
-  volume  = {9},
-  number  = {2},
-  pages   = {181--198},
-  year    = {1992},
-}
-
-@inproceedings{Hegde88,
-  author    = {S. U. Hegde and J. L. Sweet and W. B. Levy},
-  title     = {Determination of Parameters in a {Hopfield}/{Tank} Computational Network},
-  booktitle = icnn,
-  volume    = {2},
-  publisher = {IEEE, New York},
-  address   = {San Diego 1988},
-  pages     = {291--298},
-  year      = {1988},
-}
-
-@article{HedgeJ2000,
-	address = {Department of Anatomy and Neurobiology, Washington University School of Medicine, St. Louis, Missouri 63110, USA.},
-	author = {Jay Hegd\'{e} and David C. {Van Essen} },
-	citeulike-article-id = {465720},
-	issn = {1529-2401},
-	journal = {Journal of Neuroscience},
-	keywords = {contour, v2},
-	month = {March},
-	number = {5},
-	posted-at = {2006-01-15 12:57:15},
-	priority = {0},
-	title = {Selectivity for complex shapes in primate visual area V2},
-	volume = {20},
-	year = {2000}
-}
-	%url = {http://view.ncbi.nlm.nih.gov/pubmed/10684908},
-
-@inproceedings{Heitz+al:NIPS08a,
-  author    = {G. Heitz and S. Gould and A. Saxena and D. Koller},
-  title     = {Cascaded Classification Models: {C}ombining Models for Holistic Scene Understanding},
-  booktitle = {Advances in Neural Information Processing Systems (NIPS 2008)},
-  year      = 2008,
-}
-
-@inproceedings{HeldM1998,
-  author    = {Marcus Held and Joachim M. Buhmann},
-  title     = {Unsupervised on-line learning of decision trees for hierarchical data analysis},
-  editor    = NIPS10ed,
-  booktitle = NIPS10,
-  publisher = {MIT Press},
-  address   = {Cambridge, MA, USA},
-  pages     = {514--520},
-  year      = {1998},
-  ISBN      = {0-262-10076-2},
-  location  = {Denver, Colorado, United States},
-}
-
-@inproceedings{herlocker99,
-  author    = {Jonathan L. Herlocker and Joseph A. Konstan and Al Borchers and John Riedl},
-  title     = {An algorithmic framework for performing collaborative filtering},
-  booktitle = {SIGIR '99: Proceedings of the 22nd annual international ACM SIGIR conference on Research and development in information retrieval},
-  publisher = {ACM Press},
-  address   = {New York, NY, USA},
-  pages     = {230--237},
-  year      = {1999},
-  location  = {Berkeley, California, United States},
-}
-
-@inproceedings{Hermansky-genova91,
-  author    = {Hynek Hermansky and Nelson Morgan and Aruna Bayya and Phil Kohn},
-  title     = {Compensation for the Effect of the Communication Channel in Auditory-like Analysis of Speech ({RASTA}-{PLP})},
-  booktitle = {Proc. of Eurospeech 91},
-  address   = {Genova (Italy)},
-  pages     = {1367--1371},
-  year      = {1991},
-}
-
-@techreport{Hermansky-icsi91,
-  author      = {Hynek Hermansky and Nelson Morgan and Aruna Bayya and Phil Kohn},
-  title       = {{RASTA}-{PLP} Speech Analysis},
-  number      = {TR-91-069},
-  institution = {International Computer Science Institute},
-  address     = {Berkeley, CA},
-  month       = dec,
-  year        = {1991},
-  OPTnote     = {Most speech parameter estimation techniques are easily
-                 influenced by the frequency response of the
-                 communication channel. We have developed a technique
-                 that is more robust to such steady-state spectral
-                 factors in speech. The approach is conceptually simple
-                 and computationally efficient. The new method is
-                 described, and experimental results are reported,
-                 showing a significant advantage for the proposed
-                 method.},
-}
-
-@article{Hermansky-jasa90,
-  author  = {Hynek Hermansky},
-  title   = {Perceptual Linear Predictive ({PLP}) Analysis of Speech},
-  journal = jasa,
-  volume  = {87},
-  number  = {4},
-  pages   = {1738--1752},
-  year    = {1990},
-}
-
-@book{Hernandez-Lerma+Lasserre-2003,
-  author    = {On{\'e}simo Hern{\'a}ndez-Lerma and Jean Bernard Lasserre},
-  title     = {Markov Chains and Invariant Probabilities},
-  publisher = {Birkh{\"a}user Verlag},
-  year      = {2003},
-}
-
-@inproceedings{Hertz86,
-  author    = {J. A. Hertz and G. Grinstein and S. Solla},
-  title     = {Memory Networks with Asymmetric Bonds},
-  editor    = {J. S. Denker},
-  booktitle = snowbird,
-  publisher = {American Institute of Physics, New York},
-  address   = {Snowbird 1986},
-  pages     = {212--218},
-  year      = {1986},
-}
-
-@inproceedings{Hertz87,
-  author    = {J. A. Hertz and G. Grinstein and S. Solla},
-  title     = {Irreversible Spin Glasses and Neural Networks},
-  editor    = {J. L. van Hemmen and I. Morgenstern},
-  booktitle = {Heidelberg Colloquium on Glassy Dynamics},
-  publisher = {Springer-Verlag, Berlin},
-  address   = {Heidelberg 1986},
-  pages     = {538--546},
-  year      = {1987},
-}
-
-@article{Hertz89a,
-  author  = {J. A. Hertz},
-  title   = {A Gauge Theory in Computational Vision: {A} Model for Outline Extraction},
-  journal = pscrip,
-  volume  = {39},
-  pages   = {161--167},
-  year    = {1989},
-}
-
-@article{Hertz89b,
-  author  = {J. A. Hertz and A. Krogh and G. I. Thorbergsson},
-  title   = {Phase Transitions in Simple Learning},
-  journal = jpa,
-  volume  = {22},
-  pages   = {2133--2150},
-  year    = {1989},
-}
-
-@techreport{Hertz90,
-  author      = {J. A. Hertz},
-  title       = {Statistical Dynamics of Learning},
-  type        = {Preprint},
-  number      = {90/34 S},
-  institution = {Nordita},
-  address     = {Copenhagen, Denmark},
-  year        = {1990},
-}
-
-@article{Herz89,
-  author  = {A. Herz and B. Sulzer and R. K{\"u}hn and J. L. van Hemmen},
-  title   = {Hebbian Learning Reconsidered: Representation of Static and Dynamic Objects in Associative Neural Nets},
-  journal = biocyb,
-  volume  = {60},
-  pages   = {457--467},
-  year    = {1989},
-}
-
-@article{Heskes-98,
-  author  = {T. Heskes},
-  title   = {Bias/variance decompositions for likelihood-based estimators},
-  journal = {Neural Computation},
-  volume  = {10},
-  pages   = {1425--1433},
-  year    = {1998},
-}
-
-@article{heskes00,
-  author  = {Tom Heskes},
-  title   = {On Natural Learning and Pruning in Multilayered Perceptrons},
-  journal = {Neural Computation},
-  volume  = {12},
-  number  = {4},
-  pages   = {881--901},
-  year    = {2000},
-}
-
-@inproceedings{heskes98,
-  author    = {Tom Heskes},
-  title     = {Solving a huge number of similar tasks: a combination of multi-task learning and a hierarchical {Bayesian} approach},
-  booktitle = {International Conference On Machine Learning},
-  year      = {1998},
-}
-
-@article{Hestenes+Stiefel-1952,
-  author  = {Magnus R. Hestenes and Eduard Stiefel},
-  title   = {Methods of Conjugate Gradients for Solving Linear Systems},
-  journal = {Journal of Research of the National Bureau of Standards},
-  volume  = {49},
-  number  = {6},
-  pages   = {409--436},
-  year    = {1952},
-}
-
-@article{Hettich-93,
-  author  = {R. Hettich and K. O. Kortanek},
-  title   = {Semi-infinite programming: theory, methods, and applications},
-  journal = {{SIAM} Review},
-  volume  = {35},
-  number  = {3},
-  pages   = {380--429},
-  year    = {1993},
-}
-
-@inproceedings{Hines96,
-  author    = {J. W. Hines},
-  title     = {A Logarithmic Neural Network Architecture for a {PRA} Approximation},
-  booktitle = nipc-hmit96,
-  volume    = {1},
-  publisher = ans,
-  pages     = {235--241},
-  year      = {1996},
-}
-
-@article{HinOsiWel2006,
-  author  = {Geoffrey E. Hinton and Simon Osindero and Max Welling and {Yee Whye} Teh},
-  title   = {Unsupervised Discovery of Non-Linear Structure using Contrastive Backpropagation},
-  journal = {Cognitive Science},
-  volume  = {30},
-  number  = {4},
-  pages   = {725--731},
-  year    = {2006},
-}
-
-% "B" is the journal series, not a volume: the volume is 352.
-@article{Hinton+Ghahramani-97,
-  author  = {G. E. Hinton and Z. Ghahramani},
-  title   = {Generative models for discovering sparse distributed representations},
-  journal = {Philosophical Transactions of the Royal Society of London B},
-  volume  = {352},
-  number  = {1358},
-  pages   = {1177--1190},
-  year    = {1997},
-}
-
-@incollection{Hinton-bo86,
-  author    = {G. E. Hinton and T. J. Sejnowski},
-  title     = {Learning and relearning in {Boltzmann} machines},
-  editor    = {D. E. Rumelhart and J. L. McClelland},
-  booktitle = {Parallel Distributed Processing: Explorations in the Microstructure of Cognition. Volume 1: Foundations},
-  publisher = {MIT Press},
-  address   = {Cambridge, MA},
-  pages     = {282--317},
-  year      = {1986},
-}
-
-@inproceedings{Hinton-ICA-2001,
-  author    = {G. E. Hinton and M. Welling and Y. W. Teh and S. Osindero},
-  title     = {A New View of {ICA}},
-  booktitle = {Proceedings of 3rd International Conference on Independent Component Analysis and Blind Signal Separation (ICA'01)},
-  address   = {San Diego, CA},
-  pages     = {746--751},
-  year      = {2001},
-}
-
-@inproceedings{Hinton-nips95,
-  author    = {G. E. Hinton and M. Revow and P. Dayan},
-  title     = {Recognizing handwritten digits using mixtures of linear models},
-  editor    = NIPS7ed,
-  booktitle = NIPS7,
-  publisher = {MIT Press},
-  address   = {Cambridge, MA},
-  pages     = {1015--1022},
-  year      = {1995},
-}
-
-@techreport{Hinton-PoE-2000,
-  author      = {Geoffrey E. Hinton},
-  title       = {Training Products of Experts by Minimizing Contrastive Divergence},
-  number      = {GCNU TR 2000-004},
-  institution = {Gatsby Unit, University College London},
-  year        = {2000},
-}
-
-@article{Hinton-Science2006,
-  author  = {Geoffrey E. Hinton and Ruslan Salakhutdinov},
-  title   = {Reducing the dimensionality of data with neural networks},
-  journal = {Science},
-  volume  = {313},
-  number  = {5786},
-  pages   = {504--507},
-  month   = jul,
-  year    = {2006},
-}
-
-% Deprecated: the following entry duplicates the preceding one (Hinton-Science2006).
-@article{Hinton+Salakhutdinov-2006,
-  author  = {Geoffrey E. Hinton and Ruslan Salakhutdinov},
-  title   = {Reducing the Dimensionality of Data with Neural Networks},
-  journal = {Science},
-  volume  = {313},
-  number  = {5786},
-  pages   = {504--507},
-  month   = jul,
-  year    = {2006},
-}
-
-
-@article{Hinton06,
-  author  = {Geoffrey E. Hinton and Simon Osindero and {Yee Whye} Teh},
-  title   = {A fast learning algorithm for deep belief nets},
-  journal = {Neural Computation},
-  volume  = {18},
-  pages   = {1527--1554},
-  year    = {2006},
-}
-
-@article{Hinton06-small,
-  author  = {G. E. Hinton and S. Osindero and Y.-W. Teh},
-  title   = {A fast learning algorithm for deep belief nets},
-  journal = {Neural Computation},
-  volume  = {18},
-  pages   = {1527--1554},
-  year    = {2006},
-}
-
-@inproceedings{hinton1994amd,
-  author    = {Geoffrey E. Hinton and R. S. Zemel},
-  title     = {Autoencoders, minimum description length, and {Helmholtz} free energy},
-  editor    = NIPS6ed,
-  booktitle = NIPS6,
-  publisher = {Morgan Kaufmann Publishers, Inc.},
-  pages     = {3--10},
-  year      = {1994},
-}
-
-@article{Hinton2002,
-  author  = {Geoffrey E. Hinton},
-  title   = {Training products of experts by minimizing contrastive divergence},
-  journal = {Neural Computation},
-  volume  = {14},
-  pages   = {1771--1800},
-  year    = {2002},
-}
-
-@inproceedings{Hinton83,
-  author    = {G. E. Hinton and T. J. Sejnowski},
-  title     = {Optimal Perceptual Inference},
-  booktitle = cvpr83,
-  publisher = {IEEE, New York},
-  address   = {Washington 1983},
-  pages     = {448--453},
-  year      = {1983},
-}
-
-@techreport{Hinton84,
-  author      = {G. E. Hinton and T. J. Sejnowski and D. H. Ackley},
-  title       = {{Boltzmann} machines: Constraint satisfaction networks that learn},
-  number      = {TR-CMU-CS-84-119},
-  institution = {Carnegie-Mellon University, Dept. of Computer Science},
-  year        = {1984},
-}
-
-@incollection{Hinton86a,
-  author    = {G. E. Hinton and T. J. Sejnowski},
-  title     = {Learning and Relearning in {Boltzmann} Machines},
-  editor    = {D. E. Rumelhart and J. L. McClelland},
-  booktitle = pdp,
-  chapter   = {7},
-  volume    = {1},
-  publisher = {MIT Press},
-  address   = {Cambridge},
-  pages     = {282--317},
-  year      = {1986},
-}
-
-@inproceedings{Hinton86b,
-  author    = {Geoffrey E. Hinton},
-  title     = {Learning Distributed Representations of Concepts},
-  booktitle = {Proceedings of the Eighth Annual Conference of the Cognitive Science Society},
-  publisher = {Lawrence Erlbaum, Hillsdale},
-  address   = {Amherst 1986},
-  pages     = {1--12},
-  year      = {1986},
-}
-
-@inproceedings{Hinton86b-small,
-  author    = {Geoffrey E. Hinton},
-  title     = {Learning Distributed Representations of Concepts},
-  booktitle = {Proc. 8th Annual Conf. Cog. Sc. Society},
-  pages     = {1--12},
-  year      = {1986},
-}
-
-@inproceedings{Hinton87,
-  author    = {Geoffrey E. Hinton},
-  title     = {Learning translation invariant recognition in massively parallel networks},
-  editor    = {J. W. {de Bakker} and A. J. Nijman and P. C. Treleaven},
-  booktitle = {Proceedings of {PARLE} Conference on Parallel Architectures and Languages Europe},
-  publisher = {Springer-Verlag},
-  address   = {Berlin},
-  pages     = {1--13},
-  year      = {1987},
-}
-
-@article{Hinton89,
-  author  = {Geoffrey E. Hinton},
-  title   = {Deterministic {Boltzmann} Learning Performs Steepest Descent in Weight Space},
-  journal = nc,
-  volume  = {1},
-  pages   = {143--150},
-  year    = {1989},
-}
-
-@article{Hinton89b,
-  author  = {Geoffrey E. Hinton},
-  title   = {Connectionist learning procedures},
-  journal = {Artificial Intelligence},
-  volume  = {40},
-  pages   = {185--234},
-  year    = {1989},
-}
-
-@Article{Hinton90,
-  author =       "G. E. Hinton and S. J. Nowlan",
-  title =        "The bootstrap Widrow-Hoff rule as a cluster-formation
-                 algorithm",
-  journal =      nc,
-  volume =       "2",
-  pages =        "355--362",
-  year =         "1990",
-}
-
-@InProceedings{Hinton92,
-  author =       "G. E. Hinton and C. K. I. Williams and M. D. Revow",
-  editor =       NIPS4ed,
-  booktitle =    NIPS4,
-  title =        "Adaptive elastic models for hand-printed character
-                 recognition",
-  publisher =    "Morgan Kaufmann",
-  address =      "San Mateo CA",
-  pages =        "512--519",
-  year =         "1992",
-}
-
-@Misc{Hinton93,
-  author =       "Geoffrey E. Hinton",
-  title =        "Using the minimum description length principle to
-                 discover factorial codes",
-  howpublished = "Lecture given at the 1993 Connectionist Models Summer
-                 School",
-  year =         "1993",
-}
-
-@Article{Hinton95,
-  author =       "Geoffrey E. Hinton and Peter Dayan and Brendan J. Frey and Radford M.
-                 Neal",
-  title =        "The wake-sleep algorithm for unsupervised neural
-                 networks",
-  journal =      "Science",
-  volume =       "268",
-  pages =        "1158--1161",
-  year =         "1995",
-}
-
-@Article{hinton97modelling,
-  author =       "G. E. Hinton and P. Dayan and M. Revow",
-  title =        "Modelling the manifolds of images of handwritten
-                 digits",
-  journal =      "IEEE Transactions on Neural Networks",
-  volume =       "8",
-  pages =        "65--74",
-  year =         "1997",
-}
-
-@InProceedings{Hinton99,
-  author =       "Geoffrey E. Hinton",
-  booktitle =    "Proceedings of the Ninth International Conference on
-                 Artificial Neural Networks (ICANN)",
-  title =        "Products of Experts",
-  volume =       "1",
-  publisher =    "IEE",
-  address =      "Edinburgh, Scotland",
-  pages =        "1--6",
-  year =         "1999",
-}
-
-@InProceedings{HintonG2005,
-  author =       "Geoffrey E. Hinton and Simon Osindero and Kejie Bao",
-  editor =       aistats05ed,
-  booktitle =    aistats05,
-  title =        "Learning Causally Linked Markov Random Fields",
-  publisher =    "Society for Artificial Intelligence and Statistics",
-  pages =        "128--135",
-  year =         "2005",
-}
-
-@InProceedings{HintonG2005-small,
-  author =       "Geoffrey E. Hinton and Simon Osindero and Kejie Bao",
-  booktitle =    "Proceedings of AISTATS 2005",
-  title =        "Learning Causally Linked Markov Random Fields",
-  year =         "2005",
-}
-
-@TechReport{HintonG2006,
-  author =       "Geoffrey E. Hinton",
-  title =        "To recognize shapes, first learn to generate images",
-  number =       "UTML TR 2006-003",
-  institution =  "University of Toronto",
-  year =         "2006",
-}
-
-@InCollection{HintonG2007,
-  author =       "Geoffrey E. Hinton",
-  editor =       "Paul Cisek and Trevor Drew and John Kalaska",
-  booktitle =    "Computational Neuroscience: Theoretical Insights into
-                 Brain Function",
-  title =        "To recognize shapes, first learn to generate images",
-  publisher =    "Elsevier",
-  year =         "2007",
-}
-
-@TechReport{Hinton-Boltzmann,
-  author =       "G. E. Hinton and T. J. Sejnowski and D. H. Ackley",
-  title =        "{Boltzmann} machines: Constraint satisfaction networks
-                 that learn",
-  number =       "TR-CMU-CS-84-119",
-  institution =  "Carnegie-Mellon University, Dept. of Computer
-                 Science",
-  year =         "1984",
-  OPTnote =      "",
-}
-
-@InProceedings{Hirayama-nips92,
-  author =       "M. Hirayama and E. Vatikiotis-Bateson and M. Kawato
-                 and M. I. Jordan",
-  editor =       NIPS4ed,
-  booktitle =    NIPS4,
-  title =        "Forward Dynamics Modeling of Speech Motor Control
-                 Using Physiological Data",
-  publisher =    "Morgan Kaufmann",
-  address =      "San Mateo CA",
-  pages =        "191--198",
-  year =         "1992",
-  OPTnote =      "",
-}
-
-@Article{Hjort96,
-  author =       "N. L. Hjort and M. C. Jones",
-  title =        "Locally parametric nonparametric density estimation",
-  journal =      "Annals of Statistics",
-  volume =       "24",
-  number =       "4",
-  pages =        "1619--1647",
-  year =         "1996",
-}
-
-@InProceedings{Ho95,
-  author =       "Tin Kam Ho",
-  booktitle =    ICDAR95,
-  title =        "Random Decision Forests",
-  address =      "Montreal, Canada",
-  pages =        "278--282",
-  year =         "1995",
-}
-
-@MastersThesis{Hochreiter91,
-  author =       "S. Hochreiter",
-  title =        "Untersuchungen zu dynamischen neuronalen {Netzen}",
-  type =         "Diploma thesis",
-  school =       "Institut f{\"u}r Informatik, Lehrstuhl Prof. Brauer,
-                 Technische Universit{\"a}t M{\"u}nchen",
-  year =         "1991",
-  url =          "http://www7.informatik.tu-muenchen.de/~hochreit",
-}
-
-@Article{Hoerl+Kennard70,
-  author =       "A. Hoerl and R. Kennard",
-  title =        "Ridge regression: biased estimation for non-orthogonal
-                 problems",
-  journal =      "Technometrics",
-  volume =       "12",
-  pages =        "55--67",
-  year =         "1970",
-}
-
-@inproceedings{Hoff-2008,
- author = {P. D. Hoff},
- title = {Modeling homophily and stochastic equivalence in symmetric relational data},
-  editor =       NIPS20ed,
-  booktitle =    NIPS20,
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  pages =        "657--664",
-  year =         "2008",
-}
-
-@InProceedings{Holger-icpr96,
-  author =       "H. Schwenk and M. Milgram",
-  booktitle =    icpr,
-  title =        "Constraint Tangent Distance For On-Line Character
-                 Recognition",
-  pages =        "520--524",
-  year =         "1996",
-}
-
-@InProceedings{Holger-nips96,
-  author =       "H. Schwenk and M. Milgram",
-  editor =       NIPS7ed,
-  booktitle =    NIPS7,
-  title =        "Transformation invariant autoassociation with
-                 application to handwritten character recognition",
-  publisher =    "MIT Press",
-  pages =        "991--998",
-  year =         "1995",
-}
-
-@Book{Holland75,
-  author =       "J. H. Holland",
-  key =          "Holland",
-  title =        "Adaptation in Natural and Artificial Systems",
-  publisher =    "University of Michigan Press",
-  year =         "1975",
-}
-
-@Article{Holley+Karplus89,
-  author =       "L. H. Holley and M. Karplus",
-  title =        "Protein secondary structure prediction with a neural
-                 network",
-  journal =      PNAS,
-  volume =       "86",
-  pages =        "152--156",
-  year =         "1989",
-}
-
-@InCollection{HolTre93,
-  author =       "J. Hollatz and V. Tresp",
-  editor =       "I. Aleksander and J. Taylor",
-  booktitle =    "Artificial Neural Networks II",
-  title =        "A rule-based network architecture",
-  publisher =    "Elsevier",
-  address =      "Amsterdam",
-  year =         "1992",
-}
-
-@TechReport{HolTreAhm92,
-  author =       "J. Hollatz and V. Tresp and S. Ahmad",
-  title =        "Network structuring and training using rule-based
-                 knowledge",
-  type =         "Technical Report",
-  institution =  "Siemens AG",
-  address =      "M{\"u}nchen, Germany",
-  year =         "1992",
-}
-
-@InProceedings{HolubA2005,
-  author =       "Alex Holub and Pietro Perona",
-  booktitle =    cvpr05,
-  title =        "A Discriminative Framework for Modelling Object
-                 Classes",
-  publisher =    "IEEE Computer Society",
-  address =      "Washington, DC, USA",
-  pages =        "664--671",
-  year =         "2005",
-  ISBN =         "0-7695-2372-2",
-  doi =          "10.1109/CVPR.2005.25",
-}
-
-@InCollection{HonglakL2009,
-  author =       "Honglak Lee and Roger Grosse and Rajesh Ranganath and Andrew Y. Ng",
-  booktitle =    ICML09,
-  editor =       ICML09ed,
-  publisher =    ICML09publ,
-  title =        "Convolutional deep belief networks for scalable unsupervised 
-		 		 learning of  hierarchical representations",
-  address =      "Montreal (Qc), Canada",
-  year =         "2009",
-}
-
-@InCollection{HonglakL2008,
-  author =       "Honglak Lee and Chaitanya Ekanadham and Andrew Ng",
-  editor =       NIPS20ed,
-  booktitle =    NIPS20,
-  title =        "Sparse deep belief net model for visual area {V2}",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  pages =        "873--880",
-  year =         "2008",
-}
-
-@incollection{HonglakLNIPS2009,
- title = {Unsupervised feature learning for audio classification using convolutional deep belief networks},
- author = {Honglak Lee and Peter Pham and Yan Largman and Andrew Ng},
- booktitle = NIPS22,
- editor = NIPS22ed,
- pages = {1096--1104},
- year = {2009}
-}
-
-@Book{Hopcroft79,
-  author =       "J. E. Hopcroft and J. D. Ullman",
-  title =        "Introduction to Automata Theory, Languages, and
-                 Computation",
-  publisher =    "Addison-Wesley Publishing Company, Inc.",
-  address =      "Reading, MA",
-  year =         "1979",
-}
-
-@Article{Hopfield82,
-  author =       "John J. Hopfield",
-  title =        "Neural Networks and Physical Systems with Emergent
-                 Collective Computational Abilities",
-  journal =      PNAS,
-  volume =       "79",
-  pages =        "2554--2558",
-  year =         "1982",
-}
-
-@Article{Hopfield83,
-  author =       "J. J. Hopfield and D. I. Feinstein and R. G. Palmer",
-  title =        "``Unlearning'' Has a Stabilizing Effect in Collective
-                 Memories",
-  journal =      nature,
-  volume =       "304",
-  pages =        "158--159",
-  year =         "1983",
-}
-
-@Article{Hopfield84,
-  author =       "J. J. Hopfield",
-  title =        "Neurons with Graded Responses Have Collective
-                 Computational Properties Like Those of Two-State
-                 Neurons",
-  journal =      PNAS,
-  volume =       "81",
-  pages =        "3088--3092",
-  year =         "1984",
-}
-
-@Article{Hopfield85,
-  author =       "J. J. Hopfield and D. W. Tank",
-  title =        "``Neural'' Computation of Decisions in Optimization
-                 Problems",
-  journal =      biocyb,
-  volume =       "52",
-  pages =        "141--152",
-  year =         "1985",
-}
-
-@Article{Hopfield86,
-  author =       "J. J. Hopfield and D. W. Tank",
-  title =        "Computing with Neural Circuits: {A} Model",
-  journal =      science,
-  volume =       "233",
-  pages =        "625--633",
-  year =         "1986",
-}
-
-@Article{Hopfield87,
-  author =       "J. J. Hopfield",
-  title =        "Learning Algorithms and Probability Distributions in
-                 Feed-Forward and Feed-Back Networks",
-  journal =      PNAS,
-  volume =       "84",
-  pages =        "8429--8433",
-  year =         "1987",
-}
-
-@InCollection{Hopfield89,
-  author =       "J. J. Hopfield and D. W. Tank",
-  editor =       "J. H. Byrne and W. O. Berry",
-  booktitle =    "Neural Models of Plasticity",
-  title =        "Neural Architecture and Biophysics for Sequence
-                 Recognition",
-  publisher =    "Academic Press",
-  address =      "San Diego",
-  pages =        "363--377",
-  year =         "1989",
-}
-
-@Article{Hornik89,
-  author =       "Kurt Hornik and Maxwell Stinchcombe and Halbert White",
-  title =        "Multilayer Feedforward Networks Are Universal
-                 Approximators",
-  journal =      nn,
-  volume =       "2",
-  pages =        "359--366",
-  year =         "1989",
-}
-
-@Article{Hotelling1933,
-  author =       "H. Hotelling",
-  title =        "Analysis of a Complex of Statistical Variables into
-                 Principal Components",
-  journal =      "Journal of Educational Psychology",
-  volume =       "24",
-  pages =        "417--441, 498--520",
-  year =         "1933",
-}
-
-@article{Hotelling-1936,
-    author = {H. Hotelling},
-    title = {Relations between two sets of variates},
-    journal = {Biometrika},
-    volume = 28,
-    pages = {321--377},
-    year = 1936,
-}
-
-@TechReport{Houde91,
-  author =       "J. F. Houde",
-  title =        "Recursive estimation of articulatory control",
-  type =         "Computational Cognitive Science",
-  number =       "TR",
-  institution =  "MIT",
-  address =      "Cambridge, MA",
-  year =         "1991",
-}
-
-@InProceedings{Howlett+Lawrence-1995a,
-  author =       "R. J. Howlett and D. H. Lawrence",
-  booktitle =    "World Transputer Congress~'95",
-  title =        "The Class-Distributed Neural Network",
-  address =      "Harrogate, UK",
-  year =         "1995",
-}
-
-@InProceedings{Howlett+Lawrence-1995b,
-  author =       "R. J. Howlett and D. H. Lawrence",
-  booktitle =    "Proceedings of the IEEE International Conference on
-                 Neural Networks",
-  title =        "A Multi-Computer Neural Network Applied to
-                 Machine-Vision",
-  volume =       "2",
-  address =      "Perth, Australia",
-  pages =        "1150--1153",
-  year =         "1995",
-}
-
-@InProceedings{Hsu88,
-  author =       "K. Hsu and D. Brady and D. Psaltis",
-  editor =       nips87ed,
-  booktitle =    nips87,
-  title =        "Experimental Demonstration of Optical Neural
-                 Computers",
-  publisher =    "American Institute of Physics, New York",
-  address =      "Denver, CO",
-  pages =        "377--386",
-  year =         "1988",
-}
-
-@Article{huang04dynamic,
-  author =       "X. Huang and F. Peng and A. An and D. Schuurmans",
-  title =        "Dynamic web log session identification with
-                 statistical language models",
-  journal =      "Journal of the American Society for Information
-                 Science and Technology",
-  volume =       "55",
-  number =       "14",
-  pages =        "1290--1303",
-  year =         "2004",
-}
-
-@Book{Huang87,
-  author =       "K. Huang",
-  title =        "Statistical Mechanics",
-  publisher =    "Wiley",
-  address =      "New York",
-  year =         "1987",
-}
-
-@InProceedings{Huang88,
-  author =       "W. Y. Huang and R. P. Lippmann",
-  editor =       nips87ed,
-  booktitle =    nips87,
-  title =        "Neural Net and Traditional Classifiers",
-  publisher =    "American Institute of Physics, New York",
-  address =      "Denver, CO",
-  pages =        "387--396",
-  year =         "1988",
-}
-
-@TechReport{Huang89,
-  author =       "X. D. Huang and H. W. Hon and K. F. Lee",
-  title =        "Multiple Codebook Semi-Continuous Hidden {Markov}
-                 Models for Speaker-Independent Continuous Speech
-                 Recognition",
-  number =       "CMU-CS-89-136",
-  institution =  "School of Computer Science Carnegie-Mellon
-                 University",
-  address =      "Pittsburgh, Pennsylvania",
-  month =        apr,
-  year =         "1989",
-}
-
-@InProceedings{Huang90,
-  author =       "Xuedong Huang and Kai-Fu Lee and Hsiao-Wuen Hon",
-  booktitle =    icassp,
-  title =        "On Semi-Continuous Hidden {Markov} Modeling",
-  pages =        "689--692",
-  year =         "1990",
-}
-
-@article{Hubel+Wiesel-1959,
-    title = {Receptive Fields of Single Neurons in the Cat's Striate Cortex},
-    author = {David H. Hubel and Torsten N. Wiesel},
-    journal = {Journal of Physiology},
-    pages = {574--591},
-    volume = {148},
-    year = {1959},
-    biburl = {http://www.bibsonomy.org/bibtex/202c5cf1ee910eadba5efa77b3cd043f6/idsia},
-}
-
-@Article{Hubel62,
-  author =       "D. H. Hubel and T. N. Wiesel",
-  title =        "Receptive Fields, Binocular Interaction, and Functional Architecture in the Cat's Visual Cortex",
-  journal =      jphysiol,
-  volume =       "160",
-  pages =        "106--154",
-  year =         "1962",
-}
-
-@article{Hubel+Wiesel-1968,
- author = {D.H. Hubel and T.N. Wiesel},
- title = {Receptive fields and functional architecture of monkey striate cortex},
- journal = jphysiol,
- volume = 195,
- pages = {215--243},
- year = 1968,
-}
-
-@article{Huber-1985,
-    author = {Huber, Peter  J. },
-    comment = {Projection Pursuit},
-    journal = {The Annals of Statistics},
-    number = {2},
-    pages = {435--475},
-    title = {Projection Pursuit},
-    url = {http://www.jstor.org/stable/2241175},
-    volume = {13},
-    year = {1985}
-}
-
-@InProceedings{Hueter88,
-  author =       "G. J. Hueter",
-  booktitle =    icnn,
-  title =        "Solution of the Travelling Salesman Problem with an
-                 Adaptive Ring",
-  volume =       "1",
-  publisher =    "IEEE, New York",
-  address =      "San Diego 1988",
-  pages =        "85--92",
-  year =         "1988",
-}
-
-@InProceedings{Hush88,
-  author =       "D. R. Hush and J. M. Salas",
-  booktitle =    icnn,
-  title =        "Improving the Learning Rate of Back-Propagation with
-                 the Gradient Reuse Algorithm",
-  volume =       "1",
-  publisher =    "IEEE, New York",
-  address =      "San Diego 1988",
-  pages =        "441--447",
-  year =         "1988",
-}
-
-@Article{Hush92,
-  author =       "D. R. Hush and B. Horne and J. M. Salas",
-  title =        "Error Surfaces for Multilayer Perceptrons",
-  journal =      ieeesmc,
-  volume =       "22",
-  number =       "5",
-  pages =        "1152--1161",
-  month =        sep,
-  year =         "1992",
-}
-
-@InCollection{Hutchins+Hazlehurst-02,
-  author =       "Edwin Hutchins and Brian Hazlehurst",
-  editor =       "A. Cangelosi and D. Parisi",
-  booktitle =    "Simulating the Evolution of Language",
-  title =        "Auto-organization and Emergence of Shared Language
-                 Structure",
-  publisher =    "London: Springer-Verlag",
-  pages =        "279--305",
-  year =         "2002",
-}
-
-@InCollection{Hutchins+Hazlehurst-95,
-  author =       "Edwin Hutchins and Brian Hazlehurst",
-  editor =       "N. Gilbert and R. Conte",
-  booktitle =    "Artificial Societies: the computer simulation of
-                 social life",
-  title =        "How to invent a lexicon: the development of shared
-                 symbols in interaction",
-  publisher =    "London: UCL Press",
-  pages =        "157--189",
-  year =         "1995",
-}
-
-@Article{Hutchinson94,
-  author =       "J. M. Hutchinson and A. W. Lo and T. Poggio",
-  title =        "{A Nonparametric Approach to Pricing and Hedging
-                 Derivative Securities Via Learning Networks}",
-  journal =      "Journal of Finance",
-  volume =       "49",
-  number =       "3",
-  pages =        "851--889",
-  year =         "1994",
-}
-
-@Book{Hutter2005,
-  author =       "Marcus Hutter",
-  title =        "Universal Artificial Intelligence: Sequential
-                 Decisions based on Algorithmic Probability",
-  publisher =    "Springer, Berlin",
-  year =         "2005",
-}
-
-@Book{Hwang+al-1992,
-  author =       "Frank K. Hwang and Dana Richards and Pawel Winter",
-  title =        "The {Steiner} Tree Problem",
-  series =       "Annals of Discrete Mathematics",
-  volume =       "53",
-  publisher =    "Elsevier",
-  address =      "Amsterdam",
-  year =         "1992",
-}
-
-@article{Hyvarinen-1999,
-    author = {Hyv{\"a}rinen, A.},
-    journal = {Neural Computing Surveys},
-    keywords = {ica, separation, waspaa07bib},
-    pages = {94--128},
-    title = {Survey on Independent Component Analysis},
-    url = {http://citeseer.ist.psu.edu/223687.html},
-    volume = {2},
-    year = {1999}
-}
-
-@book{Hyvarinen-2001,
-    author = {Hyv{\"{a}}rinen, Aapo   and Karhunen, Juha   and Oja, Erkki  },
-    howpublished = {Hardcover},
-    isbn = {047140540X},
-    month = may,
-    posted-at = {2008-07-02 02:13:00},
-    priority = {2},
-    publisher = {Wiley-Interscience},
-    title = {Independent Component Analysis},
-    url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/047140540X},
-    year = {2001}
-}
-
-@Article{Hyvarinen+al-01,
-  author =       "Aapo Hyv{\"{a}}rinen and Patrik O. Hoyer and Mika
-                 Inki",
-  title =        "Topographic Independent Component Analysis",
-  journal =      "Neural Computation",
-  volume =       "13",
-  number =       "7",
-  pages =        "1527--1558",
-  year =         "2001",
-}
-
-@Article{HyvarinenA2001,
-  author =       "Aapo Hyv{\"{a}}rinen and Patrik O. Hoyer and Mika O.
-                 Inki",
-  title =        "Topographic Independent Component Analysis",
-  journal =      "Neural Computation",
-  volume =       "13",
-  number =       "7",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA, USA",
-  pages =        "1527--1558",
-  year =         "2001",
-  ISSN =         "0899-7667",
-}
-
-@Article{HyvarinenA2001-small,
-  author =       "Aapo Hyv{\"{a}}rinen and Patrik O. Hoyer and Mika O. Inki",
-  title =        "Topographic Independent Component Analysis",
-  journal =      "Neural Computation",
-  volume =       "13",
-  number =       "7",
-  pages =        "1527--1558",
-  year =         "2001",
-}
-
-@Article{Hyvarinen-2005,
-  author =       "Aapo Hyv{\"{a}}rinen ",
-  title =        "Estimation of non-normalized statistical models using score matching",
-  journal =      jmlr,
-  volume =       "6",
-  pages =        "695--709",
-  year =         "2005",
-}
-
-@Article{Hyvarinen-2007,
-  author =       "Aapo Hyv{\"{a}}rinen ",
-  title =        "Some extensions of score matching",
-  journal =      "Computational Statistics and Data Analysis",
-  volume =       "51",
-  pages =        "2499--2512",
-  year =         "2007",
-}
-
-@Article{Hyvarinen-2007b,
-  author =       "Aapo Hyv{\"{a}}rinen ",
-  title =        "Connections between score matching, contrastive divergence, and pseudolikelihood
-                  for continuous-valued variables",
-  journal =      "{IEEE} Transactions on Neural Networks",
-  volume =       "18",
-  pages =        "1529--1531",
-  year =         "2007",
-}
-
-@article{HyvarinenA2008,
- author = {Hyv{\"a}rinen, Aapo},
- title = {Optimal approximation of signal priors},
- journal = {Neural Computation},
- volume = {20},
- number = {12},
- year = {2008},
- pages = {3087--3110},
- publisher = {MIT Press},
- address = {Cambridge, MA, USA},
- }
-
-@article{kording2004,
-author={Konrad P. K{\"o}rding and Christoph Kayser and Wolfgang
-Einh{\"a}user and Peter K{\"o}nig},
-title = "How Are Complex Cell Properties Adapted to the Statistics of
-Natural Stimuli?",
-year = 2004,
-journal = "Journal of Neurophysiology",
-volume = 91,
-pages = {206--212},
-url="http://jn.physiology.org/cgi/reprint/91/1/206.pdf"
-}
-
-@inproceedings{Koster-Hyvarinen-2007,
-  author = {Urs K{\"{o}}ster and Aapo Hyv{\"{a}}rinen},
- title = {A two-layer {ICA}-like model estimated by {S}core {M}atching},
- booktitle = {Int. Conf. Artificial Neural Networks (ICANN'2007)},
- pages = {798--807},
- year = 2007,
-}
-
-@article{Iba-2001,
-  author =       "Yukito Iba",
-  title =        "Extended Ensemble Monte Carlo",
-  journal =      "International Journal of Modern Physics",
-  volume =       "C12",
-  pages =        "623--656",
-  year =         "2001",
-}
-
-@InProceedings{icml2009_093,
-  author =    {Hossein Mobahi and Ronan Collobert and Jason Weston},
-  title =     {Deep Learning from Temporal Coherence in Video},
-  booktitle = {Proceedings of the 26th International Conference on Machine Learning},
-  pages =     {737--744},
-  year =      2009,
-  editor =    {L{\'e}on Bottou and Michael Littman},
-  address =   {Montreal},
-  month =     jun,
-  publisher = {Omnipress}
-}
-
-@InProceedings{icann:Holger+Yoshua:1997,
-  author =       "Holger Schwenk and Yoshua Bengio",
-  booktitle =    "International Conference on Artificial Neural
-                 Networks",
-  title =        "{AdaBoosting} Neural Networks: Application to on-line
-                 Character Recognition",
-  publisher =    "Springer Verlag",
-  pages =        "967--972",
-  year =         "1997",
-}
-
-@Article{Ide1998,
-  author =       "Nancy Ide and Jean Veronis",
-  title =        "Introduction to the Special Issue on Word Sense
-                 Disambiguation: The State of the Art",
-  journal =      "Computational Linguistics",
-  volume =       "24",
-  number =       "1",
-  pages =        "1--40",
-  year =         "1998",
-}
-
-@Article{IEEE-KDE:Frasconi95,
-  author =       "P. Frasconi and M. Gori and M. Maggini and G. Soda",
-  title =        "Unified Integration of Explicit Rules and Learning by
-                 Example in Recurrent Networks",
-  journal =      "IEEE Transactions on Knowledge and Data Engineering",
-  volume =       "7",
-  number =       "2",
-  pages =        "340--346",
-  year =         "1995",
-  OPTmonth =     "",
-}
-
-@Article{igel05,
-  author =       "C. Igel and M. Toussaint and W. Weishui",
-  title =        "Rprop using the natural gradient compared to
-                 Levenberg-Marquardt optimization",
-  journal =      "Trends and Applications in Constructive Approximation.
-                 International Series of Numerical Mathematics.",
-  volume =       "151",
-  publisher =    "Birkh{\"a}user Verlag",
-  pages =        "259--272",
-  year =         "2005",
-}
-
-@Article{intrator,
-  author =       "Nathan Intrator and Shimon Edelman",
-  title =        "How to make a low-dimensional representation suitable
-                 for diverse tasks",
-  journal =      "Connection Science, Special issue on Transfer in
-                 Neural Networks",
-  volume =       "8",
-  pages =        "205--224",
-  year =         "1996",
-}
-
-@Article{intrator96,
-  author =       "Nathan Intrator and Shimon Edelman",
-  title =        "How to make a low-dimensional representation suitable
-                 for diverse tasks",
-  journal =      "Connection Science, Special issue on Transfer in
-                 Neural Networks",
-  volume =       "8",
-  pages =        "205--224",
-  year =         "1996",
-}
-
-@Article{Inzenman-91,
-  author =       "A. J. Izenman",
-  title =        "Recent developments in nonparametric density
-                 estimation",
-  journal =      "Journal of the American Statistical Association",
-  volume =       "86",
-  number =       "413",
-  pages =        "205--224",
-  year =         "1991",
-}
-
-@TechReport{IOHMM-TR,
-  author =       "Y. Bengio and P. Frasconi",
-  title =        "An {EM} Approach to Learning Sequential Behavior",
-  number =       "RT-DSI-11/94",
-  institution =  "University of Florence",
-  year =         "1994",
-}
-
-@InProceedings{Irie88,
-  author =       "B. Irie and S. Miyake",
-  booktitle =    "IEEE Second International Conference on Neural
-                 Networks, San Diego",
-  title =        "Capabilities of three layer perceptrons",
-  year =         "1988",
-}
-
-@Article{Irino+Kawahara90,
-  author =       "T. Irino and H. Kawahara",
-  title =        "A Method for Designing Neural Networks Using Nonlinear
-                 Multivariate Analysis: Application to
-                 Speaker-Independent Vowel Recognition",
-  journal =      "Neural Computation",
-  volume =       "2",
-  type =         "Letter",
-  number =       "3",
-  pages =        "386--397",
-  year =         "1990",
-}
-
-@article{ItoM2004,
-	author = {Ito, Minami   and Komatsu, Hidehiko  },
-	citeulike-article-id = {451606},
-	doi = {10.1523/JNEUROSCI.4364-03.2004},
-	journal = {Journal of Neuroscience},
-	keywords = {cnv, v2},
-	month = mar,
-	number = {13},
-	pages = {3313--3324},
-	posted-at = {2007-03-30 11:19:11},
-	priority = {0},
-	title = {Representation of Angles Embedded within Contour Stimuli in Area V2 of Macaque Monkeys},
-	volume = {24},
-	year = {2004}
-}
-	%url = {http://dx.doi.org/10.1523/JNEUROSCI.4364},
-
-@Article{Jaakkola+Jordan99,
-  author =       "T. Jaakkola and M. I. Jordan",
-  title =        "Variational methods and the {QMR}-{DT} database",
-  journal =      "Journal of Artificial Intelligence Research",
-  volume =       "10",
-  pages =        "291--322",
-  year =         "1999",
-}
-
-% Deprecated: the year in this key is wrong (the entry is dated 1999); cite Jaakkola99 instead.
-@InProceedings{Jaakkola98,
-  author =       "Tommi S. Jaakkola and David Haussler",
-  editor =       NIPS11ed,
-  booktitle =    NIPS11,
-  title =        "Exploiting generative models in discriminative
-                 classifiers",
-  publisher =    "MIT Press, Cambridge, MA",
-  pages =        "487--493",
-  year =         "1999",
-}
-
-@InProceedings{Jaakkola99,
-  author =       "Tommi S. Jaakkola and David Haussler",
-  editor =       NIPS11ed,
-  booktitle =    NIPS11,
-  title =        "Exploiting generative models in discriminative
-                 classifiers",
-  publisher =    "MIT Press, Cambridge, MA",
-  pages =        "487--493",
-  year =         "1999",
-}
-
-@Misc{jaakkola98exploiting,
-  author =       "T. Jaakkola and D. Haussler",
-  title =        "Exploiting generative models in discriminative
-                 classifiers",
-  year =         "1998",
-  note =         "Preprint, Dept. of Computer Science, Univ. of California.
-                  A shorter version is in Advances in Neural
-                  Information Processing Systems 11",
-  howpublished = "Available from http://www.cse.ucsc.edu/~haussler/pubs.html",
-}
-
-@Article{Jacobs-nc91,
-  author =       "R. A. Jacobs and M. I. Jordan and S. J. Nowlan and G.
-                 E. Hinton",
-  title =        "Adaptive mixtures of local experts",
-  journal =      "Neural Computation",
-  volume =       "3",
-  pages =        "79--87",
-  year =         "1991",
-}
-
-@InCollection{Jacobs-nips91,
-  author =       "R. A. Jacobs and M. I. Jordan",
-  editor =       NIPS3ed,
-  booktitle =    NIPS3,
-  title =        "A competitive modular connectionist architecture",
-  publisher =    "Morgan Kaufmann",
-  address =      "San Mateo, CA",
-  year =         "1991",
-}
-
-@TechReport{Jacobs-tr90,
-  author =       "R. A. Jacobs and M. I. Jordan and A. G. Barto",
-  title =        "Task Decomposition Through Competition in a Modular
-                 Connectionist Architecture: The {What} and {Where}
-                 Vision Tasks",
-  number =       "COINS 90-27",
-  institution =  "University of Massachusetts",
-  address =      "Amherst, MA",
-  year =         "1990",
-}
-
-@Article{Jacobs88,
-  author =       "R. A. Jacobs",
-  title =        "Increased Rates of Convergence Through Learning Rate
-                 Adaptation",
-  journal =      nn,
-  volume =       "1",
-  pages =        "295--307",
-  year =         "1988",
-}
-
-@Article{Jacobs91a,
-  author =       "Robert A. Jacobs and Michael I. Jordan and Steven J.
-                 Nowlan and Geoffrey E. Hinton",
-  title =        "Adaptive Mixtures of Local Experts",
-  journal =      nc,
-  volume =       "3",
-  pages =        "79--87",
-  year =         "1991",
-}
-
-@Article{Jacobs91b,
-  author =       "R. A. Jacobs and M. I. Jordan and A. G. Barto",
-  title =        "Task Decomposition Through Competition in a Modular
-                 Connectionist Architecture: The What and Where Vision
-                 Task",
-  journal =      "Cognitive Science",
-  volume =       "15",
-  pages =        "219--250",
-  year =         "1991",
-}
-
-@Article{Jacobs94,
-  author =       "R. A. Jacobs and S. M. Kosslyn",
-  title =        "Encoding Shape and Spatial Relations: The Role of
-                 Receptive Fields in Coordinating Complementary
-                 Representations",
-  journal =      "Cognitive Science",
-  year =         "1994",
-}
-
-@article{Jaeger-2007,
-    author = {Herbert Jaeger},
-    title = {Echo state network},
-    year = 2007,
-    journal = {Scholarpedia},
-    volume = 2,
-    number = 9,
-    pages = 2330,
-}
-
-@Article{Japkowicz2000,
-  author =       "Nathalie Japkowicz and Stephen J. Hanson and Mark A.
-                 Gluck",
-  title =        "Nonlinear Autoassociation is not Equivalent to {PCA}",
-  journal =      "Neural Computation",
-  volume =       "12",
-  number =       "3",
-  pages =        "531--545",
-  year =         "2000",
-}
-
-@Article{Japkowicz2002,
-  author =       "N. Japkowicz and S. Stephen",
-  title =        "The Class Imbalance Problem: {A} Systematic Study",
-  journal =      "Intelligent Data Analysis",
-  volume =       "6",
-  number =       "5",
-  year =         "2002",
-}
-
-@inproceedings {Jarrett-ICCV2009,
- original = "orig/jarrett-iccv-09.pdf",
- title = "What is the Best Multi-Stage Architecture for Object Recognition?",
- author = "Jarrett, Kevin and Kavukcuoglu, Koray and Ranzato, {Marc'Aurelio} and {LeCun}, Yann",
- booktitle = "Proc. International Conference on Computer Vision (ICCV'09)",
- publisher = "IEEE",
- year = "2009"
-}
-
-@TechReport{Jauvin+Bengio-TR2003,
-  author =       "Christian Jauvin and Yoshua Bengio",
-  title =        "A Sense-Smoothed Bigram Language Model",
-  number =       "1233",
-  institution =  "Dept. IRO, Universit\'e de Montr\'eal",
-  year =         "2003",
-}
-
-@Book{Jaynes03,
-  author =       "E. T. Jaynes",
-  title =        "{Probability} {Theory}: {The} {Logic} of {Science}",
-  publisher =    "Cambridge University Press",
-  year =         "2003",
-}
-
-@InCollection{Jaynes83,
-  author =       "E. T. Jaynes",
-  booktitle =    "Papers on Probability, Statistics and Statistical
-                 Physics",
-  title =        "{Bayesian} intervals versus confidence intervals",
-  publisher =    "Kluwer",
-  year =         "1983",
-  editor =       "R. D. Rosenkrantz",
-}
-
-@Article{JCB:Baldi95t,
-  author =       "Y. Chauvin and P. Baldi",
-  title =        "Hidden {Markov} models of the {G}-Protein-Coupled
-                 receptor family",
-  journal =      "Journal of Computational Biology",
-  year =         "1995",
-}
-
-@InProceedings{jebara03,
-  author =       "Tony Jebara and Risi Kondor",
-  booktitle =    colt03,
-  title =        "{Bhattacharyya and Expected Likelihood Kernels}",
-  year =         "2003",
-}
-
-@InProceedings{Jebara03Convex,
-  author =       "T. Jebara",
-  editor =       "",
-  booktitle =    "Proceedings of AISTATS 2003",
-  title =        "Convex Invariance Learning",
-  publisher =    "",
-  pages =        "",
-  year =         "2003",
-}
-
-@InProceedings{jebara04,
-  author =       "Tony Jebara",
-  booktitle =    ICML04,
-  editor =       ICML04ed,
-  publisher =    ICML04publ,
-  title =        "{Multi-task feature and kernel selection for SVMs}",
-  address =      "New York, NY, USA",
-  year =         "2004",
-  location =     "Banff, Alberta, Canada",
-}
-
-@Book{JebaraT2003,
-  author =       "Tony Jebara",
-  title =        "Machine Learning: Discriminative and Generative (The
-                 Kluwer International Series in Engineering and Computer
-                 Science)",
-  howpublished = "Hardcover",
-  publisher =    "Springer",
-  month =        dec,
-  year =         "2003",
-  citeulike-article-id = "134203",
-  comment =      "- maximum entropy discriminative as unification of
-                 discriminative and generative approaches",
-  keywords =     "book, generative-discriminative, svm",
-  priority =     "2",
-}
-  %ISBN =         "1402076479",
-  %URL =          "http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20&path=ASIN/1402076479",
-
-@InCollection{Jelinek+Mercer80,
-  author =       "F. Jelinek and R. L. Mercer",
-  editor =       "E. S. Gelsema and L. N. Kanal",
-  booktitle =    "Pattern Recognition in Practice",
-  title =        "Interpolated estimation of {Markov} source parameters
-                 from sparse data",
-  publisher =    "North-Holland, Amsterdam",
-  year =         "1980",
-}
-
-@InProceedings{Jelinek-Chelba-99,
-  author =       "Frederick Jelinek and Ciprian Chelba",
-  booktitle =    "European Conference on Speech Communication and
-                 Technology",
-  title =        "Putting language into language modeling",
-  volume =       "1",
-  address =      "Budapest",
-  pages =        "KN1--KN5",
-  year =         "1999",
-}
-
-@Article{Jelinek76,
-  author =       "F. Jelinek",
-  title =        "Continuous speech recognition by statistical methods",
-  journal =      "Proceedings of the IEEE",
-  volume =       "64",
-  pages =        "532--556",
-  year =         "1976",
-}
-
-@InCollection{Jelinek80,
-  author =       "F. Jelinek and R. L. Mercer",
-  editor =       "E. S. Gelsema and L. N. Kanal",
-  booktitle =    "Pattern Recognition in Practice",
-  title =        "Interpolated Estimation of {Markov} Source Parameters
-                 from Sparse Data",
-  publisher =    "North-Holland",
-  address =      "Amsterdam",
-  year =         "1980",
-  copy =         yes,
-}
-
-@Book{Jelinek98,
-  author =       "F. Jelinek",
-  title =        "Statistical Methods for Speech Recognition",
-  publisher =    "MIT Press",
-  address =      "Cambridge, Massachusetts",
-  year =         "1998",
-}
-
-@InProceedings{JensenRiis2000,
-  author =       "K. J. Jensen and S. Riis",
-  booktitle =    "International Conference on Spoken Language
-                 Processing",
-  title =        "Self-organizing letter code-book for text-to-phoneme
-                 neural network model",
-  volume =       "3",
-  pages =        "318--321",
-  year =         "2000",
-}
-
-@InProceedings{Jeong96,
-  author =       "E. Jeong and K. Furuta and S. Kondo",
-  booktitle =    nipc-hmit96,
-  title =        "Identification of Transient in Nuclear Power Plant
-                 using Adaptive Template Matching with Neural Network",
-  volume =       "1",
-  publisher =    ans,
-  pages =        "243--250",
-  year =         "1996",
-}
-
-@InCollection{joachims99largescaleSVM,
-  author =       "T. Joachims",
-  editor =       "B. {Sch\"olkopf} and C. J. C. Burges and A. J. Smola",
-  booktitle =    "Advances in Kernel Methods --- Support Vector
-                 Learning",
-  title =        "Making large-Scale {SVM} Learning Practical",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  year =         "1999",
-}
-
-@InProceedings{joachims99transductive,
-  author =       "Thorsten Joachims",
-  booktitle =    ICML99,
-  editor =       ICML99ed,
-  publisher =    ICML99publ,
-  title =        "Transductive Inference for Text Classification using
-                 Support Vector Machines",
-  address =      "Bled, SL",
-  pages =        "200--209",
-  year =         "1999",
-}
-  %URL =          "citeseer.ist.psu.edu/joachims99transductive.html",
-
-@TechReport{Johansson90,
-  author =       "E. M. Johansson and F. U. Dowla and D. M. Goodman",
-  title =        "Backpropagation learning for multi-layer feed-forward
-                 neural networks using the conjugate gradient method",
-  number =       "UCRL-JC-104850",
-  institution =  "Lawrence Livermore National Laboratory",
-  month =        sep,
-  year =         "1990",
-}
-
-@inproceedings{John+al-1994,
-    author = {John, George  H.  and Kohavi, Ron  and Pfleger, Karl},
-    booktitle = {Proceedings of the Eleventh International Conference on Machine Learning},
-    pages = {121--129},
-    title = {Irrelevant Features and the Subset Selection Problem},
-    url = {http://citeseer.ist.psu.edu/john94irrelevant.html},
-    year = {1994},
-    publisher = {Morgan Kaufmann},
-}
-
-@Article{Johnson89,
-  author =       "D. S. Johnson and C. R. Aragon and L. A. McGeoch and
-                 C. Schevon",
-  title =        "Optimization by Simulated Annealing: An Experimental
-                 Evaluation; Part {I}, Graph Partitioning",
-  journal =      opres,
-  volume =       "37",
-  pages =        "865--891",
-  year =         "1989",
-}
-
-@InProceedings{Joines92QQ23,
-  author =       "J. A. Joines and M. W. White",
-  booktitle =    "IJCNN",
-  title =        "Improved Generalization Using Robust Cost Functions",
-  address =      "Baltimore, Maryland",
-  pages =        "911--918",
-  month =        jun,
-  year =         "1992",
-  ref =          "QQ23",
-}
-
-@Book{Jolliffe86,
-  author =       "Ian T. Jolliffe",
-  title =        "Principal Component Analysis",
-  publisher =    "Springer-Verlag",
-  address =      "New York",
-  year =         "1986",
-}
-
-@book{Jolliffe-2002,
-    author = {Ian T. Jolliffe},
-    citeulike-article-id = {1154147},
-    howpublished = {Hardcover},
-    isbn = {0387954422},
-    month = {October},
-    posted-at = {2007-03-11 15:04:57},
-    priority = {2},
-    publisher = {Springer},
-    title = {Principal Component Analysis},
-    url = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0387954422},
-    year = {2002}
-}
-
-@Article{Jordan+Jacobs94,
-  author =       "M. I. Jordan and R. A. Jacobs",
-  title =        "Hierarchical mixtures of experts and the {E}{M}
-                 algorithm",
-  journal =      nc,
-  volume =       "6",
-  pages =        "181--214",
-  year =         "1994",
-}
-
-@TechReport{Jordan+Xu93,
-  author =       "Michael I. Jordan and L. Xu",
-  title =        "Convergence results for the {EM} approach to mixtures
-                 of experts architecture",
-  number =       "9303",
-  institution =  "MIT Computational Cognitive Science",
-  month =        sep,
-  year =         "1993",
-}
-
-@Article{Jordan-cs92,
-  author =       "M. I. Jordan and D. E. Rumelhart",
-  title =        "Forward models: Supervised learning with a distal
-                 teacher",
-  journal =      "Cognitive Science",
-  volume =       "16",
-  pages =        "307--354",
-  year =         "1992",
-}
-
-@InProceedings{Jordan-HMDT97,
-  author =       "M. Jordan and Z. Ghahramani and L. Saul",
-  editor =       NIPS9ed,
-  booktitle =    NIPS9,
-  title =        "Hidden {Markov} decision trees",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  pages =        "",
-  year =         "1997",
-}
-
-@InProceedings{Jordan-nips92,
-  author =       "M. I. Jordan and R. A. Jacobs",
-  editor =       NIPS4ed,
-  booktitle =    NIPS4,
-  title =        "Hierarchies of adaptive experts",
-  publisher =    "Morgan Kaufmann",
-  address =      "San Mateo, CA",
-  pages =        "985--992",
-  year =         "1992",
-}
-
-@TechReport{Jordan-tr86,
-  author =       "M. I. Jordan",
-  title =        "Serial Order: a Parallel Distributed Processing
-                 Approach",
-  number =       "8604",
-  institution =  "ICS (Institute for Cognitive Science, University of
-                 California)",
-  year =         "1986",
-}
-
-@InProceedings{Jordan86,
-  author =       "M. I. Jordan",
-  booktitle =    "Proceedings of the Eighth Annual Conference of the
-                 Cognitive Science Society",
-  title =        "Attractor Dynamics and Parallelism in a Connectionist
-                 Sequential Machine",
-  publisher =    "Lawrence Erlbaum, Hillsdale",
-  address =      "Amherst 1986",
-  pages =        "531--546",
-  year =         "1986",
-}
-
-@TechReport{Jordan88,
-  author =       "M. I. Jordan",
-  title =        "Supervised Learning and Systems with Excess Degrees of
-                 Freedom",
-  number =       "COINS Technical Report 88-27",
-  institution =  "MIT",
-  address =      "Cambridge MA",
-  year =         "1988",
-}
-
-@InCollection{Jordan89,
-  author =       "M. I. Jordan",
-  editor =       "J. L. Elman and D. E. Rumelhart",
-  booktitle =    "Advances in Connectionist Theory: Speech",
-  title =        "Serial Order: {A} Parallel, Distributed Processing
-                 Approach",
-  publisher =    "Lawrence Erlbaum",
-  address =      "Hillsdale",
-  year =         "1989",
-}
-
-@InProceedings{Jordan89b,
-  author =       "M. I. Jordan",
-  editor =       "G. Hinton and D. S. Touretzky",
-  booktitle =    "Proceedings of the 1988 Connectionist Models Summer
-                 School",
-  title =        "Supervised learning and systems with excess degrees of
-                 freedom",
-  publisher =    "Morgan Kaufmann",
-  address =      "San Mateo, CA",
-  year =         "1989",
-}
-
-@InCollection{Jordan90,
-  author =       "M. I. Jordan",
-  editor =       "M. Jeannerod",
-  booktitle =    "Attention and Performance XIII",
-  title =        "Motor learning and the degrees of freedom problem",
-  publisher =    "Hillsdale, NJ: Erlbaum",
-  year =         "1990",
-}
-
-@Book{Jordan98,
-  author =       "M. I. Jordan",
-  title =        "Learning in Graphical Models",
-  publisher =    "Kluwer",
-  address =      "Dordrecht, Netherlands",
-  year =         "1998",
-}
-
-@Article{Jour:Freund:AdaBoostDetailed,
-  author =       "Yoav Freund and Robert E. Schapire",
-  title =        "A decision theoretic generalization of on-line
-                 learning and an application to Boosting",
-  journal =      "Journal of Computer and System Sciences",
-  volume =       "55",
-  number =       "1",
-  pages =        "119--139",
-  year =         "1997",
-}
-
-@Article{Jour:Freund:boost,
-  author =       "Yoav Freund",
-  title =        "Boosting a weak learning algorithm by majority",
-  journal =      "Information and Computation",
-  volume =       "121",
-  number =       "2",
-  pages =        "256--285",
-  year =         "1995",
-}
-
-@Article{Jour-Freund-AdaBoostDetailed,
-  author =       "Yoav Freund and Robert E. Schapire",
-  title =        "A decision theoretic generalization of on-line
-                 learning and an application to Boosting",
-  journal =      "Journal of Computer and System Sciences",
-  volume =       "55",
-  number =       "1",
-  pages =        "119--139",
-  year =         "1997",
-}
-
-@PhdThesis{Jouvet88,
-  author =       "D. Jouvet",
-  title =        "Reconnaissance de Mots Connect{\'e}s Ind{\'e}pendamment du
-                 Locuteur par des M{\'e}thodes Statistiques",
-  number =       "NST-88E006",
-  school =       "{\'E}cole Nationale Sup{\'e}rieure des T{\'e}l{\'e}communications",
-  year =         "1988",
-}
-
-@inproceedings{JuanA2001,
- author = {Alfons Juan and Enrique Vidal},
- title = {On the use of Bernoulli Mixture Models for Text Classification},
- booktitle = {PRIS '01: Proceedings of the 1st International Workshop on Pattern Recognition in Information Systems},
- year = {2001},
- pages = {118--126},
- publisher = {ICEIS Press},
- }
-
-@inproceedings{JuanA2004,
- author = {Alfons Juan and Enrique Vidal},
- title = {Bernoulli Mixture Models for Binary Images},
- booktitle = {ICPR '04: Proceedings of the Pattern Recognition, 17th International Conference on (ICPR'04) Volume 3},
- year = {2004},
- pages = {367--370},
- publisher = {IEEE Computer Society},
- address = {Washington, DC, USA},
- }
-
-@Article{Juang92,
-  author =       "B. H. Juang and S. Katagiri",
-  title =        "Discriminative learning for minimum error
-                 classification",
-  journal =      "IEEE Transactions on Signal Processing",
-  volume =       "40",
-  number =       "12",
-  pages =        "3043--3054",
-  year =         "1992",
-}
-
-@Article{Judd88,
-  author =       "S. Judd",
-  title =        "On the complexity of loading shallow neural networks",
-  journal =      "Journal of Complexity",
-  volume =       "4",
-  pages =        "177--192",
-  year =         "1988",
-}
-
-@Book{JuddBook,
-  author =       "J. S. Judd",
-  title =        "Neural Network Design and the Complexity of Learning",
-  publisher =    "MIT Press",
-  year =         "1989",
-}
-
-@book{Jurafsky+Martin-2008,
-    author = {Jurafsky, Daniel and Martin, James  H.},
-    howpublished = {Hardcover},
-    month = {January},
-    publisher = {Prentice Hall},
-    edition = 2,
-    title = {Speech and Language Processing: An Introduction to Natural Language Processing, Computational Linguistics and Speech Recognition},
-    year = {2008}
-}
-
-@Article{Jutten+Herault-91,
-  author =       "Christian Jutten and Jeanny Herault",
-  title =        "Blind separation of sources, part {I}: an adaptive
-                 algorithm based on neuromimetic architecture",
-  journal =      "Signal Processing",
-  volume =       "24",
-  pages =        "1--10",
-  year =         "1991",
-}
-
-@InProceedings{Kahng89,
-  author =       "A. B. Kahng",
-  booktitle =    ijcnn,
-  title =        "Travelling Salesman Heuristics and Embedding Dimension
-                 in the Hopfield Model",
-  volume =       "1",
-  publisher =    "IEEE, New York",
-  address =      "Washington 1989",
-  pages =        "513--520",
-  year =         "1989",
-}
-
-@InProceedings{kai03,
-  author =       "Yu, Kai and Schwaighofer, Anton and Tresp, Volker and Ma,
-                 Wei-Ying and Zhang, HongJiang",
-  booktitle =    UAI03,
-  title =        "Collaborative Ensemble Learning: Combining
-                 Collaborative and Content-Based Information Filtering
-                 via Hierarchical Bayes",
-  publisher =    "Morgan Kaufmann Publishers",
-  address =      "San Francisco, CA",
-  pages =        "616--623",
-  year =         "2003",
-}
-
-@Article{Kalman61,
-  author =       "R. Kalman and R. S. Bucy",
-  title =        "New results in linear filtering and prediction",
-  journal =      "Journal of Basic Engineering (ASME)",
-  volume =       "83D",
-  pages =        "95--108",
-  year =         "1961",
-}
-
-@article{Kambhatla+Leen-1997,
-    author = {Kambhatla, N.  and Leen, T. K. },
-    journal = {Neural Computation},
-    pages = {1493--1516},
-    title = {Dimension Reduction by Local Principal Component Analysis},
-    volume = {9},
-    year = {1997}
-}
-
-@Article{Kammen88,
-  author =       "D. M. Kammen and A. L. Yuille",
-  title =        "Spontaneous Symmetry-Breaking Energy Functions and the
-                 Emergence of Orientation Selective Cortical Cells",
-  journal =      biocyb,
-  volume =       "59",
-  pages =        "23--31",
-  year =         "1988",
-}
-
-@InProceedings{Kammerer89,
-  author =       "B. K. Kammerer and W. A. Kupper",
-  booktitle =    ijcnn,
-  title =        "Design of Hierarchical Perceptron Structures and their
-                 Application to the Task of Isolated Word Recognition",
-  address =      "Washington D.C.",
-  year =         "1989",
-}
-
-@Book{Kandel85,
-  author =       "E. R. Kandel and J. H. Schwartz",
-  title =        "Principles of Neural Science",
-  publisher =    "Elsevier",
-  address =      "New York",
-  edition =      "2",
-  year =         "1985",
-}
-
-@Article{Kanter87,
-  author =       "I. Kanter and H. Sompolinsky",
-  title =        "Associative Recall of Memory Without Errors",
-  journal =      prA,
-  volume =       "35",
-  pages =        "380--392",
-  year =         "1987",
-}
-
-@inproceedings{KarklinY2003,
-  author    = {Yan Karklin and
-               Michael S. Lewicki},
-  title     = {A Model for Learning Variance Components of Natural Images},
-  year      = {2003},
-  pages     = {1367--1374},
-  editor =       NIPS15ed,
-  booktitle =    NIPS15,
-  publisher =    "{MIT} Press",
-}
-
-@Article{Karmin90,
-  author =       "E. D. Karnin",
-  title =        "A simple procedure for pruning back-propagation
-                 trained neural networks",
-  journal =      ieeetrnn,
-  volume =       "1",
-  number =       "2",
-  pages =        "239--242",
-  year =         "1990",
-}
-
-@Article{Karplus97,
-  author =       "K. Karplus and K. Sjolander and C. Barrett and M.
-                 Cline and D. Haussler and R. Hughey and L. Holm and C.
-                 Sander",
-  title =        "Predicting protein structure using hidden {Markov}
-                 models",
-  journal =      "Proteins: Structure, Function and Genetics",
-  volume =       "S 1",
-  number =       "1",
-  pages =        "134--139",
-  year =         "1997",
-}
-
-@PhdThesis{KasselR1995,
-  author = 	 {Robert Kassel},
-  title = 	 {A Comparison of Approaches to On-line Handwritten Character Recognition},
-  school = 	 {MIT Spoken Language Systems Group},
-  year = 	 {1995},
-}
-
-@Article{Katz87,
-  author =       "Slava M. Katz",
-  title =        "Estimation of Probabilities from Sparse Data for the
-                 Language Model Component of a Speech Recognizer",
-  journal =      "IEEE Transactions on Acoustics, Speech, and Signal
-                 Processing",
-  volume =       "ASSP-35",
-  number =       "3",
-  pages =        "400--401",
-  month =        mar,
-  year =         "1987",
-}
-
-@InCollection{Kaul,
-  author =       "G. Kaul",
-  editor =       "G. S. Maddala and C. R. Rao",
-  booktitle =    "Handbook of Statistics, Vol. 14",
-  title =        "Predictable Components in Stock Returns",
-  publisher =    "Elsevier Science",
-  pages =        "269--296",
-  year =         "1996",
-}
-
-@InProceedings{kbnn-craven.mlc93,
-  author =       "Mark W. Craven and Jude W. Shavlik",
-  booktitle =    "Proceedings of the Tenth International Conference on
-                 Machine Learning",
-  title =        "Learning Symbolic Rules Using Artificial Neural
-                 Networks",
-  publisher =    "Morgan Kaufmann",
-  address =      "Amherst, MA",
-  pages =        "73--80",
-  year =         "1993",
-}
-
-@InProceedings{kbnn-maclin.aaai92,
-  author =       "R. Maclin and J. Shavlik",
-  booktitle =    "Proceedings of the Tenth National Conference on
-                 Artificial Intelligence",
-  title =        "Using Knowledge-Based Neural Networks to Improve
-                 Algorithms: Refining the Chou-Fasman Algorithm for
-                 Protein Folding",
-  address =      "San Jose, CA",
-  pages =        "165--170",
-  year =         "1992",
-}
-
-@TechReport{kbnn-maclin.mlrgwp91,
-  author =       "R. Maclin and J. W. Shavlik",
-  title =        "Refining Algorithms with Knowledge-Based Neural
-                 Networks: Improving the Chou-Fasman Algorithm for
-                 Protein Folding",
-  number =       "Machine Learning Research Group Working Paper 91-2",
-  institution =  "Department of Computer Sciences, University of
-                 Wisconsin",
-  year =         "1991",
-  note =         "also in Computational Learning Theory and Natural
-                 Learning Systems, volume 1, S. Hanson, G. Drastal, and
-                 R. Rivest, (eds.), MIT Press",
-}
-
-@InProceedings{kbnn-noordewier.nips3,
-  author =       "Michiel O. Noordewier and Geoffrey G. Towell and Jude
-                 W. Shavlik",
-  editor =       NIPS3ed,
-  booktitle =    NIPS3,
-  title =        "Training Knowledge-Based Neural Networks to Recognize
-                 Genes in {DNA} Sequences",
-  publisher =    "Morgan Kaufmann",
-  address =      "Denver, CO",
-  pages =        "530--536",
-  year =         "1991",
-}
-
-@InProceedings{kbnn-opitz.ijcai93,
-  author =       "D. W. Opitz and J. W. Shavlik",
-  booktitle =    "Proceedings of the Thirteenth International Joint
-                 Conference on Artificial Intelligence",
-  title =        "Heuristically Expanding Knowledge-Based Neural
-                 Networks",
-  address =      "Chambery, France",
-  month =        sep,
-  year =         "1993",
-}
-
-@TechReport{kbnn-opitz.mlrgwp92,
-  author =       "D. W. Opitz and J. W. Shavlik",
-  title =        "Using Heuristic Search to Expand Knowledge-Based
-                 Neural Networks",
-  number =       "Machine Learning Research Group Working Paper 92-1",
-  institution =  "Department of Computer Sciences, University of
-                 Wisconsin",
-  year =         "1992",
-  note =         "(also in Computational Learning Theory and Natural
-                 Learning Systems, volume 3, T. Petsche, S. Judd, and S.
-                 Hanson, (eds.), MIT Press)",
-}
-
-@TechReport{kbnn-shavlik.tr92,
-  author =       "J. W. Shavlik",
-  title =        "A Framework for Combining Symbolic and Neural
-                 Learning",
-  number =       "UW TR 1123",
-  institution =  "Department of Computer Sciences, University of
-                 Wisconsin",
-  year =         "1992",
-  note =         "(a shorter version will appear in Machine Learning)",
-}
-
-@InProceedings{kbnn-towell.aaai90,
-  author =       "G. G. Towell and J. W. Shavlik and M. O. Noordewier",
-  booktitle =    "Proceedings of the Eighth National Conference on
-                 Artificial Intelligence",
-  title =        "Refinement of Approximate Domain Theories by
-                 Knowledge-Based Neural Networks",
-  address =      "Boston, MA",
-  pages =        "861--866",
-  year =         "1990",
-}
-
-@InProceedings{kbnn-towell.aaai92,
-  author =       "G. Towell and J. Shavlik",
-  booktitle =    "Proceedings of the Tenth National Conference on
-                 Artificial Intelligence",
-  title =        "Using Symbolic Learning to Improve Knowledge-Based
-                 Neural Networks",
-  address =      "San Jose, CA",
-  pages =        "177--182",
-  year =         "1992",
-}
-
-@Article{kbnn-towell.aij94,
-  author =       "Geoffrey G. Towell and Jude W. Shavlik",
-  title =        "Knowledge-Based Neural Networks",
-  journal =      "Artificial Intelligence",
-  year =         "1994",
-  note =         "undergoing 2nd review",
-}
-
-@InCollection{kbnn-towell.ml493,
-  author =       "Geoffrey G. Towell and Jude W. Shavlik",
-  editor =       "R. S. Michalski and G. Tecuci",
-  booktitle =    "Machine Learning: An Integrated Approach",
-  title =        "Refining Symbolic Knowledge Using Neural Networks",
-  volume =       "4",
-  publisher =    "Morgan Kaufmann",
-  address =      "San Mateo, CA",
-  year =         "1993",
-}
-
-@InProceedings{kbnn-towell.mlc91,
-  author =       "Geoffrey G. Towell and Mark W. Craven and Jude W.
-                 Shavlik",
-  booktitle =    "Proceedings of the Eighth International Machine
-                 Learning Workshop",
-  title =        "Constructive Induction in Knowledge-Based Neural
-                 Networks",
-  publisher =    "Morgan Kaufmann",
-  address =      "Evanston, IL",
-  pages =        "213--217",
-  year =         "1991",
-}
-
-@Article{kbnn-towell.mlj93,
-  author =       "Geoffrey G. Towell and Jude W. Shavlik",
-  title =        "The Extraction of Refined Rules from Knowledge-Based
-                 Neural Networks",
-  journal =      "Machine Learning",
-  volume =       "13",
-  number =       "1",
-  pages =        "71--101",
-  year =         "1993",
-}
-
-@InProceedings{kbnn-towell.nips4,
-  author =       "Geoffrey G. Towell and Jude W. Shavlik",
-  editor =       NIPS4ed,
-  booktitle =    NIPS4,
-  title =        "Interpretation of Artificial Neural Networks: Mapping
-                 knowledge-based Neural Networks into Rules",
-  publisher =    "Morgan Kaufmann",
-  address =      "Denver, CO",
-  year =         "1992",
-}
-
-@PhdThesis{kbnn-towell.thesis,
-  author =       "Geoffrey G. Towell",
-  title =        "Symbolic Knowledge and Neural Networks: Insertion,
-                 Refinement and Extraction",
-  school =       "University of Wisconsin -- Madison",
-  year =         "1991",
-  note =         "(Also appears as UW Technical Report 1072 [out of
-                 print].)",
-}
-
-@InProceedings{Kearns+Ron97,
-  author =       "Michael Kearns and Dana Ron",
-  booktitle =    "Tenth Annual Conference on Computational Learning
-                 Theory",
-  title =        "Algorithmic Stability and Sanity-Check Bounds for
-                 Leave-One-Out Cross-Validation",
-  publisher =    "Morgan Kaufmann",
-  pages =        "152--162",
-  year =         "1997",
-}
-
-@InCollection{keeler-rumelhart-91,
-  author =       "J. Keeler and D. Rumelhart and W.-K. Leow",
-  editor =       NIPS3ed,
-  booktitle =    NIPS3,
-  title =        "Integrated segmentation and recognition of
-                 hand-printed numerals",
-  publisher =    "Morgan Kaufmann Publishers, San Mateo, CA",
-  pages =        "557--563",
-  year =         "1991",
-}
-
-@Article{Keerthi+Lin-2003,
-  author =       "S. Sathiya Keerthi and Chih-Jen Lin",
-  title =        "Asymptotic Behaviors of Support Vector Machines with
-                 {Gaussian} Kernel",
-  journal =      "Neural Computation",
-  volume =       "15",
-  number =       "7",
-  pages =        "1667--1689",
-  year =         "2003",
-}
-
-@InCollection{Kegl-2003,
-  author =       "Bal\'{a}zs K\'{e}gl",
-  editor =       NIPS15ed,
-  booktitle =    NIPS15,
-  title =        "Intrinsic Dimension Estimation Using Packing Numbers",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  pages =        "681--688",
-  year =         "2003",
-}
-
-@Article{Kegl-Krzyzak-2002,
-  author =       "B. Kegl and A. Krzyzak",
-  title =        "Piecewise linear skeletonization using principal
-                 curves",
-  journal =      "{IEEE} Transactions on Pattern Analysis and Machine
-                 Intelligence",
-  volume =       "24",
-  number =       "1",
-  pages =        "59--74",
-  year =         "2002",
-}
-
-@InProceedings{Kegl2003,
-  author =       "B. Kegl",
-  editor =       NIPS15ed,
-  booktitle =    NIPS15,
-  title =        "Intrinsic dimension estimation using packing numbers",
-  publisher =    "The {MIT} Press",
-  year =         "2003",
-}
-
-@InCollection{kegl2005,
-  author =       "Bal\'{a}zs K\'{e}gl and Ligen Wang",
-  editor =       NIPS17ed,
-  booktitle =    NIPS17,
-  title =        "Boosting on Manifolds: Adaptive Regularization of Base
-                 Classifiers",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  year =         "2005",
-}
-
-@TechReport{Kehagias89,
-  author =       "A. Kehagias",
-  title =        "Stochastic Recurrent Networks: Prediction and
-                 Classification of Time Series",
-  institution =  "Brown University. Division of Applied Mathematics",
-  address =      "Providence, RI 02912",
-  year =         "1991",
-}
-
-@InProceedings{KellerM2005,
-  author =       "M. Keller and S. Bengio",
-  booktitle =    "Proceedings of the 15th International Conference on
-                 Artificial Neural Networks: Biological Inspirations,
-                 ICANN, Lecture Notes in Computer Science",
-  title =        "A neural network for text representation",
-  volume =       "LNCS 3697",
-  pages =        "667--672",
-  year =         "2005",
-  teditor =      "Springer-Verlag",
-}
-
-@inproceedings{Keller2007,
- author = {Katherine A. Heller and Zoubin Ghahramani}, 
- booktitle =    aistats07,
- year = 2007, 
- title = {A Nonparametric Bayesian Approach to Modeling Overlapping Clusters},
- publisher =    "Omnipress",
- date =         "March 21-24, 2007",
- address =      "San Juan, Puerto Rico",
- pages =        "187--194",
-}
-
-@inproceedings{Keller2008,
- author = {Katherine A. Heller and Sinead Williamson and Zoubin Ghahramani}, 
- year = 2008, 
- title = {Statistical models for partial membership}, 
- booktitle = ICML08,
- editor =    ICML08ed,
- publisher = ICML08publ,
- location =  {Helsinki, Finland},
- pages = "392--399",
-}
-
-@Book{Kelly1975,
-  author =       "Edward Kelly and Philip Stone",
-  title =        "Computer recognition of {English} word senses",
-  publisher =    "North-Holland Linguistics Series",
-  year =         "1975",
-}
-
-@InProceedings{Kemp+al-2004,
-  author =       "C. Kemp and T. L. Griffiths and S. Stromsten and J. B.
-                 Tenenbaum",
-  editor =       NIPS16ed,
-  booktitle =    NIPS16,
-  title =        "Semi-supervised learning with trees",
-  publisher =    "{MIT} Press",
-  address =      "Cambridge, MA",
-  year =         "2004",
-}
-
-@inproceedings{Kerr2007,
- author = {Wesley Kerr and Shane Hoversten and Daniel Hewlett and Paul R. Cohen and Yu-Han Chang},
- title = {Learning in Wubble World},
- booktitle = {Proceedings of the IEEE Int. Conference on Development and Learning},
- year = 2007,
-}
-
-@Article{Kerszberg90,
-  author =       "M. Kerszberg and A. Zippelius",
-  title =        "Synchronization in Neural Assemblies",
-  journal =      pscrip,
-  volume =       "T33",
-  pages =        "54--64",
-  year =         "1990",
-}
-
-@InProceedings{Keysers2000,
-  author =       "D. Keysers and J. Dahmen and H. Ney",
-  booktitle =    "22nd Symposium of the German Association for Pattern
-                 Recognition",
-  title =        "A probabilistic view on tangent distance",
-  address =      "Kiel, Germany",
-  year =         "2000",
-}
-
-@Book{Khalil92,
-  author =       "Hassan K. Khalil",
-  title =        "Nonlinear Systems",
-  publisher =    "Macmillan Publishing Company",
-  address =      "New York",
-  year =         "1992",
-}
-
-@Book{Kiang65,
-  author =       "N. Y. S. Kiang and T. Watanabe and E. C. Thomas and L.
-                 F. Clark",
-  title =        "Discharge patterns of single fibers in the cat's
-                 auditory nerve fiber",
-  publisher =    "Cambdrige, MA: MIT Press",
-  year =         "1965",
-}
-
-@Article{Kiefer80,
-  author =       "N. M. Kiefer",
-  title =        "A note on switching regressions and logistic
-                 discrimination",
-  journal =      "Econometrica",
-  volume =       "48",
-  pages =        "1065--1069",
-  year =         "1980",
-}
-
-@Misc{Kilgarriff2000,
-  author =       "Adam Kilgarriff and Joseph Rosenzweig",
-  title =        "English {SENSEVAL}: Report and Results",
-  year =         "2000",
-  URL =          "citeseer.nj.nec.com/335615.html",
-  text =         "A. Kilgarriff and J. Rosenzweig. English SENSEVAL:
-                 Report and Results. In Proceedings of the 2nd
-                 International Conference on Language Resources and
-                 Evaluation, LREC, Athens, Greece.",
-}
-
-@InProceedings{Kilgarriff2002,
-  author =       "Adam Kilgarriff",
-  booktitle =    "Proceedings of Senseval-2",
-  title =        "English lexical sample task description",
-  organization = "ACL workshop",
-  year =         "2002",
-}
-
-@Article{Kim94,
-  author =       "C. J. Kim",
-  title =        "Dynamical linear models with Markov-switching",
-  journal =      "Journal of Econometrics",
-  volume =       "60",
-  pages =        "1--22",
-  year =         "1994",
-}
-
-@Article{Kimeldorf-Wahba-71,
-  author =       "G. Kimeldorf and G. Wahba",
-  title =        "Some results on {Tchebychean} spline functions",
-  journal =      "Journal of Mathematics Analysis and Applications",
-  volume =       "33",
-  pages =        "82--95",
-  year =         "1971",
-}
-
-@InCollection{Kinzel90,
-  author =       "W. Kinzel and M. Opper",
-  editor =       "E. Domany and J. L. van Hemmen and K. Schulten",
-  booktitle =    "Physics of Neural Networks",
-  title =        "Dynamics of Learning",
-  volume =       "1",
-  publisher =    "Springer-Verlag",
-  address =      "Berlin",
-  year =         "1990",
-}
-
-@inproceedings{Kira+Rendell-1992,
-    author    = {Kenji Kira and Larry A. Rendell},
-    title     = {The Feature Selection Problem: Traditional Methods and a New Algorithm},
-    booktitle = {Proceedings of the Tenth National Conference on Artificial Intelligence},
-    year      = {1992},
-    pages     = {129-134},
-    bibsource = {DBLP, http://dblp.uni-trier.de}
-}
-
-@inproceedings{Kira+Rendell-1992b,
-    address = {San Francisco, CA, USA},
-    author = {Kenji Kira and Larry A. Rendell},
-    booktitle = {Proceedings of the Ninth International Conference on Machine learning},
-    isbn = {15586247X},
-    pages = {249--256},
-    posted-at = {2007-02-07 04:40:40},
-    publisher = {Morgan Kaufmann},
-    title = {A practical approach to feature selection},
-    url = {http://portal.acm.org/citation.cfm?id=142034},
-    year = {1992}
-}
-
-@Book{Kirk70,
-    author =       "D. E. Kirk",
-    title =        "Optimal Control Theory: an Introduction",
-    publisher =    "Prentice Hall",
-    address =      "Englewood Cliffs NJ",
-    year =         "1970",
-}
-
-@Book{Kirk70a,
-  author =       "D. E. Kirk",
-  title =        "Optimal Control Theory: an Introduction",
-  publisher =    "Prentice Hall",
-  address =      "Englewood Cliffs NJ",
-  year =         "1970",
-}
-
-@Article{Kirkpatrick83,
-  author =       "S. Kirkpatrick and C. D. Gelatt Jr. and and M. P.
-                 Vecchi",
-  title =        "Optimization by Simulated Annealing",
-  journal =      science,
-  volume =       "220",
-  pages =        "671--680",
-  year =         "1983",
-}
-
-@Article{Kirkpatrick85,
-  author =       "S. Kirkpatrick and G. Toulouse",
-  title =        "Configuration Space Analysis of Travelling Salesman
-                 Problems",
-  journal =      jpp,
-  volume =       "46",
-  pages =        "1277--1292",
-  year =         "1985",
-}
-
-@Book{kitagawa+gersch96,
-  author =       "G. Kitagawa and W. Gersch",
-  title =        "Smoothness priors analysis of time series",
-  publisher =    "Eds. P. Bickel and P. Diggle and S. Fienberg and K.
-                 Krickeberg and I. Olkin and W. Wermuth and S. Zeger,
-                 Lecture Notes in Statistics, volume 116",
-  year =         "1996",
-}
-
-@Article{kitagawa87,
-  author =       "G. Kitagawa",
-  title =        "Non-{Gaussian} State-Space Modeling on Nonstationary
-                 Time Series",
-  journal =      "Journal of the American Statistical Association",
-  volume =       "82",
-  number =       "400",
-  pages =        "1032--1063",
-  year =         "1987",
-}
-
-@Article{kitagawa96,
-  author =       "G. Kitagawa",
-  title =        "{Monte} {Carlo} Filter and Smoother for Non-{Gaussian}
-                 Nonlinear State Space Models",
-  journal =      "Journal of Computational Graphics and Statistics",
-  volume =       "5",
-  number =       "1",
-  pages =        "1--25",
-  year =         "1996",
-}
-
-@Article{Kivinen02,
-  author =       "J. Kivinen and A. Smola and R. Williamson",
-  title =        "Online Learning with kernels",
-  year =         "2002",
-  URL =          "citeseer.csail.mit.edu/kivinen02online.html",
-  text =         "J. Kivinen, A. Smola, and R. C. Williamson, (2002)
-                 Online Learning with kernels. Advances in Neural
-                 Information Processing Systems 14, Cambridge, MA: MIT
-                 Press (pp. 785-793).",
-}
-
-@InProceedings{Klatt82,
-  author =       "D. Klatt",
-  booktitle =    icassp,
-  title =        "Prediction of perceived phonetic distance from
-                 critical-band spectra: a first step",
-  pages =        "1278--1281",
-  year =         "1982",
-}
-
-@inproceedings{Kleinberg-2003,
-    author = "J. Kleinberg",
-    title = "An impossibility theorem for clustering",
-    editor =       NIPS15ed,
-    booktitle =    NIPS15,
-    publisher =    "MIT Press",
-    address =      "Cambridge, MA",
-    year =         "2003",
-}
-
-@Article{Kleinfeld86,
-  author =       "D. Kleinfeld",
-  title =        "Sequential State Generation by Model Neural Networks",
-  journal =      PNAS,
-  volume =       "83",
-  pages =        "9469--9473",
-  year =         "1986",
-}
-
-@InCollection{Kleinfeld89,
-  author =       "D. Kleinfeld and H. Sompolinsky",
-  editor =       "C. Koch and I. Segev",
-  booktitle =    "Methods in Neuronal Modeling: From Synapses to
-                 Networks",
-  title =        "Associative Network Models for Central Pattern
-                 Generators",
-  publisher =    "MIT Press",
-  address =      "Cambridge",
-  pages =        "195--246",
-  year =         "1989",
-}
-
-@Book{Klopf82,
-  author =       "A. H. Klopf",
-  title =        "The Hedonistic Neuron: {A} Theory of Memory, Learning,
-                 and Intelligence",
-  publisher =    "Hemisphere",
-  address =      "Washington",
-  year =         "1982",
-}
-
-@InProceedings{Kneser95,
-  author =       "Reinhard Kneser and Hermann Ney",
-  booktitle =    icassp,
-  title =        "Improved Backing-Off for {M}-Gram Language Modeling",
-  pages =        "181--184",
-  year =         "1995",
-}
-
-@Article{Koch86,
-  author =       "C. Koch and J. Marroquin and A. Yuille",
-  title =        "Analog ``Neuronal'' Networks in Early Vision",
-  journal =      PNAS,
-  volume =       "83",
-  pages =        "4263--4267",
-  year =         "1986",
-}
-
-@InProceedings{Koch88,
-  author =       "C. Koch and J. Luo and C. Mead and J. Hutchinson",
-  editor =       nips87ed,
-  booktitle =    nips87,
-  title =        "Computing Motion Using Resistive Networks",
-  publisher =    "American Institute of Physics, New York",
-  address =      "Denver, CO",
-  pages =        "422--431",
-  year =         "1988",
-}
-
-@InProceedings{Kohavi95,
-  author =       "Ron Kohavi",
-  booktitle =    "Proceeding of the Fourteenth International Joint
-                 Conference on Artificial Intelligence",
-  title =        "A Study of Cross-Validation and Bootstrap for Accuracy
-                 Estimation and Model Selection",
-  publisher =    "Morgan Kaufmann",
-  pages =        "1137--1143",
-  year =         "1995",
-}
-
-@article{Kohavi+John-1997,
-        address = {Essex, UK},
-        author = {Kohavi, Ron   and John, George  H.},
-        doi = {10.1016/S0004-3702(97)00043-X},
-        issn = {0004-3702},
-        journal = {Artificial Intelligence},
-        number = {1-2},
-        pages = {273--324},
-        publisher = {Elsevier Science Publishers Ltd.},
-        title = {Wrappers for feature subset selection},
-        url = {http://portal.acm.org/citation.cfm?id=270627},
-        volume = {97},
-        year = {1997}
-}
-
-@Article{Kohonen-ieee90,
-  author =       "T. Kohonen",
-  title =        "The Self-Organizing Map",
-  journal =      ieeeproc,
-  volume =       "78",
-  number =       "9",
-  pages =        "1464--1480",
-  year =         "1990",
-  OPTnote =      "Special Issue on Neural Networks",
-}
-
-@Article{Kohonen74,
-  author =       "T. Kohonen",
-  title =        "An Adaptive Associative Memory Principle",
-  journal =      ieeetc,
-  volume =       "C-23",
-  pages =        "444--445",
-  year =         "1974",
-}
-
-@Article{Kohonen82,
-  author =       "T. Kohonen",
-  title =        "Self-Organized Formation of Topologically Correct
-                 Feature Maps",
-  journal =      biocyb,
-  volume =       "43",
-  year =         "1982",
-}
-
-@InProceedings{Kohonen84,
-  author =       "T. Kohonen and K. M{\"a}kisara and T. Saram{\"a}ki",
-  booktitle =    "Proceedings of the Seventh International Conference on
-                 Pattern Recognition",
-  title =        "Phonotopic Maps --- Insightful Representation of
-                 Phonological Features for Speech Recognition",
-  publisher =    "IEEE, New York",
-  address =      "Montreal 1984",
-  pages =        "182--185",
-  year =         "1984",
-}
-
-@TechReport{Kohonen86lvq,
-  author =       "Teuvo Kohonen",
-  title =        "Learning Vector Quantization for Pattern Recognition",
-  type =         "Report",
-  number =       "TKK-F-A601",
-  institution =  "Helsinki University of Technology",
-  address =      "Espoo, Finland",
-  year =         "1986",
-}
-
-@InProceedings{Kohonen88,
-  author =       "T. Kohonen and G. Barna and R. Chrisley",
-  booktitle =    icnn,
-  title =        "Statistical Pattern Recognition with Neural Networks:
-                 Benchmarking Studies",
-  volume =       "1",
-  publisher =    "IEEE, New York",
-  address =      "San Diego 1988",
-  pages =        "61--68",
-  year =         "1988",
-}
-
-@Book{Kohonen89,
-  author =       "T. Kohonen",
-  title =        "Self-Organization and Associative Memory",
-  publisher =    "Springer-Verlag",
-  address =      "Berlin",
-  edition =      "3",
-  year =         "1989",
-}
-
-@Book{Kohonen-2001,
-  author =       "T. Kohonen",
-  title =        "Self-Organizing Maps",
-  publisher =    "Springer",
-  edition =      "3",
-  year =         "2001",
-}
-
-@Article{Kolchinskii2000,
-  author =       "V. Koltchinskii and E. Giné",
-  title =        "Random matrix approximation of spectra of integral
-                 operators",
-  journal =      "Bernoulli",
-  volume =       "6",
-  number =       "1",
-  pages =        "113--167",
-  year =         "2000",
-}
-
-@TechReport{Kolen+Pollack90,
-  author =       "J. F. Kolen and J. B. Pollack",
-  key =          "kolen",
-  title =        "Back propagation is sensitive to initial conditions",
-  type =         "Technical Report",
-  number =       "TR 90-{JK}-{BPSIC}",
-  institution =  "The Ohio State University",
-  year =         "1990",
-}
-
-@InProceedings{Kolen-nips94,
-  author =       "John F. Kolen",
-  editor =       NIPS6ed,
-  booktitle =    NIPS6,
-  title =        "Fool's Gold: Extracting Finite State Machines From
-                 Recurrent Network Dynamics",
-  publisher =    "Morgan Kaufmann",
-  year =         "1994",
-}
-
-@Article{Kolmogorov33,
-  author =       "A. N. Kolmogorov",
-  title =        "Sulla determinazione empirica di una leggi di
-                 distribuzione",
-  journal =      "G. Inst. Ital. Attuari",
-  volume =       "4",
-  year =         "1933",
-  note =         "translated in English in {\em Breakthroughs in
-                 Statistics}, by Kotz and Johnson (editors),
-                 Springer-Verlag, 1992",
-}
-
-@Article{Kolmogorov57,
-  author =       "A. N. Kolmogorov",
-  title =        "On the representation of continuous functions of many
-                 variables by superposition of continuous functions of
-                 one variable and addition",
-  journal =      "Kokl. Akad. Nauk USSR",
-  volume =       "114",
-  publisher =    "[translated in: American Mathematical Society
-                 Translations 28 (1963) 55--59]",
-  pages =        "953--956",
-  year =         "1957",
-}
-
-@Article{Kolmogorov65,
-  author =       "A. N. Kolmogorov",
-  title =        "Three approaches to the quantitative definition of
-                 information",
-  journal =      "Problems of Information and Transmission",
-  volume =       "1",
-  number =       "1",
-  pages =        "1--7",
-  year =         "1965",
-}
-
-@InProceedings{Koltchinskii-1998,
-  author =       "V. Koltchinskii",
-  editor =       "Eberlein and Hahn and Talagrand",
-  booktitle =    "Progress in Probability",
-  title =        "Asymptotics of Spectral Projections of Some Random
-                 Matrices Approximating Integral Operators",
-  volume =       "43",
-  publisher =    "Birkhauser",
-  address =      "Basel",
-  pages =        "191--227",
-  year =         "1998",
-}
-
-@InProceedings{Kong95,
-  author =       "Eun Bae Kong and Thomas G. Dietterich",
-  booktitle =    "International Conference on Machine Learning",
-  title =        "Error-Correcting Output Coding Corrects Bias and
-                 Variance",
-  pages =        "313--321",
-  year =         "1995",
-}
-
-@InProceedings{Konig96,
-  author =       "Y. Konig and H. Bourlard and N. Morgan",
-  editor =       NIPS8ed,
-  booktitle =    NIPS8,
-  title =        "{REMAP}: Recursive Estimation and Maximization of {A}
-                 Posteriori Probabilities -- Application to
-                 transition-based connectionist speech recognition",
-  publisher =    "MIT Press, Cambridge, MA",
-  year =         "1996",
-}
-
-@inproceedings{Koray-08,
- title = "Learning Invariant Features through Topographic Filter Maps",
- author = "Kavukcuoglu, Koray and Ranzato, {Marc'Aurelio} and Fergus, Rob and {LeCun}, Yann",
- booktitle = cvpr09,
- publisher = "IEEE",
- year = "2009"
-}
-
-@techreport {koray-psd-08,
- original = "orig/koray-psd-08.pdf",
- title = "Fast Inference in Sparse Coding Algorithms with Applications to Object Recognition",
- author = "Kavukcuoglu, Koray and Ranzato, {Marc'Aurelio} and {LeCun}, Yann",
- institution = "Computational and Biological Learning Lab, Courant Institute, NYU",
- note = "Tech Report CBLL-TR-2008-12-01",
- year = "2008"
-}
-
-@article{Kouh-Poggio-2008,
-  author = {Minjoon M. Kouh and Tomaso T. Poggio},
-  title = {A Canonical Neural Circuit for Cortical Nonlinear Operations},
-  journal = {Neural Computation},
-  volume = 20,
-  number={6},
-  pages = {1427--1451},
-  year = 2008,
-}
-
-@TechReport{Kouropteva+al-2002,
-    author =       {O. Kouropteva and O. Okun and A. Hadid and M. Soriano and S. Marcos and M. Pietik{\"a}inen},
-    title =        {Beyond locally linear embedding algorithm},
-    number =       {MVG-01-2002},
-    institution =  {Department of Electrical and Information Engineering, University of Oulu},
-    address =      {Oulu, Finland},
-    year =         2002,
-}
-
-@inproceedings{Kononenko-1994,
-    author = {Kononenko, Igor},
-    booktitle = ECML94,
-    pages = {171--182},
-    editor = {F. Bergadano and L. D. Raedt},
-    title = {Estimating Attributes: Analysis and Extensions of RELIEF},
-    url = {http://citeseer.ist.psu.edu/kononenko94estimating.html},
-    year = {1994}
-}
-
-@InProceedings{Kozma96,
-  author =       "R. Kozma and M. Kitamura and S. Sato",
-  booktitle =    nipc-hmit96,
-  title =        "Monitoring of {NPP} State using Structural Adaptation
-                 in a Neural Signal Processing System",
-  volume =       "1",
-  publisher =    ans,
-  pages =        "273--278",
-  year =         "1996",
-}
-
-@Article{Kramer1991,
-  author =       "Mark Kramer",
-  title =        "Nonlinear Principal Component Analysis Using
-                 Autoassociative Neural Network",
-  journal =      "AIChE Journal",
-  volume =       "34",
-  pages =        "233--243",
-  year =         "1991",
-}
-
-@InProceedings{Kramer89,
-  author =       "A. H. Kramer and A. Sangiovanni-Vincentelli",
-  editor =       NIPS1ed,
-  booktitle =    NIPS1,
-  title =        "Efficient Parallel Learning Algorithms for Neural
-                 Networks",
-  publisher =    "Morgan Kaufmann, San Mateo",
-  address =      "Denver, CO",
-  pages =        "40--48",
-  year =         "1989",
-}
-
-@Article{Krauth89,
-  author =       "W. Krauth and M. M\'ezard",
-  title =        "The Cavity Method and the Travelling-Salesman
-                 Problem",
-  journal =      eul,
-  volume =       "8",
-  pages =        "213--218",
-  year =         "1989",
-}
-
-@Book{Kreyszig90,
-  author =       "E. Kreyszig",
-  title =        "Introductory Functional Analysis with Applications",
-  publisher =    "John Wiley \& Sons, Inc.",
-  address =      "New York, NY",
-  year =         "1990",
-}
-
-@Book{Krishnaiah82,
-  editor =       "P. R. Krishnaiah and L. N. Kanal",
-  title =        "Classification, Pattern Recognition, and Reduction of
-                 Dimensionality",
-  volume =       "2",
-  publisher =    "North Holland",
-  address =      "Amsterdam",
-  year =         "1982",
-  series =       "Handbook of Statistics",
-}
-
-@techreport{KrizhevskyHinton2009,
-    author={Alex Krizhevsky and Geoffrey Hinton},
-    title = {Learning Multiple Layers of Features from Tiny Images},
-    year = 2009,
-    chapter=3,
-    institution={University of Toronto}
-}
-
-@InProceedings{Krogh-nips8,
-  author =       "A. Krogh and S. K. Riis",
-  editor =       NIPS8ed,
-  booktitle =    NIPS8,
-  title =        "Prediction of beta sheets in proteins",
-  publisher =    "MIT Press, Cambridge, MA",
-  pages =        "917--923",
-  year =         "1996",
-}
-
-@Article{Krogh88,
-  author =       "A. Krogh and J. A. Hertz",
-  title =        "Mean Field Analysis of Hierarchical Associative
-                 Networks with Magnetization",
-  journal =      jpa,
-  volume =       "21",
-  pages =        "2211--2224",
-  year =         "1988",
-}
-
-@InProceedings{Krogh90a,
-  author =       "A. Krogh and G. I. Thorbergsson and J. A. Hertz",
-  editor =       NIPS2ed,
-  booktitle =    NIPS2,
-  title =        "A Cost Function for Internal Representations",
-  publisher =    "Morgan Kaufmann, San Mateo",
-  address =      "Denver, CO",
-  pages =        "733--740",
-  year =         "1990",
-}
-
-@InProceedings{Krogh90b,
-  author =       "A. Krogh and J. A. Hertz",
-  editor =       "R. Eckmiller and G. Hartmann and G. Hauske",
-  booktitle =    "Parallel Processing in Neural Systems and Computers",
-  title =        "Hebbian Learning of Principal Components",
-  publisher =    "Elsevier, Amsterdam",
-  address =      "D{\"u}sseldorf 1990",
-  pages =        "183--186",
-  year =         "1990",
-}
-
-@Article{Krogh94,
-  author =       "A. Krogh and M. Brown and I. S. Mian and K. Sjölander
-                 and D. Haussler",
-  title =        "Hidden Markov models in computational biology:
-                 Applications to protein modeling",
-  journal =      "Journal Molecular Biology",
-  volume =       "235",
-  pages =        "1501--1531",
-  year =         "1994",
-}
-
-@InProceedings{Krogh95,
-  author =       "A. Krogh and J. Vedelsby",
-  editor =       NIPS7ed,
-  booktitle =    NIPS7,
-  title =        "Neural network ensembles, cross validation and active
-                 learning",
-  publisher =    "Cambridge MA: MIT Press",
-  pages =        "231--238",
-  year =         "1995",
-}
-
-@Book{Krolzig97,
-  author =       "H.-M. Krolzig",
-  title =        "Markov-Switching Vector Autoregressions",
-  publisher =    "Springer",
-  year =         "1997",
-}
-
-@article{Krueger+Dayan-2009,
- author = {Kai A. Krueger and Peter Dayan},
- title = {Flexible shaping: how learning in small steps helps},
- journal = {Cognition},
- volume = 110,
- year = 2009,
- pages = {380--394},
-}
-
-@Article{Ku92,
-  author =       "C. C. Ku and K. Y. Lee and R. M. Eawards",
-  title =        "Improved Nuclear Reactor Temperature Control Using
-                 Diagonal Recurrent Neural Networks",
-  journal =      "IEEE Transactions on Nuclear Science",
-  volume =       "39",
-  pages =        "2292--2308",
-  year =         "1992",
-}
-
-@InProceedings{Kubala94,
-  author =       "F. Kubala and A. Anastasakos and J. Makhoul and L.
-                 Nguyen and R. Schwartz and G. Zavaliagkos",
-  booktitle =    icassp,
-  title =        "Comparative experiments on large vocabulary speech
-                 recognition",
-  address =      "Adelaide, Australia",
-  pages =        "561--564",
-  year =         "1994",
-}
-
-@InProceedings{Kuhn+Herzberg90,
-  author =       "G. Kuhn and N. Herzberg",
-  booktitle =    "Proc. 24th Conference on Information Sciences and
-                 Systems",
-  title =        "Variations on training of recurrent networks",
-  organization = "Princeton University",
-  address =      "NJ",
-  year =         "1990",
-}
-
-@Unpublished{Kuhn87,
-  author =       "G. Kuhn",
-  title =        "A first look at phonetic discrimination using
-                 connectionist models with recurrent links",
-  year =         "1987",
-  note =         "CCRP -- IDA SCIMP working paper No.4/87, Institute for
-                 Defense Analysis, Princeton, NJ",
-}
-
-@Article{Kuhn-et-al-90,
-  author =       "G. Kuhn and R. L. Watrous and B. Ladendorf",
-  title =        "Connected recognition with a recurrent network",
-  journal =      spcomm,
-  volume =       "9",
-  pages =        "41--49",
-  year =         "1990",
-  OPTnote =      "",
-}
-
-@Book{Kullback59,
-  author =       "S. Kullback",
-  title =        "Information Theory and Statistics",
-  publisher =    "Wiley",
-  address =      "New York",
-  year =         "1959",
-}
-
-@Book{Kumar+al-1994,
-  author =       "V. Kumar and A. Grama and A. Gupta and G. Karypis",
-  title =        "Introduction to Parallel Computing: Design and
-                 Analysis of Algorithms",
-  publisher =    "Benjamin Cummings",
-  address =      "Redwood City, CA",
-  year =         "1994",
-}
-
-@Article{Kumar+al-1994b,
-  author =       "Vipin Kumar and Shashi Shekhar and Minesh B. Amin",
-  title =        "A Scalable Parallel Formulation of the Backpropagation
-                 Algorithm for Hypercubes and Related Architectures",
-  journal =      "IEEE Transactions on Parallel and Distributed
-                 Systems",
-  volume =       "5",
-  number =       "10",
-  pages =        "1073--1090",
-  year =         "1994",
-}
-
-@InProceedings{Kundu88,
-  author =       "A. Kundu and L. R. Bahl",
-  booktitle =    icassp,
-  title =        "Recognition of handwritten script: a hidden {Markov}
-                 model based approach",
-  address =      "New York, NY",
-  pages =        "928--931",
-  year =         "1988",
-}
-
-@Article{Kuperstein88,
-  author =       "M. Kuperstein",
-  title =        "Neural model of adaptive hand-eye coordination for
-                 single postures",
-  journal =      "Science",
-  volume =       "239",
-  pages =        "1308--1311",
-  year =         "1988",
-}
-
-@Article{Kurkova95,
-  author =       "V. Kurkov\'a",
-  title =        "Approximation of functions by perceptron networks with
-                 bounded number of hidden units",
-  journal =      "Neural Networks",
-  volume =       "8",
-  pages =        "745--750",
-  year =         "1995",
-}
-
-@Book{Kushner78,
-  author =       "H. J. Kushner and D. S. Clark",
-  title =        "Stochastic Approximation Methods for Constrained and
-                 Unconstrained Systems",
-  publisher =    "Springer-Verlag",
-  address =      "New York",
-  year =         "1978",
-}
-
-@InProceedings{Kwok-Tsang-2003,
-  author =       "J. T. Kwok and I. W. Tsang",
-  booktitle =    ICML03,
-  editor =       ICML03ed,
-  publisher =    ICML03publ,
-  title =        "Learning with idealized kernels",
-  pages =        "400--407",
-  year =         "2003",
-}
-
-@InProceedings{Laaksonen97,
-  author =       "Jorma Laaksonen",
-  booktitle =    "Proceedngs of the International Conference on
-                 Artificial Neural Networks ICANN'97",
-  title =        "Local Subspace Classifier",
-  pages =        "637--642",
-  year =         "1997",
-  URL =          "http://www.cis.hut.fi/jorma/papers/abstracts.html#icann97",
-}
-
-@InProceedings{Lafferty-icml2001,
-  author =       "John Lafferty and Andrew McCallum and Fernando C. N. Pereira",
-  booktitle =    ICML01,
-  editor =       ICML01ed,
-  publisher =    ICML01publ,
-  title =        "Conditional Random Fields: Probabilistic Models for
-                 Segmenting and Labeling Sequence Data",
-  year =         "2001",
-}
-
-@article{Lai+Fyfe-2000,
-    author = {P. L. Lai and C. Fyfe},
-    title = {Kernel and Nonlinear Canonical Correlation Analysis},
-    journal = {International Journal of Neural Systems},
-    year = {2000},
-    pages = {365--377},
-    volume = 10,
-    number = 5,
-}
-
-@InProceedings{Laj92,
-  author =       "E. Laj and A. Paoloni",
-  editor =       "M. Gori",
-  booktitle =    "Proc. of the Second Workshop on Neural Networks for
-                 Speech Processing",
-  title =        "{AIDA}: The Italian Corpora",
-  publisher =    "LINT",
-  address =      "Firenze (Italy)",
-  pages =        "179--183",
-  year =         "1992",
-}
-
-@InProceedings{Lanckriet-2002,
-  author =       "G. Lanckriet and N. Cristianini and P. Bartlett and L.
-                 {El Gahoui} and M. Jordan",
-  booktitle =    ICML02,
-  editor =       ICML02ed,
-  publisher =    ICML02publ,
-  title =        "Learning the kernel matrix with semi-definite
-                 programming",
-  pages =        "323--330",
-  year =         "2002",
-}
-
-@Article{Lanckriet2004,
-  author =       "Gert R. G. Lanckriet and Nello Cristianini and Peter
-                 Bartlett and Laurent El Ghaoui and Michael I. Jordan",
-  title =        "Learning the Kernel Matrix with Semidefinite
-                 Programming",
-  journal =      jmlr,
-  volume =       "5",
-  pages =        "27--72",
-  year =         "2004",
-}
-
-@TechReport{Lang+Hinton88,
-  author =       "K. J. Lang and G. E. Hinton",
-  title =        "The development of the Time-Delay Neural Network
-                 architecture for speech recognition",
-  number =       "CMU-CS-88-152",
-  institution =  "Carnegie-Mellon University",
-  year =         "1988",
-}
-
-@Article{Langdell-00-nips,
-  author =       "S. Langdell and Y. Bengio",
-  title =        "Approximate {SVM} Solutions: a Datamining Tool",
-  journal =      "submitted to NIPS'2000",
-  year =         "2000",
-}
-
-@InProceedings{Langford+Zadrozny-2005,
-  author =       "John Langford and Bianca Zadrozny",
-  editor =       aistats05ed,
-  booktitle =    aistats05,
-  title =        "Estimating Class Membership Probabilities using
-                 Classifier Learners",
-  publisher =    "Society for Artificial Intelligence and Statistics",
-  pages =        "198--205",
-  year =         "2005",
-}
-
-@Article{Lapedes86a,
-  author =       "A. Lapedes and R. Farber",
-  title =        "A Self-Optimizing, Nonsymmetrical Neural Net for
-                 Content Addressable Memory and Pattern Recognition",
-  journal =      physicaD,
-  volume =       "22",
-  pages =        "247--259",
-  year =         "1986",
-}
-
-@InProceedings{Lapedes86b,
-  author =       "A. Lapedes and R. Farber",
-  editor =       "J. S. Denker",
-  booktitle =    snowbird,
-  title =        "Programming a Massively Parallel, Computation
-                 Universal System: Static Behavior",
-  publisher =    "American Institute of Physics, New York",
-  address =      "Snowbird 1986",
-  pages =        "283--298",
-  year =         "1986",
-}
-
-@TechReport{Lapedes87,
-  author =       "A. Lapedes and R. Farber",
-  title =        "Nonlinear Signal Processing Using Neural Networks:
-                 Prediction and System Modelling",
-  number =       "LA--UR--87--2662",
-  institution =  "Los Alamos National Laboratory",
-  address =      "Los Alamos, NM",
-  year =         "1987",
-}
-
-@InProceedings{Lapedes88,
-  author =       "A. Lapedes and R. Farber",
-  editor =       nips87ed,
-  booktitle =    nips87,
-  title =        "How Neural Nets Work",
-  publisher =    "American Institute of Physics, New York",
-  address =      "Denver, CO",
-  pages =        "442--456",
-  year =         "1988",
-}
-
-@Article{Lari90,
-  author =       "K. Lari and S. J. Young",
-  title =        "The estimation of stochastic context-free grammars
-                 using the Inside-Outside algorithm",
-  journal =      cspla,
-  volume =       "4",
-  pages =        "35--56",
-  year =         "1990",
-}
-
-@inproceedings{Tieleman08,
-    author = {Tijmen Tieleman},
-    title = {Training restricted Boltzmann machines using approximations to the likelihood gradient},
-    booktitle = ICML08,
-    editor =    ICML08ed,
-    publisher = ICML08publ,
-    location = {Helsinki, Finland},
-    year = {2008},
-    pages = {1064--1071}
-}
-
-@InProceedings{TielemanT2009,
- author =    {Tijmen Tieleman and Geoffrey Hinton},
- title =     {Using Fast Weights to Improve Persistent Contrastive Divergence},
- booktitle = ICML09,
- editor =    ICML09ed,
- publisher = ICML09publ,
- year =      "2009",
- isbn =      {978-1-60558-516-1},
- pages =     {1033--1040},
- location =  icml09loc,
- doi =       {http://doi.acm.org/10.1145/1553374.1553506},
-}
-
-@article{Larochelle-jmlr-toappear-2008,
- author = {Hugo Larochelle and Yoshua Bengio and Jerome Louradour and Pascal Lamblin},
- title = {Exploring Strategies for Training Deep Neural Networks},
- journal = jmlr,
- year = 2009,
- volume = 10,
- pages = {1--40},
-}
-
-@InProceedings{LarochelleH2007-small,
-  author =       "H. Larochelle and D. Erhan and A. Courville and
-                 J. Bergstra and Y. Bengio",
-  booktitle =    "ICML 2007",
-  title =        "An Empirical Evaluation of Deep Architectures on
-                 Problems with Many Factors of Variation",
-  year =         "2007",
-}
-
-@InProceedings{LarochelleH2007-short,
-  author =       "H. Larochelle and D. Erhan and A. Courville and
-                 J. Bergstra and Y. Bengio",
-  booktitle =    "Int. Conf. Mach. Learn.",
-  title =        "An Empirical Evaluation of Deep Architectures on
-                 Problems with Many Factors of Variation",
-  year =         "2007",
-  pages =        "473--480",
-}
-
-% Deprecated: the following entry is a duplicate of LarochelleH2007.
-@InProceedings{larochelle-icml-2007,
-  author =       "Hugo Larochelle and Dumitru Erhan and Aaron Courville
-                 and James Bergstra and Yoshua Bengio",
-  booktitle =    ICML07,
-  editor =       ICML07ed,
-  publisher =    ICML07publ,
-  title =        "An Empirical Evaluation of Deep Architectures on
-                 Problems with Many Factors of Variation",
-  pages =        "473--480",
-  location =     "Corvallis, OR",
-  year =         "2007",
-}
-  %url =          "http://www.machinelearning.org/proceedings/icml2007/papers/331.pdf",
-
-% Deprecated: the following entry is a duplicate of LarochelleH2007.
-@InProceedings{larochelle:icml07,
-  author =       "Hugo Larochelle and Dumitru Erhan and Aaron Courville and
-                 James Bergstra and Yoshua Bengio",
-  booktitle =    ICML07,
-  editor =       ICML07ed,
-  publisher =    ICML07publ,
-  title =        "An empirical evaluation of deep architectures on
-                 problems with many factors of variation",
-  pages =        "473--480",
-  year =         "2007",
-  location =     "Corvallis, OR",
-  url =          "http://www.machinelearning.org/proceedings/icml2007/papers/331.pdf",
-}
-
-@inproceedings{Larochelle+Bengio-2008-small,
-    author = "Hugo Larochelle and Yoshua Bengio",
-    title = {Classification using Discriminative Restricted {Boltzmann} Machines},
-    booktitle = {Proceedings of ICML 2008},
-    year = {2008},
-    pages = {536--543}
-}
-
-@InCollection{Larsen98,
-  author =       "Jan Larsen and Claus Svarer and Lars Nonboe Andersen
-                 and Lars Kai Hansen",
-  editor =       "G. B. Orr and K.-R. M{\"u}ller",
-  booktitle =    "Neural Networks: Tricks of the Trade",
-  title =        "Adaptive Regularization in Neural Networks Modeling",
-  publisher =    "Springer",
-  pages =        "113--132",
-  year =         "1998",
-}
-
- 
-@InProceedings{LasserreJ2006,
-  author =       "Julia A. Lasserre and Christopher M. Bishop and
-                 Thomas P. Minka",
-  booktitle =    cvpr06,
-  title =        "Principled Hybrids of Generative and Discriminative
-                 Models",
-  publisher =    "IEEE Computer Society",
-  address =      "Washington, DC, USA",
-  pages =        "87--94",
-  year =         "2006",
-  ISBN =         "0-7695-2597-0",
-  doi =          "10.1109/CVPR.2006.227",
-}
-
-
-@TechReport{Laub2003,
-  author =       "J. Laub and K.-R. M{\"u}ller",
-  title =        "Feature discovery: unraveling hidden structure in
-                 non-metric pairwise data",
-  institution =  "Fraunhofer FIRST.IDA",
-  address =      "Germany",
-  year =         "2003",
-}
-
-@Article{Lauritzen95,
-  author =       "Steffen L. Lauritzen",
-  title =        "The {EM} algorithm for graphical association models
-                 with missing data",
-  journal =      "Computational Statistics and Data Analysis",
-  volume =       "19",
-  pages =        "191--201",
-  year =         "1995",
-}
-
-@Book{Lauritzen96,
-  author =       "Steffen L. Lauritzen",
-  title =        "Graphical Models",
-  publisher =    "Clarendon Press",
-  address =      "Oxford",
-  year =         "1996",
-  ISBN =         "0-19-852219-3",
-}
-
-@Book{Lawler76,
-  author =       "E. L. Lawler",
-  title =        "Combinatorial Optimization: Networks and Matroids",
-  publisher =    "Holt-Rinehart-Winston",
-  address =      "New York",
-  year =         "1976",
-}
-
-@Book{Lawler85,
-  editor =       "E. L. Lawler and J. K. Lenstra and A. H. G. Rinnooy
-                 Kan and D. B. Shmoys",
-  title =        "The Travelling Salesman Problem",
-  publisher =    "Wiley",
-  address =      "Chichester",
-  year =         "1985",
-}
-
-@InProceedings{Lawrence-Seeger-Herbrich-2003,
-  author =       "Neil Lawrence and Matthias Seeger and Ralf Herbrich",
-  editor =       NIPS15ed,
-  booktitle =    NIPS15,
-  title =        "Fast Sparse {G}aussian Process Methods: The Informative
-                 Vector Machine",
-  publisher =    "{MIT} Press",
-  pages =        "609--616",
-  year =         "2003",
-}
-
-@Article{Lawrence00,
-  author =       "S. Lawrence and S. Fong and C. L. Giles",
-  title =        "Natural Language Grammatical Inference with Recurrent
-                 Neural Networks",
-  journal =      "IEEE Trans. on Knowledge and Data Engineering",
-  year =         "2000",
-}
-
-@InCollection{Lawrence96,
-  author =       "S. Lawrence and S. Fong and C. L. Giles",
-  editor =       "S. Wermter and E. Riloff and G. Scheler",
-  booktitle =    "Lecture Notes on Artificial Intelligence,
-                 Connectionist, Statistical and Symbolic Approaches to
-                 Learning for Natural Language Processing",
-  title =        "Natural Language Grammatical Inference: {A} Comparison
-                 of Recurrent Neural Networks and Machine Learning
-                 Methods",
-  publisher =    "Springer-Verlag, NY",
-  year =         "1996",
-}
-
-@InCollection{LawrenceN2005,
-  author =       "Neil D. {Lawrence} and Michael I. {Jordan}",
-  editor =       NIPS17ed,
-  booktitle =    NIPS17,
-  title =        "Semi-supervised Learning via {G}aussian Processes",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  pages =        "753--760",
-  year =         "2005",
-  original =     "0753-257.PDF",
-}
-
-@TechReport{LeBaron95,
-  author =       "B. LeBaron and A. S. Weigend",
-  title =        "Evaluating Neural Network Predictors by
-                 Bootstrapping",
-  number =       "CU-CS-725-94",
-  institution =  "University of Colorado, Boulder",
-  year =         "1995",
-}
-
-@Article{LeCun+98,
-  author =       "Yann {LeCun} and L{\'e}on Bottou and Yoshua Bengio and
-                 Patrick Haffner",
-  title =        "Gradient-Based Learning Applied to Document
-                 Recognition",
-  journal =      "Proceedings of the {IEEE}",
-  volume =       "86",
-  number =       "11",
-  pages =        "2278--2324",
-  month =        nov,
-  year =         "1998",
-}
-
-@InCollection{LeCun+98backprop,
-  author =       "Yann {LeCun} and L{\'e}on Bottou and Genevieve B. Orr
-                 and Klaus-Robert M{\"u}ller",
-  title =        "Efficient Backprop",
-  booktitle =    "Neural Networks, Tricks of the Trade",
-  series =       "Lecture Notes in Computer Science",
-  volume =       "1524",
-  publisher =    "Springer Verlag",
-  year =         "1998",
-}
-  %URL =          "http://leon.bottou.org/papers/lecun-98x",
-
-
-@InCollection{LeCun+98backprop-small,
-  author =       "Y. {LeCun} and L. Bottou and G. B. Orr
-                 and K. M{\"{u}}ller",
-  title =        "Efficient Backprop",
-  booktitle =    "Neural Networks, Tricks of the Trade",
-  year =         "1998",
-}
-
-
-@InProceedings{lecun-04,
-  author =       "Yann {LeCun} and Fu-Jie Huang and L{\'e}on Bottou",
-  booktitle =    cvpr04,
-  title =        "Learning Methods for Generic Object Recognition with
-                 Invariance to Pose and Lighting",
-  volume = {2},
-  year =         "2004",
-  issn = {1063-6919},
-  pages = {97--104},
-  doi = {10.1109/CVPR.2004.144},
-  publisher = {IEEE Computer Society},
-  address = {Los Alamitos, CA, USA},
-}
-
-@InProceedings{LeCun-cp89,
-  author =       "Yann {LeCun}",
-  booktitle =    "Connectionism in Perspective",
-  title =        "Generalization and Network Design Strategies",
-  publisher =    "Elsevier Publishers",
-  year =         "1989",
-}
-
-@InCollection{LeCun-dsbo86,
-  author =       "Yann {LeCun}",
-  editor =       "F. Fogelman-Souli\'e and E. Bienenstock and G.
-                 Weisbuch",
-  booktitle =    "Disordered Systems and Biological Organization",
-  title =        "Learning Processes in an Asymmetric Threshold
-                 Network",
-  publisher =    "Springer-Verlag",
-  address =      "Les Houches, France",
-  pages =        "233--240",
-  year =         "1986",
-}
-
-@InProceedings{lecun-huang-05,
-  author =       "Yann {LeCun} and {Fu Jie} Huang",
-  editor =       aistats05ed,
-  booktitle =    aistats05,
-  title =        "Loss Functions for Discriminative Training of
-                 Energy-Based Models",
-  date =         "Jan 6-8, 2005",
-  location =     "Savannah Hotel, Barbados",
-  year =         "2005",
-}
-
-@Misc{LeCun-nips93-tutorial,
-  author =       "Yann {LeCun}",
-  title =        "Efficient learning and second-order methods",
-  year =         "1993",
-  note =         "Tutorial presented at NIPS'93, Denver, CO",
-}
-
-@PhdThesis{Lecun-these87,
-  author =       "Yann {LeCun}",
-  title =        "Mod{\`e}les connexionnistes de l'apprentissage",
-  school =       "Universit{\'e} de Paris VI",
-  year =         "1987",
-}
-
-@InCollection{lecun2006,
-  author =       "Yann {LeCun} and Sumit Chopra and Raia Hadsell and
-                 Marc-Aurelio Ranzato and Fu-Jie Huang",
-  editor =       "G. Bakir and T. Hofmann and B. Sch{\"o}lkopf and A. Smola
-                 and B. Taskar",
-  booktitle =    "Predicting Structured Data",
-  title =        "A Tutorial on Energy-Based Learning",
-  publisher =    "MIT Press",
-  pages =        "191--246",
-  year =         "2006",
-}
-
-@InProceedings{LeCun85,
-  author =       "Yann {LeCun}",
-  booktitle =    "Cognitiva 85: A la Fronti\`ere de l'Intelligence
-                 Artificielle, des Sciences de la Connaissance et des
-                 Neurosciences",
-  title =        "Une Proc\'edure d'Apprentissage pour {R}\'eseau \`a
-                 Seuil Assym\'etrique",
-  publisher =    "CESTA, Paris",
-  address =      "Paris 1985",
-  pages =        "599--604",
-  year =         "1985",
-}
-
-@InCollection{LeCun86,
-  author =       "Yann {LeCun}",
-  editor =       "E. Bienenstock and F. Fogelman-Souli\'e and G.
-                 Weisbuch",
-  booktitle =    "Disordered Systems and Biological Organization",
-  title =        "Learning Processes in an Asymmetric Threshold
-                 Network",
-  publisher =    "Springer-Verlag, Berlin",
-  address =      "Les Houches 1985",
-  pages =        "233--240",
-  year =         "1986",
-}
-
-@Article{LeCun89,
-  author =       "Yann {LeCun} and Bernhard Boser and John S. Denker and Donnie
-                 Henderson and Richard E. Howard and Wayne Hubbard and Lawrence D.
-                 Jackel",
-  title =        "Backpropagation Applied to Handwritten Zip Code
-                 Recognition",
-  journal =      nc,
-  volume =       "1",
-  number =       "4",
-  pages =        "541--551",
-  year =         "1989",
-}
-
-@TechReport{LeCun89a,
-  author =       "Yann {LeCun}",
-  key =          "LeCun",
-  title =        "Generalization and Network Design Strategies",
-  type =         "Technical Report",
-  number =       "CRG-TR-89-4",
-  institution =  "University of Toronto",
-  year =         "1989",
-}
-
-@Article{LeCun89d,
-  author =       "Yann {LeCun} and Lawrence D. Jackel and B. Boser and J.
-                 S. Denker and Hans P. Graf and I. Guyon and D.
-                 Henderson and R. E. Howard and W. Hubbard",
-  title =        "Handwritten Digit recognition: Applications of Neural
-                 Network Chips and Automatic Learning",
-  journal =      "IEEE Communications Magazine",
-  volume =       "27",
-  number =       "11",
-  pages =        "41--46",
-  month =        nov,
-  year =         "1989",
-}
-
-@InProceedings{LeCun90a,
-  author =       "Y. {LeCun} and B. Boser and J. S. Denker and D.
-                 Henderson and R. E. Howard and W. Hubbard and L. D.
-                 Jackel",
-  editor =       NIPS2ed,
-  booktitle =    NIPS2,
-  title =        "Handwritten Digit Recognition with a Back-Propagation
-                 Network",
-  publisher =    "Morgan Kaufmann, San Mateo",
-  address =      "Denver, CO",
-  pages =        "396--404",
-  year =         "1990",
-}
-
-@InProceedings{LeCun90b,
-  author =       "Y. {LeCun} and J. S. Denker and S. A. Solla",
-  editor =       NIPS2ed,
-  booktitle =    NIPS2,
-  title =        "Optimal Brain Damage",
-  publisher =    "Morgan Kaufmann, San Mateo",
-  address =      "Denver, CO",
-  pages =        "598--605",
-  year =         "1990",
-}
-
-@InProceedings{LeCun90c,
-  author =       "Y. LeCun and Y. Matan and B. Boser and J. S. Denker
-                 and D. Henderson and R. E. Howard and W. Hubbard and L.
-                 D. Jackel and H. S. Baird",
-  editor =       "IAPR",
-  booktitle =    "International Conference on Pattern Recognition",
-  title =        "Handwritten Zip Code Recognition with Multilayer
-                 Networks",
-  publisher =    "IEEE",
-  address =      "Atlantic City",
-  year =         "1990",
-}
-
-@InProceedings{LeCun91,
-  author =       "Y. {LeCun} and I. Kanter and S. Solla",
-  editor =       NIPS3ed,
-  booktitle =    NIPS3,
-  title =        "Second order properties of error surfaces: learning
-                 time, generalization",
-  publisher =    "Morgan Kaufmann",
-  address =      "Denver, CO",
-  pages =        "918--924",
-  year =         "1991",
-}
-
-@InCollection{LeCun93,
-  author =       "Y. {LeCun} and P. Simard and B. Pearlmutter",
-  editor =       NIPS5ed,
-  booktitle =    NIPS5,
-  title =        "Automatic learning rate maximization by on-line
-                 estimation of the {Hessian}'s eigenvectors",
-  publisher =    "Morgan Kaufmann Publishers, San Mateo, CA",
-  pages =        "156--163",
-  year =         "1993",
-}
-
-@InProceedings{LeCun94b,
-  author =       "Yann {LeCun} and Yoshua Bengio",
-  editor =       "IEEE",
-  booktitle =    ICPR94,
-  title =        "Word-Level Training of a Handwritten Word Recognizer
-                 based on Convolutional Neural Networks",
-  address =      "Jerusalem 1994",
-  year =         "1994",
-}
-
-@Article{LeCun98-small,
-  author =       "Y. {LeCun} and L. Bottou and Y. Bengio and
-                 P. Haffner",
-  title =        "Gradient Based Learning Applied to Document
-                 Recognition",
-  journal =      "IEEE",
-  volume =       "86",
-  number =       "11",
-  pages =        "2278--2324",
-  month =        nov,
-  year =         "1998",
-}
-
-@InCollection{LeCun98-tricks,
-  author =       "Y. {LeCun} and L. Bottou and G. B. Orr and K.-R.
-                 M{\"u}ller",
-  editor =       "G. B. Orr and K.-R. M{\"u}ller",
-  booktitle =    "Neural Networks: Tricks of the Trade",
-  title =        "Efficient {BackProp}",
-  publisher =    "Springer",
-  pages =        "9--50",
-  year =         "1998",
-}
-
-@TechReport{LeCun-TR,
-  author =       "Yann {LeCun}",
-  key =          "Lecun",
-  title =        "Generalization and Network Design Strategies",
-  number =       "CRG-TR-89-4",
-  institution =  "Department of Computer Science, University of
-                 Toronto",
-  year =         "1989",
-}
-
-@Article{Lee+Hon89,
-  author =       "Kai-Fu Lee and Hsiao-Wuen Hon",
-  title =        "Speaker-independent phone recognition using hidden
-                 {Markov} models",
-  journal =      "IEEE Trans. on Acoustics, Speech and Signal
-                 Processing",
-  volume =       "37",
-  number =       "11",
-  pages =        "1641--1648",
-  month =        nov,
-  year =         "1989",
-}
-
-@Article{Lee+Lewicki-2002,
-  author =       "T-W. Lee and M. S. Lewicki",
-  title =        "Unsupervised classification segmentation and
-                 enhancement of images using {ICA} mixture models",
-  journal =      "IEEE Trans. Image Proc.",
-  volume =       "11",
-  number =       "3",
-  pages =        "270--279",
-  year =         "2002",
-}
-
-@InCollection{Lee-2008,
-  author =       "Honglak Lee and Chaitanya Ekanadham and Andrew Ng",
-  editor =       NIPS20ed,
-  booktitle =    NIPS20,
-  title =        "Sparse deep belief net model for visual area {V}2",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  pages = {873--880},
-  year =         "2008",
-}
-
-@Book{Lee91,
-  author =       "Kai-Fu Lee",
-  title =        "Automatic Speech Recognition: the development of the
-                 {SPHINX} system",
-  publisher =    "Kluwer Academic Publ.",
-  year =         "1989",
-}
-
-@article{Lee-1996,
-    author = "Tai Sing Lee",
-    title = "Image Representation Using {2D} {Gabor} Wavelets",
-    journal = "IEEE Transactions on Pattern Analysis and Machine Intelligence",
-    volume = "18",
-    number = "10",
-    pages = "959--971",
-    year = "1996",
-}
-
-@InProceedings{Lee99a,
-  author =       "Lillian Lee",
-  booktitle =    "ACL99",
-  title =        "Measures of Distributional Similarity",
-  pages =        "25--32",
-  year =         "1999",
-}
-
-@InProceedings{Lee99b,
-  author =       "Lillian Lee and Fernando Pereira",
-  title =        "Distributional Similarity Models: Clustering vs.
-                 Nearest Neighbours",
-  booktitle =    "ACL99",
-  pages =        "33--40",
-  year =         "1999",
-}
-
-@article{Lee+Mumford-2003,
- author = {Tai-Sing Lee and David Mumford},
- title = {Hierarchical {Bayesian} inference in the visual cortex},
- year = 2003,
- journal = {Journal of the Optical Society of America A},
- volume = 20,
- number = 7,
- pages = {1434--1448},
-}
-
-
-@Article{Leitch91,
-  author =       "G. Leitch and J. E. Tanner",
-  title =        "Economic Forecast Evaluation: Profits Versus The
-                 Conventional Error Measures",
-  journal =      "The American Economic Review",
-  pages =        "580--590",
-  year =         "1991",
-}
-
-@Article{Lengelle+Denoeux96,
-  author =       "R{\'e}gis Lengell{\'e} and Thierry Denoeux",
-  title =        "Training {MLP}s layer by layer using an objective
-                 function for internal representations",
-  journal =      "Neural Networks",
-  volume =       "9",
-  pages =        "83--97",
-  year =         "1996",
-}
-
-@InProceedings{Leprieur95,
-  author =       "H. Leprieur and P. Haffner",
-  booktitle =    "EUROSPEECH'95",
-  title =        "Discriminant learning with minimum memory loss for
-                 improved non-vocabulary rejection",
-  address =      "Madrid, Spain",
-  year =         "1995",
-}
-
-@Book{lerdahl+jackendoff-1983,
-  author =       "F. Lerdahl and R. Jackendoff",
-  title =        "A {Generative} {Theory} of {Tonal} {Music}",
-  publisher =    "MIT Press",
-  address =      "Cambridge, Mass.",
-  year =         "1983",
-}
-
-@InCollection{LeRoux+al-tonga-2008,
-  author =       "Nicolas {Le Roux} and Pierre-Antoine Manzagol and
-                 Yoshua Bengio",
-  editor =       NIPS20ed,
-  booktitle =    NIPS20,
-  title =        "Topmoumoute online natural gradient algorithm",
-  publisher =    "{MIT} Press",
-  address =      "Cambridge, MA",
-  pages =        "849--856",
-  year =         "2008",
-}
-
-@InCollection{LeRoux+al-tonga-2008-small,
-  author =       "Nicolas {Le Roux} and Pierre-Antoine Manzagol and
-                 Yoshua Bengio",
-  booktitle =    "NIPS 20",
-  title =        "Topmoumoute online natural gradient algorithm",
-  pages =        "849--856",
-  year =         "2008",
-}
-
-@TechReport{LeRoux-comb-dens-2005,
-  author =       "Nicolas {Le Roux} and Yoshua Bengio and R\'ejean
-                 Ducharme",
-  title =        "Combining density estimators to improve classification
-                 accuracy",
-  number =       "1261",
-  institution =  "D\'epartement d'informatique et recherche
-                 op\'erationnelle, Universit\'e de Montr\'eal",
-  year =         "2005",
-}
-
-@InProceedings{LeRoux-continuous-short,
-  author =       "Nicolas {Le Roux} and Yoshua Bengio",
-  booktitle =    aistats07,
-  title =        "Continuous Neural Networks",
-  year =         "2007",
-  date =         "March 21-24, 2007",
-}
-
-@InProceedings{Lesk1986,
-  author =       "Michael E. Lesk",
-  booktitle =    "SIGDOC Conference",
-  title =        "Automatic sense disambiguation using machine readable
-                 dictionaries: How to tell a pine cone from an ice cream
-                 cone",
-  address =      "Toronto, Canada",
-  year =         "1986",
-}
-
-@InProceedings{Leung92,
-  author =       "H. C. Leung and I. L. Hetherington and V. W. Zue",
-  booktitle =    icassp,
-  title =        "Speech recognition using stochastic segment neural
-                 networks",
-  volume =       "1",
-  institution =  "Lab. for Comput. Sci., MIT, Cambridge, MA, USA",
-  publisher =    "IEEE",
-  address =      "New York, NY, USA",
-  pages =        "613--616",
-  year =         "1992",
-}
-
-@Article{Levenberg44,
-  author =       "K. Levenberg",
-  title =        "A method for the solution of certain non-linear
-                 problems in least squares",
-  journal =      "Quarterly of Applied Mathematics",
-  volume =       "2",
-  number =       "2",
-  pages =        "164--168",
-  year =         "1944",
-}
-
-@InProceedings{Levin90,
-  author =       "E. Levin",
-  booktitle =    icassp,
-  title =        "Word Recognition using Hidden Control Neural
-                 Architecture",
-  address =      "Albuquerque, NM",
-  pages =        "433--436",
-  year =         "1990",
-}
-
-@InProceedings{Levin92,
-  author =       "E. Levin and R. Pieraccini and E. Bocchieri",
-  editor =       NIPS4ed,
-  booktitle =    NIPS4,
-  title =        "Time-Warping Network: a Hybrid Framework for Speech
-                 Recognition",
-  address =      "Denver, CO",
-  pages =        "151--158",
-  year =         "1992",
-}
-
-@Article{Levinson83,
-  author =       "S. E. Levinson and L. R. Rabiner and M. M. Sondhi",
-  title =        "An Introduction to the Application of the Theory of
-                 Probabilistic Functions of a {Markov} Process to
-                 Automatic Speech Recognition",
-  journal =      "Bell System Technical Journal",
-  volume =       "62",
-  number =       "4",
-  pages =        "1035--1074",
-  year =         "1983",
-}
-
-@InCollection{Levinson96,
-  author =       "S. E. Levinson",
-  editor =       "R. A. Cole and J. Mariani and H. Uszkoreit and A.
-                 Zaenen and V. Zue",
-  booktitle =    "Survey of the State of the Art in Human Language
-                 Technology",
-  title =        "Statistical Modeling and Classification",
-  publisher =    "Cambridge University Press",
-  url =          "http://www.cse.ogi.edu/CSLU/HLTsurvey/HLTsurvey.html",
-  pages =        "395--401",
-  year =         "1996",
-}
-
-@phdthesis{Levner2008,
-  author = {Ilya Levner},
-  title = {Data Driven Object Segmentation},
-  school = {Department of Computer Science, University of Alberta},
-  year = 2008,
-}
-
-@InProceedings{Lewicki+Sejnowski-97,
-  author =       "Michael Lewicki and Terry Sejnowski",
-  editor =       NIPS10ed,
-  booktitle =    NIPS10,
-  title =        "Learning nonlinear overcomplete representations for
-                 efficient coding",
-  publisher =    "MIT Press",
-  isbn = {0-262-10076-2},
-  location = {Denver, Colorado, United States},
-  address = {Cambridge, MA, USA},
-  pages =        "556--562",
-  year =         "1998",
-}
-
-@article{Lewicki+Sejnowski-2000,
-    author = {Michael S. Lewicki and Terrence J. Sejnowski},
-    title = {Learning Overcomplete Representations},
-    journal = {Neural Computation},
-    volume = {12},
-    number = {2},
-    year = {2000},
-    issn = {0899-7667},
-    pages = {337--365},
-    doi = {10.1162/089976600300015826},
-    publisher = {MIT Press},
-    address = {Cambridge, MA, USA},
-}
-
-@InProceedings{LewisC62,
-    author =       "P. M. {Lewis II} and C. L. Coates",
-    title =        "A realization procedure for threshold gate networks",
-    crossref =     "FOCS3",
-    pages =        "159--168",
-    url =          "http://theory.lcs.mit.edu/~dmjones/FOCS/focs.bib",
-}
-
-@Article{lheureux-04-small,
-  author =       "P.-J. {L'Heureux} and J. Carreau and Y. Bengio and O.
-                 Delalleau and S. Y. Yue",
-  title =        "Locally Linear Embedding for dimensionality reduction
-                 in {QSAR}",
-  journal =      "J. Computer-Aided Molecular Design",
-  volume =       "18",
-  pages =        "475--482",
-  year =         "2004",
-}
-
-@Book{Li93,
-  author =       "Ming Li and Paul Vit{\'a}nyi",
-  title =        "An Introduction to {Kolmogorov} Complexity and Its
-                 Applications",
-  publisher =    "Springer",
-  address =      "New York, NY",
-  edition =      "Second",
-  year =         "1997",
-}
-
-@Article{li99face,
-  author =       "S. Z. Li and J. W. Lu",
-  title =        "Face recognition using the nearest feature line
-                 method",
-  journal =      "IEEE Transactions on Neural Networks",
-  volume =       "10",
-  number =       "2",
-  pages =        "439--443",
-  year =         "1999",
-  URL =          "citeseer.nj.nec.com/li99face.html",
-}
-
-@inproceedings{Li+al-2005,
-    author    = {Hongyu Li and Wenbin Chen and I-Fan Shen},
-    title     = {Supervised Local Tangent Space Alignment for Classification},
-    booktitle = {IJCAI},
-    year      = {2005},
-    pages     = {1620--1621},
-    ee        = {http://www.ijcai.org/papers/post-0505.pdf},
-    bibsource = {DBLP, http://dblp.uni-trier.de}
-}
-
-@inproceedings{Li+Guo-2006,
-    author = {Chun-Guang Li and Jun Guo},
-    title = {Supervised {Isomap} with Explicit Mapping},
-    booktitle = {First International Conference on Innovative Computing, Information and Control},
-    volume = {3},
-    year = {2006},
-    isbn = {0-7695-2616-0},
-    pages = {345--348},
-    doi = {10.1109/ICICIC.2006.530},
-    publisher = {IEEE Computer Society},
-    address = {Los Alamitos, CA, USA},
-}
-
-% The CV remark lives in annote, which standard styles do not typeset.
-@inproceedings{lischuurmans08a,
-author = "Li, Y. and Schuurmans, D.",
-title = "Policy iteration for learning an exercise policy for {American} options",
-booktitle = "Proceedings of the European Workshop on Reinforcement Learning (EWRL)",
-year = 2008,
-annote = "Acceptance rate 33\%; all authors from my research group"
-}
-
-% The CV remark lives in annote, which standard styles do not typeset.
-@inproceedings{lischuurmans08b,
-author = "Li, Y. and Schuurmans, D.",
-title = "Learning an exercise policy for {American} options on real data",
-booktitle = "Proceedings of the International Symposium on Financial Engineering and Risk Management (FERM)",
-year = 2008,
-annote = "All authors from my research group; unrefereed publication"
-}
-
-@inproceedings{Li+al-2007,
-    author    = {Jun-Bao Li and Shu-Chuan Chu and Jeng-Shyang Pan},
-    title     = {Locally Discriminant Projection with Kernels for Feature Extraction},
-    booktitle = {Proceedings of the Third International Conference on Advanced Data Mining and Applications},
-    editor    = {Reda Alhajj and Hong Gao and Xue Li and Jianzhong Li and Osmar R. Za{\"\i}ane},
-    publisher = {Springer},
-    year      = {2007},
-    pages     = {586--593},
-    ee        = {http://dx.doi.org/10.1007/978-3-540-73871-8_56},
-    bibsource = {DBLP, http://dblp.uni-trier.de}
-}
-
-@InCollection{Liang83,
-  author =       "F. M. Liang",
-  editor =       "D. E. Knuth",
-  booktitle =    "The \TeX Book",
-  title =        "Ph.{D}.\ Thesis",
-  publisher =    "Addison-Wesley",
-  address =      "Reading",
-  year =         "1986",
-}
-
-@inproceedings{LiangP2008,
- author = {Percy Liang and Michael I. Jordan},
- title = {An asymptotic analysis of generative, discriminative, and pseudolikelihood estimators},
- booktitle =    ICML08,
- editor =       ICML08ed,
- publisher =    ICML08publ,
- year = {2008},
- isbn = {978-1-60558-205-4},
- pages = {584--591},
- location = {Helsinki, Finland},
- doi = {10.1145/1390156.1390230},
- address = {New York, NY, USA},
-}
-
-@Article{Liberman67,
-  author =       "A. M. Liberman and F. S. Cooper and D. P. Shankweiler
-                 and M. Studdert-Kennedy",
-  title =        "Perception of the speech code",
-  journal =      "Psychological Review",
-  volume =       "74",
-  pages =        "431--461",
-  year =         "1967",
-}
-
-@Article{Lin+al-1991,
-  author =       "W.-M. Lin and V. K. Prasanna and K. W. Przytula",
-  title =        "Algorithmic mapping of neural network Models onto
-                 Parallel {SIMD} Machines",
-  journal =      "IEEE Transactions on Computers",
-  volume =       "40",
-  number =       "12",
-  publisher =    "IEEE Computer Society",
-  address =      "Los Alamitos, CA, USA",
-  pages =        "1390--1401",
-  year =         "1991",
-  ISSN =         "0018-9340",
-  doi =          "10.1109/12.106224",
-}
-
-@Article{Lin-2000,
-  author =       "Dekang Lin",
-  title =        "Word sense disambiguation with a similarity based
-                 smoothed library",
-  journal =      "Computers and the Humanities: special issue on
-                 {SENSEVAL}",
-  volume =       "34",
-  pages =        "147--152",
-  year =         "2000",
-}
-
-@InProceedings{Lin-99,
-  author =       "Dekang Lin",
-  booktitle =    "Proceedings of the Conference of the Pacific
-                 Association for Computational Linguistics",
-  title =        "A case-based algorithm for word sense disambiguation",
-  address =      "Waterloo, Canada",
-  year =         "1999",
-}
-
-@Article{Lin73,
-  author =       "S. Lin and B. W. Kernighan",
-  title =        "An Effective Heuristic Algorithm for the Travelling
-                 Salesman Problem",
-  journal =      opres,
-  volume =       "21",
-  pages =        "498--516",
-  year =         "1973",
-}
-
-@TechReport{Lin95,
-  author =       "T. Lin and B. G. Horne and P. Tino and C. L. Giles",
-  title =        "Learning long-term dependencies is not as difficult
-                 with {NARX} recurrent neural networks",
-  number =       "UMIACS-TR-95-78",
-  institution =  "Institute for Advanced Computer Studies, University of
-                 Maryland",
-  year =         "1995",
-}
-
-@InProceedings{Lin96,
-  author =       "C. Lin and S-C. Chang and K-J. Lin",
-  booktitle =    nipc-hmit96,
-  title =        "Simulation of the Balance of Plant of a Nuclear Power
-                 Plant by Neural Networks",
-  volume =       "1",
-  publisher =    ans,
-  pages =        "251--255",
-  year =         "1996",
-}
-
-@Article{Linde80,
-  author =       "Y. Linde and A. Buzo and R. M. Gray",
-  title =        "An algorithm for vector quantizer design",
-  journal =      "IEEE Transactions on Communications",
-  volume =       "COM-28",
-  number =       "1",
-  pages =        "84--95",
-  month =        jan,
-  year =         "1980",
-}
-
-@Article{Lindgren78,
-  author =       "G. Lindgren",
-  title =        "{Markov} Regime Models for Mixed Distributions and
-                 Switching Regressions",
-  journal =      "Scan. J. Statist.",
-  volume =       "5",
-  pages =        "81--91",
-  year =         "1978",
-}
-
-@Article{Linial93,
-  author =       "Nathan Linial and Yishay Mansour and Noam Nisan",
-  title =        "Constant depth circuits, {Fourier} transform, and
-                 learnability",
-  journal =      "J. ACM",
-  volume =       "40",
-  number =       "3",
-  publisher =    "ACM Press",
-  address =      "New York, NY, USA",
-  pages =        "607--620",
-  year =         "1993",
-}
-
-@Article{Linsker86,
-  author =       "R. Linsker",
-  title =        "From Basic Network Principles to Neural Architecture",
-  journal =      PNAS,
-  volume =       "83",
-  pages =        "7508--7512, 8390--8394, 8779--8783",
-  year =         "1986",
-}
-
-@Article{Linsker88,
-  author =       "R. Linsker",
-  title =        "Self-Organization in a Perceptual Network",
-  journal =      computer,
-  pages =        "105--117",
-  month =        mar,
-  year =         "1988",
-}
-
-@TechReport{liporace-76,
-  author =       "L. A. Liporace",
-  title =        "{PTAH} on Continuous Multivariate Functions of
-                 {Markov} Chains",
-  number =       "80193",
-  institution =  "Institute for Defense Analysis, Communication Research
-                 Department",
-  month =        feb,
-  year =         "1976",
-}
-
-@Article{Lippmann87,
-  author =       "R. P. Lippmann",
-  title =        "An Introduction to Computing with Neural Nets",
-  journal =      ieeeassp,
-  pages =        "4--22",
-  month =        apr,
-  year =         "1987",
-}
-
-@InProceedings{Lippmann87b,
-  author =       "R. P. Lippmann and B. Gold",
-  booktitle =    "IEEE Proc. First Intl. Conf. on Neural Networks",
-  title =        "Neural Classifiers Useful for Speech Recognition",
-  volume =       "IV",
-  address =      "San Diego, CA",
-  pages =        "417--422",
-  year =         "1987",
-}
-
-@Article{Lippmann89,
-  author =       "R. P. Lippmann",
-  title =        "Review of Neural Networks for Speech Recognition",
-  journal =      nc,
-  volume =       "1",
-  pages =        "1--38",
-  year =         "1989",
-}
-
-@InProceedings{Lister90,
-  author =       "R. Lister",
-  booktitle =    ijcnn,
-  title =        "Segment Reversal and the {TSP}",
-  volume =       "1",
-  publisher =    "Lawrence Erlbaum, Hillsdale",
-  address =      "Washington 1990",
-  pages =        "424--427",
-  year =         "1990",
-}
-
-@Article{Litkowski-2000,
-  author =       "K. Litkowski",
-  title =        "{SENSEVAL}: The {CL}-research experience",
-  journal =      "Computers and the Humanities: special issue on
-                 SENSEVAL",
-  volume =       "34",
-  pages =        "153--158",
-  year =         "2000",
-}
-
-@Book{Little+Rubin-2002,
-  author =       "R. J. A. Little and D. B. Rubin",
-  title =        "Statistical Analysis with Missing Data",
-  publisher =    "Wiley",
-  address =      "New York",
-  edition =      "Second",
-  year =         "2002",
-}
-
-@Book{Little-Rubin,
-  author =       "R. J. A. Little and D. B. Rubin",
-  title =        "Statistical Analysis with Missing Data",
-  publisher =    "Wiley",
-  address =      "New York",
-  year =         "1987",
-}
-
-@Article{Little74,
-  author =       "W. A. Little",
-  title =        "The Existence of Persistent States in the Brain",
-  journal =      mbio,
-  volume =       "19",
-  pages =        "101--120",
-  year =         "1974",
-}
-
-@Article{Little75,
-  author =       "W. A. Little and G. L. Shaw",
-  title =        "A Statistical Theory of Short and Long Term Memory",
-  journal =      behbio,
-  volume =       "14",
-  year =         "1975",
-}
-
-@Article{Little78,
-  author =       "W. A. Little and G. L. Shaw",
-  title =        "Analytic Study of the Memory Storage Capacity of a
-                 Neural Network",
-  journal =      mbio,
-  volume =       "39",
-  pages =        "281--290",
-  year =         "1978",
-}
-
-@Article{littlestone-warmuth94,
-  author =       "N. Littlestone and M. K. Warmuth",
-  title =        "The weighted majority algorithm",
-  journal =      "Information and Computation",
-  volume =       "108",
-  number =       "2",
-  pages =        "212--261",
-  year =         "1994",
-}
-
-@Misc{Littlestone86,
-  author =       "N. Littlestone and M. Warmuth",
-  title =        "Relating data compression and learnability",
-  year =         "1986",
-  note =         "Unpublished manuscript. University of California Santa
-                 Cruz. An extended version can be found in (Floyd and
-                 Warmuth 95)",
-}
-
-@InCollection{Liu2001,
-  author =       "J. S. Liu and R. Chen and T. Logvinenko",
-  editor =       "A. Doucet and N. de Freitas and N. Gordon",
-  booktitle =    "Sequential Monte Carlo Methods in Practice",
-  title =        "A theoretical framework for sequential importance
-                 sampling and resampling",
-  publisher =    "Springer-Verlag",
-  year =         "2001",
-}
-
-@Book{Ljung+Soderstrom83,
-  author =       "L. Ljung and T. S{\"o}derstr{\"o}m",
-  title =        "Theory and Practice of Recursive Identification",
-  publisher =    "MIT Press",
-  year =         "1983",
-}
-
-@Book{Ljung-86,
-  author =       "L. Ljung and T. S{\"o}derstr{\"o}m",
-  title =        "Theory and Practice of Recursive Identification",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  year =         "1986",
-}
-
-@article{LloydS1982,
-	author = {Stuart P. Lloyd},
-	journal = {IEEE Transactions on Information Theory},
-	month = mar,
-	number = {2},
-	pages = {129--137},
-	title = {Least squares quantization in {PCM}},
-	volume = {28},
-	year = {1982}
-}
-
-@Article{Loader96,
-  author =       "C. R. Loader",
-  title =        "Local likelihood density estimation",
-  journal =      "Annals of Statistics",
-  volume =       "24",
-  number =       "4",
-  pages =        "1602--1618",
-  year =         "1996",
-}
-
-@Article{Loftsgaarden+Quesenberry-65,
-  author =       "D. O. Loftsgaarden and C. P. Quesenberry",
-  title =        "A nonparametric estimate of a multivariate density
-                 function",
-  journal =      "Annals of Mathematical Statistics",
-  volume =       "36",
-  pages =        "1049--1051",
-  year =         "1965",
-}
-
-@InCollection{lognormal-A-85,
-  author =       "C. E. Antle",
-  booktitle =    "Encyclopedia of Statistical Sciences",
-  title =        "Lognormal Distribution",
-  volume =       "5",
-  publisher =    "John Wiley \& Sons",
-  pages =        "134--136",
-  year =         "1985",
-}
-
-@Article{Loh-Shih97,
-  author =       "Wei-Yin Loh and Yu-Shan Shih",
-  title =        "Split selection methods for classification trees",
-  journal =      "Statistica Sinica",
-  volume =       "7",
-  pages =        "815--840",
-  year =         "1997",
-}
-
-@incollection{loosli-canu-bottou-2006,
-  author = {Loosli, Ga{\"e}lle and Canu, St{\'e}phane and Bottou, L{\'e}on},
-  title = {Training Invariant Support Vector Machines using Selective Sampling},
-  pages = {301--320},
-  editor = {Bottou, L{\'e}on and Chapelle, Olivier and {DeCoste}, Dennis and Weston, Jason},
-  booktitle = {Large Scale Kernel Machines},
-  publisher = {MIT Press},
-  address = {Cambridge, MA},
-  year = {2007},
-  url = {http://leon.bottou.org/papers/loosli-canu-bottou-2006},
-}
-
-@Article{Lowe04,
-  author =       "D. G. Lowe",
-  title =        "Distinctive Image Features from Scale-Invariant
-                 Keypoints",
-  journal =      "International Journal of Computer Vision",
-  volume =       "60",
-  number =       "2",
-  pages =        "91--110",
-  year =         "2004",
-}
-
-@Article{Lowe95,
-  author =       "D. G. Lowe",
-  title =        "Similarity metric learning for a variable-kernel
-                 classifier",
-  journal =      "Neural Computation",
-  volume =       "7",
-  number =       "1",
-  pages =        "72--85",
-  year =         "1995",
-}
-
-@InProceedings{lu04,
-  author =       "Wen-Cong Lu and Nian-Yi Chen and Guo-Zheng Li and Jie
-                 Yang",
-  editor =       "Per Svensson and Johan Schubert",
-  booktitle =    "Proceedings of the Seventh International Conference on
-                 Information Fusion",
-  title =        "Multitask learning using partial least square method",
-  volume =       "I",
-  publisher =    "International Society of Information Fusion",
-  address =      "Mountain View, CA",
-  pages =        "79--84",
-  month =        jun,
-  year =         "2004",
-  location =     "Stockholm, Sweden",
-}
-
-@Book{Lue84,
-  author =       "D. G. Luenberger",
-  title =        "Linear and Nonlinear Programming",
-  publisher =    "Addison Wesley",
-  year =         "1984",
-}
-
-@Book{Luenberger86,
-  author =       "D. G. Luenberger",
-  title =        "Linear and Nonlinear Programming",
-  publisher =    "Addison-Wesley",
-  address =      "Reading",
-  year =         "1986",
-}
-
-@InProceedings{Lyu09,
-  author =       "Siwei Lyu",
-  booktitle =    "The proceedings of the 25th Conference on Uncertainty in Artificial Intelligence",
-  title =        "Interpretation and Generalization of Score Matching",
-  year =         "2009",
-}
-
-@Book{Ma85,
-  author =       "S.-K. Ma",
-  title =        "Statistical Mechanics",
-  publisher =    "World Scientific",
-  address =      "Philadelphia",
-  year =         "1985",
-}
-
-@InProceedings{Ma09,
- author = {Justin Ma and Lawrence K. Saul and Stefan Savage and Geoffrey M. Voelker},
- title = {Identifying Suspicious URLs: An Application of Large-Scale Online Learning},
- booktitle = {Proceedings of the International Conference on Machine Learning},
- year = {2009},
- pages = {681--688},
- location = {Montreal, Canada},
-}
-
-@Misc{MacKay+Neal94,
-  author =       "D. MacKay and R. Neal",
-  title =        "Automatic Relevance Determination",
-  year =         "1994",
-  note =         "Unpublished report. See also MacKay D., 1995, Probable
-                 Networks and Plausible Predictions -- A Review of
-                 Practical {Bayesian} Methods for Supervised Neural
-                 Networks, in {\em Network: Computation in Neural
-                 Systems}, v. 6, pp. 469--505",
-}
-
-@Book{MacKay03,
-  author =       "David MacKay",
-  title =        "Information Theory, Inference and Learning
-                 Algorithms",
-  publisher =    "Cambridge University Press",
-  year =         "2003",
-}
-
-@Misc{MacKay2001,
-  author =       "David MacKay",
-  title =        "Failures of the One-Step Learning Algorithm",
-  year =         "2001",
-  note =         "Unpublished report",
-}
-
-@Article{MacKay90,
-  author =       "D. J. C. MacKay and K. D. Miller",
-  title =        "Analysis of Linsker's Simulation of Hebbian Rules",
-  journal =      nc,
-  volume =       "2",
-  pages =        "173--187",
-  year =         "1990",
-}
-
-@PhdThesis{MacKay91,
-  author =       "D. J. C. MacKay",
-  title =        "Bayesian methods for adaptive models",
-  school =       "California Institute of Technology",
-  year =         "1991",
-}
-
-@Article{MacKay92a,
-  author =       "David J. C. MacKay",
-  title =        "Bayesian interpolation",
-  journal =      "Neural Computation",
-  volume =       "4",
-  number =       "3",
-  pages =        "415--447",
-  year =         "1992",
-}
-
-@Article{MacKay92b,
-  author =       "D. J. C. MacKay",
-  title =        "The evidence framework applied to classification
-                 networks",
-  journal =      "Neural Computation",
-  volume =       "4",
-  number =       "5",
-  pages =        "698--714",
-  year =         "1992",
-}
-
-@Article{MacKay92c,
-  author =       "David J. C. MacKay",
-  title =        "A practical {Bayesian} framework for backpropagation
-                 networks",
-  journal =      "Neural Computation",
-  volume =       "4",
-  number =       "3",
-  pages =        "448--472",
-  year =         "1992",
-}
-
-@Article{MacKay98,
-  author =       "D. J. C. MacKay and R. J. McEliece and J.-F. Cheng",
-  title =        "Turbo-decoding as an instance of {Pearl's} belief
-                 propagation algorithm",
-  journal =      "IEEE Journal on Selected Areas in Communications",
-  year =         "1998",
-  note =         "In press",
-}
-
-@TechReport{MacKay98b,
-  author =       "D. J. C. MacKay",
-  title =        "Introduction to {G}aussian Processes",
-  institution =  "Cambridge University",
-  year =         "1998",
-  URL =          "http://wol.ra.phy.cam.ac.uk/mackay/gpB.pdf",
-}
-
-@Article{Mackey77,
-  author =       "M. C. Mackey and L. Glass",
-  title =        "Oscillation and Chaos in Physiological Control
-                 Systems",
-  journal =      science,
-  volume =       "197",
-  pages =        "287",
-  year =         "1977",
-}
-
-@InProceedings{Maclin-iwml91,
-  author =       "R. Maclin and J. W. Shavlik",
-  editor =       "L. Birnbaum and G. Collins",
-  booktitle =    "Machine Learning: Proceedings of the Eighth
-                 International Workshop",
-  title =        "Refining Domain Theories Expressed as Finite-State
-                 Automata",
-  publisher =    "Morgan Kaufmann",
-  address =      "San Mateo CA",
-  year =         "1991",
-}
-
-@Article{Maclin-ml,
-  author =       "R. Maclin and J. W. Shavlik",
-  title =        "Using Knowledge-Based Neural Networks to Improve
-                 Algorithms: Refining the {Chou-Fasman} Algorithm for
-                 Protein Folding",
-  journal =      mlearn,
-  volume =       "11",
-  pages =        "195--215",
-  year =         "1993",
-}
-
-@InProceedings{MacQueen67,
-  author =       "James B. MacQueen",
-  booktitle =    "Proceedings of the Fifth Berkeley Symposium on
-                 Mathematical Statistics and Probability",
-  title =        "Some Methods for Classification and Analysis of
-                 Multivariate Observations",
-  volume =       "1",
-  pages =        "281--296",
-  year =         "1967",
-}
-
-@Article{Mahapatra+al-1997,
-  author =       "S. Mahapatra and R. N. Mahapatra and B. N. Chatterji",
-  title =        "A parallel formulation of back-propagation learning on
-                 distributed memory multiprocessors",
-  journal =      "Parallel Computing",
-  volume =       "22",
-  number =       "12",
-  publisher =    "Elsevier Science Publishers",
-  address =      "Amsterdam, The Netherlands",
-  pages =        "1661--1675",
-  year =         "1997",
-  ISSN =         "0167-8191",
-  doi =          "10.1016/S0167-8191(96)00051-8",
-}
-
-@incollection{Mairal-2009,
- title = {Supervised Dictionary Learning},
- author = {Julien Mairal and Francis Bach and Jean Ponce and Guillermo Sapiro and Andrew Zisserman},
- booktitle = NIPS21,
- editor = NIPS21ed,
- pages = {1033--1040},
- publisher = {NIPS Foundation},
- year = {2009}
-}
-@book{Maimon+Rokach-2005,
-    author = {Maimon, O. and Rokach, L.},
-    howpublished = {Hardcover},
-    isbn = {0387244352},
-    month = sep,
-    publisher = {Springer},
-    title = {Data Mining and Knowledge Discovery Handbook},
-    year = {2005}
-}
-
-@InProceedings{Makram-Ebeid89,
-  author =       "S. Makram-Ebeid and J.-A. Sirat and J.-R. Viala",
-  booktitle =    ijcnn,
-  title =        "A Rationalized Back-Propagation Learning Algorithm",
-  volume =       "2",
-  publisher =    "IEEE, New York",
-  address =      "Washington 1989",
-  pages =        "373--380",
-  year =         "1989",
-}
-
-@Article{mallat93matching,
-  author =       "S. Mallat and Z. Zhang",
-  title =        "Matching pursuit with time-frequency dictionaries",
-  journal =      "IEEE Trans. Signal Proc.",
-  volume =       "41",
-  number =       "12",
-  pages =        "3397--3415",
-  month =        dec,
-  year =         "1993",
-}
-
-@InProceedings{malouf2002conll,
-  author =       "Robert Malouf",
-  booktitle =    "Proceedings of CoNLL-2002",
-  title =        "A comparison of algorithms for maximum entropy
-                 parameter estimation",
-  address =      "Taipei, Taiwan",
-  pages =        "49--55",
-  year =         "2002",
-  editor =       "Dan Roth and Antal van den Bosch",
-}
-
-@Book{Mandelbrot82,
-  author =       "B. B. Mandelbrot",
-  title =        "The Fractal Geometry of Nature",
-  publisher =    "Freeman",
-  address =      "San Francisco",
-  year =         "1982",
-}
-
-@Book{Manning+Schutze99,
-  author =       "Christopher Manning and Hinrich Sch{\"u}tze",
-  title =        "Foundations of Statistical Natural Language
-                 Processing",
-  publisher =    "MIT Press",
-  year =         "1999",
-}
-
-@InProceedings{Mantysalo92firenze,
-  author =       "Jyri M{\"{a}}ntysalo and Kari Torkkola and Teuvo
-                 Kohonen",
-  booktitle =    "Proc. of the Second Workshop on Neural Networks for
-                 Speech Processing",
-  title =        "Experiments on the use of {LVQ} in phoneme-level
-                 segmentation of speech",
-  publisher =    "LINT",
-  address =      "Firenze (Italy)",
-  year =         "1992",
-}
-
-@article{Marcelja-1980,
-    author = {Mar{\v{c}}elja, S.},
-    journal = {Journal of the Optical Society of America},
-    month = nov,
-    number = {11},
-    pages = {1297--1300},
-    title = {Mathematical description of the responses of simple cortical cells},
-    url = {http://view.ncbi.nlm.nih.gov/pubmed/7463179},
-    volume = {70},
-    year = {1980}
-}
-
-@Article{Marchand90,
-  author =       "M. Marchand and M. Golea and P. Ruj{\'a}n",
-  title =        "A Convergence Theorem for Sequential Learning in
-                 Two-Layer Perceptrons",
-  journal =      eul,
-  volume =       "11",
-  pages =        "487--492",
-  year =         "1990",
-}
-
-@Article{Marcotte-92,
-  author =       "P. Marcotte and G. Savard",
-  title =        "Novel approaches to the discrimination problem",
-  journal =      "Zeitschrift f{\"u}r Operations Research (Theory)",
-  volume =       "36",
-  pages =        "517--545",
-  year =         "1992",
-}
-
-@Article{Marcus91,
-  author =       "C. M. Marcus and F. R. Waugh and R. M. Westervelt",
-  title =        "Nonlinear Dynamics and Stability of Analog Neural
-                 Networks",
-  journal =      "Physica D",
-  volume =       "51",
-  pages =        "234--247",
-  year =         "1991",
-  note =         "(special issue)",
-}
-
-@Article{Marcus-et-al91,
-  author =       "C. M. Marcus and F. R. Waugh and R. M. Westervelt",
-  title =        "Nonlinear Dynamics and Stability of Analog Neural
-                 Networks",
-  journal =      physicaD,
-  volume =       "51",
-  pages =        "234--247",
-  year =         "1991",
-  note =         "(special issue)",
-}
-
-@Article{Markov13,
-  author =       "A. A. Markov",
-  title =        "An example of statistical investigation in the text of
-                 `Eugene Onyegin' illustrating coupling of `tests' in
-                 chains",
-  journal =      "Proceedings of the Academy of Science, St.
-                 Petersburg",
-  volume =       "7",
-  pages =        "153--162",
-  year =         "1913",
-}
-
-@Article{Markovitz-52,
-  author =       "H. M. Markowitz",
-  title =        "Portfolio Selection",
-  journal =      "Journal of Finance",
-  volume =       "7",
-  number =       "1",
-  pages =        "77--91",
-  year =         "1952",
-}
-
-@InProceedings{maron98,
-  author =       "Oded Maron and Tom{\'a}s Lozano-P{\'e}rez",
-  editor =       NIPS10ed,
-  booktitle =    NIPS10,
-  title =        "A Framework for Multiple-Instance Learning",
-  volume =       "10",
-  publisher =    "{MIT} Press",
-  year =         "1998",
-}
-
-@Article{Marquardt63,
-  author =       "D. W. Marquardt",
-  title =        "An algorithm for least-squares estimation of
-                 non-linear parameters",
-  journal =      "Journal of the Society of Industrial and Applied
-                 Mathematics",
-  volume =       "11",
-  number =       "2",
-  pages =        "431--441",
-  year =         "1963",
-}
-
-@Article{Marr69,
-  author =       "D. Marr",
-  title =        "A Theory of Cerebellar Cortex",
-  journal =      jphysiol,
-  volume =       "202",
-  pages =        "437--470",
-  year =         "1969",
-}
-
-@Article{Marr70,
-  author =       "D. Marr",
-  title =        "A Theory for Cerebral Neocortex",
-  journal =      PRSLB,
-  volume =       "176",
-  pages =        "161--234",
-  year =         "1970",
-}
-
-@Article{Marr71,
-  author =       "D. Marr",
-  title =        "Simple Memory: {A} Theory for Archicortex",
-  journal =      PTRSL,
-  volume =       "262",
-  pages =        "23--81",
-  year =         "1971",
-}
-
-@Article{Marr76,
-  author =       "D. Marr and T. Poggio",
-  title =        "Cooperative Computation of Stereo Disparity",
-  journal =      science,
-  volume =       "194",
-  year =         "1976",
-}
-
-@Book{Marr82,
-  author =       "D. Marr",
-  title =        "Vision",
-  publisher =    "Freeman",
-  address =      "San Francisco",
-  year =         "1982",
-}
-
-@Article{Martin91,
-  author =       "G. L. Martin and J. A. Pittman",
-  title =        "Recognizing hand-printed letters and digits using
-                 backpropagation learning",
-  journal =      nc,
-  volume =       "3",
-  number =       "2",
-  pages =        "258--267",
-  year =         "1991",
-}
-
-@Article{Mashouk+Reed91,
-  author =       "K. A. Al-Mashouq and I. S. Reed",
-  title =        "Including Hints in Training Neural Nets",
-  journal =      nc,
-  volume =       "3",
-  number =       "4",
-  pages =        "418",
-  year =         "1991",
-}
-
-@InProceedings{Mason98,
-  author =       "L. Mason and P. Bartlett and J. Baxter",
-  editor =       NIPS12ed,
-  booktitle =    NIPS12,
-  title =        "Direct Optimization of Margins Improves Generalization
-                 in Combined Classifiers",
-  year =         "1999",
-}
-
-@InProceedings{Mason99,
-  author =       "L. Mason and J. Baxter and P. Bartlett and M. Frean",
-  editor =       NIPS12ed,
-  booktitle =    NIPS12,
-  title =        "Boosting Algorithms as Gradient Descent",
-  publisher =    "MIT Press",
-  pages =        "512--518",
-  year =         "2000",
-}
-
-@InProceedings{Matan92,
-  author =       "O. Matan and C. J. C. Burges and Y. {LeCun} and J. S.
-                 Denker",
-  editor =       NIPS4ed,
-  booktitle =    NIPS4,
-  title =        "Multi-Digit Recognition Using a Space Displacement
-                 Neural Network",
-  publisher =    "Morgan Kaufmann",
-  address =      "San Mateo CA",
-  pages =        "488--495",
-  year =         "1992",
-}
-
-@InProceedings{matic-92a,
-  author =       "N. Mati{\'c} and I. Guyon and L. Bottou and J. Denker
-                 and V. Vapnik",
-  booktitle =    "11th International Conference on Pattern Recognition",
-  title =        "Computer Aided Cleaning of Large Databases for
-                 Character Recognition",
-  volume =       "II",
-  pages =        "330--333",
-  year =         "1992",
-}
-
-@Misc{matrix-cookbook,
-  author =       "K. B. Petersen and M. S. Pedersen",
-  title =        "The Matrix Cookbook",
-  howpublished = "Technical University of Denmark",
-  month =        feb,
-  year =         "2006",
-  note =         "Version 20051003",
-  abstract =     "Matrix identities, relations and approximations. A
-                 desktop reference for quick overview of mathematics of
-                 matrices.",
-  keywords =     "Matrix identity, matrix relations, inverse, matrix
-                 derivative",
-}
-
-@Article{Mattis76,
-  author =       "D. Mattis",
-  title =        "Solvable Spin Systems with Random Interactions",
-  journal =      plettA,
-  volume =       "56",
-  pages =        "421--422",
-  year =         "1976",
-}
-
-@Article{MaxEnt96,
-  author =       "Adam L. Berger and Vincent J. {Della Pietra} and Stephen A. {Della
-                 Pietra}",
-  title =        "A maximum entropy approach to natural language
-                 processing",
-  journal =      "Computational Linguistics",
-  volume =       "22",
-  pages =        "39--71",
-  year =         "1996",
-}
-
-@Article{Mayraz+Hinton-2002,
-  author =       "G. Mayraz and G. E. Hinton",
-  title =        "Recognizing handwritten digits using hierarchical
-                 products of experts",
-  journal =      "IEEE Transactions on Pattern Analysis and Machine
-                 Intelligence",
-  volume =       "24",
-  pages =        "189--197",
-  year =         "2002",
-}
-
-@InProceedings{Mazaika87,
-  author =       "P. K. Mazaika",
-  editor =       "M. Caudill and C. Butler",
-  booktitle =    icnn,
-  title =        "A Mathematical Model of the {Boltzmann} Machine",
-  volume =       "3",
-  publisher =    "IEEE, New York",
-  address =      "San Diego 1987",
-  pages =        "157--163",
-  year =         "1987",
-}
-
-@InProceedings{mbbf-bagd-00,
-  author =       "L. Mason and J. Baxter and P. L. Bartlett and M.
-                 Frean",
-  editor =       NIPS12ed,
-  booktitle =    NIPS12,
-  title =        "Boosting algorithms as gradient descent",
-  pages =        "512--518",
-  year =         "2000",
-}
-
-@InProceedings{McCallum+Nigam-1998,
-  author =       "A. {McCallum} and K. Nigam",
-  booktitle =    ICML08,
-  editor =       ICML08ed,
-  publisher =    ICML08publ,
-  title =        "Employing {EM} and pool-based active learning for text
-                 classification",
-  year =         "1998",
-}
-
-@InProceedings{McCallumA2006,
-  author =       "Andrew McCallum and Chris Pal and Gregory Druck and
-                 Xuerui Wang",
-  booktitle =    "Twenty-first National Conference on Artificial
-                 Intelligence (AAAI-06)",
-  title =        "Multi-Conditional Learning: Generative/Discriminative
-                 Training for Clustering and Classification",
-  publisher =    "AAAI Press",
-  year =         "2006",
-  OPTbibsource = "DBLP, http://dblp.uni-trier.de",
-  OPTcrossref =  "DBLP:conf/aaai/2006",
-}
-
-@article{McClelland+Rumelhart-81,
- author = {James L. {McClelland} and David E. Rumelhart},
- title = {An interactive activation model of context effects in letter perception},
- journal = {Psychological Review},
- volume = 88,
- pages = {375--407},
- year = 1981,
-}
-
-@Book{McClelland86a,
-  author =       "James L. McClelland and David E. Rumelhart and the PDP
-                 Research Group",
-  title =        "Parallel Distributed Processing: Explorations in the
-                 Microstructure of Cognition",
-  volume =       "2",
-  publisher =    "MIT Press",
-  address =      "Cambridge",
-  year =         "1986",
-}
-
-@InCollection{McClelland86b,
-  author =       "J. L. McClelland and J. L. Elman",
-  editor =       "J. L. McClelland and D. E. Rumelhart",
-  booktitle =    pdp,
-  title =        "Interactive Processes in Speech Perception: The
-                 {TRACE} Model",
-  chapter =      "15",
-  volume =       "2",
-  publisher =    "MIT Press",
-  address =      "Cambridge",
-  pages =        "58--121",
-  year =         "1986",
-}
-
-@Book{McClelland88,
-  author =       "J. L. McClelland and D. E. Rumelhart",
-  title =        "Explorations in Parallel Distributed Processing",
-  publisher =    "MIT Press",
-  address =      "Cambridge",
-  year =         "1988",
-}
-
-@Article{McCulloch43,
-  author =       "W. S. McCulloch and W. Pitts",
-  title =        "A Logical Calculus of Ideas Immanent in Nervous
-                 Activity",
-  journal =      bmbiophys,
-  volume =       "5",
-  pages =        "115--133",
-  year =         "1943",
-}
-
-@InProceedings{Mcdermott89,
-  author =       "E. McDermott and S. Katagiri",
-  booktitle =    icassp,
-  title =        "Shift-Invariant, Multi-Category Phoneme Recognition
-                 using {Kohonen's} {LVQ2}",
-  volume =       "1",
-  organization = "IEEE",
-  address =      "Glasgow, Scotland",
-  pages =        "81--84",
-  year =         "1989",
-}
-
-@Article{Mcdermott91,
-  author =       "E. McDermott and S. Katagiri",
-  title =        "{LVQ}-based shift-tolerant phoneme recognition",
-  journal =      "IEEE Transactions on Signal Processing",
-  volume =       "39",
-  number =       "6",
-  pages =        "1398--1411",
-  year =         "1991",
-  OPTmonth =     "June",
-}
-
-@Article{McEliece87,
-  author =       "R. J. McEliece and E. C. Posner and E. R. Rodemich and
-                 S. S. Venkatesh",
-  title =        "The Capacity of the Hopfield Associative Memory",
-  journal =      ieeeit,
-  volume =       "33",
-  pages =        "461--482",
-  year =         "1987",
-}
-
-@InProceedings{McInerny89,
-  author =       "J. M. McInerny and K. G. Haines and S. Biafore and R.
-                 Hecht-Nielsen",
-  booktitle =    ijcnn,
-  title =        "Back Propagation Error Surfaces Can Have Local
-                 Minima",
-  volume =       "2",
-  publisher =    "IEEE, New York",
-  address =      "Washington 1989",
-  pages =        "627",
-  year =         "1989",
-}
-
-@Book{McLachlan2000,
-  author =       "G. J. McLachlan and D. Peel",
-  title =        "Finite Mixture Models",
-  publisher =    "Wiley",
-  address =      "New York",
-  year =         "2000",
-}
-
-@Book{McLachlan88,
-  author =       "G. J. McLachlan and K. E. Basford",
-  title =        "Mixture models: Inference and applications to
-                 clustering.",
-  publisher =    "Marcel Dekker",
-  year =         "1988",
-}
-
-@book{Mclachlan-2004,
-    author = {Geoffrey J. McLachlan},
-    howpublished = {Paperback},
-    isbn = {0471691151},
-    month = aug,
-    publisher = {Wiley-Interscience},
-    title = {Discriminant Analysis and Statistical Pattern Recognition},
-    year = {2004}
-}
-
-@Article{McLoone+Irwin-1997,
-  author =       "S. McLoone and G. W. Irwin",
-  title =        "Fast Parallel Off-Line Training of Multilayer
-                 Perceptrons",
-  journal =      "IEEE Transactions on Neural Networks",
-  volume =       "8",
-  number =       "3",
-  pages =        "646--653",
-  year =         "1997",
-}
-
-@Book{Mead89,
-  author =       "C. Mead",
-  title =        "Analog {VLSI} and Neural Systems",
-  publisher =    "Addison Wesley",
-  address =      "Reading",
-  year =         "1989",
-}
-
-@InProceedings{Meila96,
-  author =       "M. Meila and M. I. Jordan",
-  editor =       NIPS8ed,
-  booktitle =    NIPS8,
-  title =        "Learning fine motion by {Markov} mixtures of experts",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  year =         "1996",
-}
-
-@InProceedings{Mel+Koch90,
-  author =       "Bartlett W. Mel and Christof Koch",
-  editor =       NIPS2ed,
-  booktitle =    NIPS2,
-  title =        "{Sigma}-{Pi} Learning: On Radial Basis Functions and
-                 Cortical Associative Learning",
-  publisher =    "Morgan Kaufmann",
-  pages =        "474--481",
-  year =         "1990",
-}
-
-@InProceedings{Melvilleetal,
-  author =       "P. Melville and R. J. Mooney and R. Nagarajan",
-  booktitle =    "Proceedings of the ACM SIGIR Workshop on Recommender
-                 Systems",
-  title =        "Content-boosted collaborative filtering",
-  month =        sep,
-  year =         "2001",
-  keywords =     "boosted collaborative filtering content",
-  location =     "New Orleans, LA",
-}
-
-@InProceedings{Memisevic+Hinton-2007,
-  author =       "Roland Memisevic and Geoffrey E. Hinton",
-  booktitle =    cvpr07,
-  title =        "Unsupervised learning of image transformations",
-  year =         "2007",
-}
-
-@PhdThesis{Memisevic-thesis,
-  author =       "Roland Memisevic",
-  title =        "Non-linear latent factor models for revealing
-                 structure in high-dimensional data",
-  school =       "Department of Computer Science, University of
-                 Toronto",
-  address =      "Toronto, Ontario, Canada",
-  year =         "2007",
-}
-
-@Book{Mendelson97,
-  author =       "E. Mendelson",
-  title =        "Introduction to Mathematical Logic",
-  edition =      "Fourth",
-  publisher =    "Chapman \& Hall",
-  year =         "1997",
-}
-
-@InProceedings{Merkel-1994,
-  author =       "Magnus Merkel and Bernt Nilsson and Lars Ahrenberg",
-  booktitle =    "Proceedings of the 4th Workshop on Very Large
-                 Corpora",
-  title =        "A Phrase-Retrieval System Based on Recurrence",
-  address =      "Tokyo, Japan",
-  year =         "1994",
-}
-
-@InProceedings{Merkel-2000,
-  author =       "Magnus Merkel and Mikael Andersson",
-  booktitle =    "Proceedings of RIAO'2000",
-  title =        "Knowledge-lite extraction of multi-word units with
-                 language filters and entropy thresholds",
-  volume =       "1",
-  pages =        "737--746",
-  year =         "2000",
-}
-
-@InProceedings{Merlo86,
-  author =       "E. Merlo and R. De Mori and G. Mercier and M.
-                 Palakal",
-  booktitle =    icassp,
-  title =        "A continuous parameter and frequency domain based
-                 {Markov} model",
-  pages =        "1597--1600",
-  year =         "1986",
-}
-
-@article{Merzenich-2000,
-    title = {Seeing in the Sound Zone},
-    author = {M. Merzenich},
-    journal = {Nature},
-    pages = {820--821},
-    volume = {404},
-    year = {2000},
-}
-
-@Article{Metropolis53,
-  author =       "N. Metropolis and A. W. Rosenbluth and M. N.
-                 Rosenbluth and A. H. Teller and E. Teller",
-  title =        "Equation of State Calculations for Fast Computing
-                 Machines",
-  journal =      jcp,
-  volume =       "21",
-  pages =        "1087--1092",
-  year =         "1953",
-}
-
-@Article{Mezard85,
-  author =       "M. M\'ezard and G. Parisi",
-  title =        "Replicas and Optimization",
-  journal =      jppl,
-  volume =       "46",
-  pages =        "771--778",
-  year =         "1985",
-}
-
-@Article{Mezard86,
-  author =       "M. M\'ezard and G. Parisi",
-  title =        "A Replica Analysis of the Travelling Salesman
-                 Problem",
-  journal =      jpp,
-  volume =       "47",
-  pages =        "1285--1296",
-  year =         "1986",
-}
-
-@Book{Mezard87,
-  author =       "M. M\'ezard and G. Parisi and M. A. Virasoro",
-  title =        "Spin Glass Theory and Beyond",
-  publisher =    "World Scientific",
-  address =      "Singapore",
-  year =         "1987",
-}
-
-@Article{Mezard88,
-  author =       "M. M\'ezard and G. Parisi",
-  title =        "The Euclidean Matching Problem",
-  journal =      jpp,
-  volume =       "49",
-  pages =        "2019--2025",
-  year =         "1988",
-}
-
-@Article{Mezard89,
-  author =       "M. M\'ezard and J.-P. Nadal",
-  title =        "Learning in Feedforward Layered Networks: The Tiling
-                 Algorithm",
-  journal =      jpa,
-  volume =       "22",
-  pages =        "2191--2204",
-  year =         "1989",
-}
-
-@Article{Micchelli-1986,
-  author =       "C. A. Micchelli",
-  title =        "Interpolation of scattered data: distance matrices and
-                 conditionally positive definite functions",
-  journal =      "Constructive Approximation",
-  volume =       "2",
-  pages =        "11--22",
-  year =         "1986",
-}
-
-@InProceedings{micchelli05,
-  author =       "Charles A. Micchelli and Massimiliano Pontil",
-  editor =       NIPS17ed,
-  booktitle =    NIPS17,
-  title =        "Kernels for Multi-task Learning",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  pages =        "921--928",
-  year =         "2005",
-}
-
-@InProceedings{Mihalcea2002,
-  author =       "Rada Mihalcea",
-  booktitle =    "Proceedings of the 6th Conference on Natural Language
-                 Learning",
-  title =        "Instance Based Learning with Automatic Feature
-                 Selection Applied to Word Sense Disambiguation",
-  year =         "2002",
-  URL =          "http://citeseer.nj.nec.com/587173.html",
-}
-
-@Article{Miikkulainen91,
-  author =       "R. Miikkulainen and M. G. Dyer",
-  title =        "Natural language processing with modular {PDP}
-                 networks and distributed lexicon",
-  journal =      "Cognitive Science",
-  volume =       "15",
-  pages =        "343--399",
-  year =         "1991",
-}
-
-@Article{Miller+Sachs83,
-  author =       "M. M. Miller and M. B. Sachs",
-  title =        "Representation of stop consonants in the discharge
-                 patterns of auditory nerve fibers",
-  journal =      jasa,
-  volume =       "74",
-  number =       "2",
-  pages =        "502--517",
-  year =         "1983",
-}
-
-@PhdThesis{miller02,
-  author =       "Erik G. Miller",
-  title =        "Learning from one example in machine vision by sharing
-                 probability densities",
-  school =       "Massachusetts Institute of Technology",
-  year =         "2002",
-}
-
-@PhdThesis{miller02one,
-  author =       "Erik Miller",
-  title =        "Learning from one example in machine vision by sharing
-                 probability densities",
-  school =       "Massachusetts Institute of Technology, Department of
-                 Electrical Engineering and Computer Science",
-  year =         "2002",
-}
-
-@InProceedings{Miller89,
-  author =       "G. F. Miller and P. M. Todd and S. U. Hegde",
-  editor =       "J. D. Schaffer",
-  booktitle =    "Proceedings of the Third International Conference on
-                 Genetic Algorithms",
-  title =        "Designing Neural Networks Using Genetic Algorithms",
-  publisher =    "Morgan Kaufmann, San Mateo",
-  address =      "Arlington 1989",
-  pages =        "379--384",
-  year =         "1989",
-}
-
-@Article{MillerD1996,
-  author =       "David Miller and Kenneth Rose",
-  title =        "Hierarchical, unsupervised learning with growing via
-                 phase transitions",
-  journal =      "Neural Computation",
-  volume =       "8",
-  number =       "2",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA, USA",
-  pages =        "425--450",
-  year =         "1996",
-  ISSN =         "0899-7667",
-}
-
-@Article{Miller-ijprai93,
-  author =       "C. B. Miller and C. L. Giles",
-  title =        "Experimental Comparison of the Effect of Order in
-                 Recurrent Neural Networks",
-  journal =      "Int. Journal of Pattern Recognition and Artificial
-                 Intelligence",
-  pages =        "205--228",
-  year =         "1993",
-  note =         "Special Issue on Applications of Neural Networks to
-                 Pattern Recognition (I. Guyon Ed.)",
-}
-
-@Book{Minc-88,
-  author =       "H. Minc",
-  title =        "Nonnegative Matrices",
-  publisher =    "John Wiley \& Sons",
-  address =      "New York",
-  year =         "1988",
-}
-
-@Book{Minsky67,
-  author =       "M. L. Minsky",
-  title =        "Computation: Finite and Infinite Machines",
-  publisher =    "Prentice-Hall",
-  address =      "Englewood Cliffs",
-  year =         "1967",
-}
-
-@Book{Minsky69,
-  author =       "M. L. Minsky and S. A. Papert",
-  title =        "Perceptrons",
-  publisher =    "MIT Press",
-  address =      "Cambridge",
-  year =         "1969",
-}
-
-@Article{Misra-1997,
-  author =       "Manavendra Misra",
-  title =        "Parallel Environments for Implementing Neural
-                 Networks",
-  journal =      "Neural Computing Surveys",
-  volume =       "1",
-  pages =        "48--60",
-  year =         "1997",
-}
-
-@Article{Mitchison89,
-  author =       "G. J. Mitchison and R. M. Durbin",
-  title =        "Bounds on the Learning Capacity of Some Multi-Layer
-                 Networks",
-  journal =      biocyb,
-  volume =       "60",
-  pages =        "345--356",
-  year =         "1989",
-}
-
-@Article{ML:Bauer:boost,
-  author =       "Eric Bauer and Ron Kohavi",
-  title =        "An empirical comparison of voting classification
-                 algorithms: Bagging, Boosting, and variants",
-  journal =      "Machine Learning",
-  year =         "1998",
-}
-
-@Article{ML:Breiman:bagging,
-  author =       "Leo Breiman",
-  title =        "Bagging Predictors",
-  journal =      "Machine Learning",
-  volume =       "24",
-  number =       "2",
-  pages =        "123--140",
-  year =         "1996",
-}
-
-@Article{ML:Dietterich:adaboost+noise,
-  author =       "Thomas G. Dietterich",
-  title =        "An experimental comparison of three methods for
-                 constructing ensembles of decision trees: Bagging,
-                 Boosting, and randomization",
-  journal =      "submitted to Machine Learning",
-  year =         "1998",
-  note =         "\\available at {\tt
-                 ftp://ftp.cs.orst.edu/pub/tgd/papers/tr-randomized-c4.ps.gz}",
-}
-
-@Article{ML:Schapire:weaklearn,
-  author =       "Robert E. Schapire",
-  title =        "The strength of weak learnability",
-  journal =      "Machine Learning",
-  volume =       "5",
-  number =       "2",
-  pages =        "197--227",
-  year =         "1990",
-}
-
-@Misc{MLJ-model-selection-combination-2001,
-  author =       "Y. Bengio and D. Schuurmans",
-  title =        "Special Issue on New methods for model selection and
-                 model combination",
-  year =         "2002",
-  note =         "{\em Machine Learning}, 48(1)",
-}
-
-@InProceedings{Mnih+Hinton-2007,
-  author =       "Andriy Mnih and Geoffrey E. Hinton",
-  booktitle =    ICML07,
-  editor =       ICML07ed,
-  publisher =    ICML07publ,
-  title =        "Three New Graphical Models for Statistical Language
-                 Modelling",
-  pages =        "641--648",
-  year =         "2007",
-}
-
-@InProceedings{Mnih+Hinton-2007-small,
-  author =       "Andriy Mnih and Geoffrey E. Hinton",
-  booktitle =    "ICML 2007",
-  title =        "Three New Graphical Models for Statistical Language
-                 Modelling",
-  year =         "2007",
-}
-
-@InProceedings{Mnih+Hinton-2009,
-  author =       "Andriy Mnih and Geoffrey E. Hinton",
-  booktitle =    NIPS21,
-  editor =       NIPS21ed,
-  title =        {A Scalable Hierarchical Distributed Language Model},
-  pages =        {1081--1088},
-  year =         "2009",
-}
-
-@InProceedings{mohri-pereira-riley96,
-  author =       "M. Mohri and F. C. N. Pereira and M. D. Riley",
-  booktitle =    "ECAI 96, 12th European Conference on Artificial
-                 Intelligence",
-  title =        "Weighted automata in text and speech processing",
-  year =         "1996",
-}
-
-@Article{Mohri96,
-  author =       "M. Mohri",
-  title =        "Finite-State Transducers in Language and Speech
-                 Processing",
-  journal =      "Computational Linguistics",
-  volume =       "20",
-  number =       "1",
-  pages =        "1--33",
-  year =         "1996",
-}
-
-@InProceedings{Molina02,
-  author =       "A. Molina and F. Pla and E. Segarra and L. Moreno",
-  booktitle =    "Proceedings of 3rd International Conference on
-                 Language Resources and Evaluation, {LREC2002}",
-  title =        "Word Sense Disambiguation using Statistical Models
-                 and {WordNet}",
-  address =      "Las Palmas de Gran Canaria, Spain",
-  year =         "2002",
-}
-
-@PhdThesis{moller,
-  author =       "M. M{\o}ller",
-  title =        "Efficient Training of Feed-Forward Neural Networks",
-  school =       "Aarhus University",
-  address =      "Aarhus, Denmark",
-  year =         "1993",
-}
-
-@InProceedings{moller-92,
-  author =       "M. M{\o}ller",
-  booktitle =    "Neural Networks for Signal Processing 2",
-  title =        "Supervised learning on large redundant training sets",
-  publisher =    "IEEE press",
-  year =         "1992",
-}
-
-@InProceedings{Momma2003,
-  author =       "M. Momma and K. P. Bennett",
-  booktitle =    colt03,
-  title =        "Sparse Kernel Partial Least Squares Regression",
-  year =         "2003",
-}
-
-@InProceedings{Montana89,
-  author =       "D. J. Montana and L. Davis",
-  editor =       "N. S. Sridharan",
-  booktitle =    "Eleventh International Joint Conference on Artificial
-                 Intelligence",
-  title =        "Training Feedforward Networks Using Genetic
-                 Algorithms",
-  publisher =    "Morgan Kaufmann, San Mateo",
-  address =      "Detroit 1989",
-  pages =        "762--767",
-  year =         "1989",
-}
-
-@InProceedings{Moody88,
-  author =       "J. Moody and C. Darken",
-  editor =       "D. Touretzky and G. Hinton and T. Sejnowski",
-  booktitle =    cmss88,
-  title =        "Learning with Localized Receptive Fields",
-  publisher =    "Morgan Kaufmann, San Mateo",
-  address =      "Pittsburgh 1988",
-  pages =        "133--143",
-  year =         "1988",
-}
-
-@Article{Moody89,
-  author =       "J. Moody and C. Darken",
-  title =        "Fast Learning in Networks of Locally-Tuned Processing
-                 Units",
-  journal =      nc,
-  volume =       "1",
-  pages =        "281--294",
-  year =         "1989",
-}
-
-@InProceedings{Moody92,
-  author =       "J. E. Moody",
-  editor =       NIPS4ed,
-  booktitle =    NIPS4,
-  title =        "The Effective Number of Parameters: An Analysis of
-                 Generalization and Regularization in Nonlinear Learning
-                 Systems",
-  publisher =    "Morgan Kaufmann",
-  address =      "San Mateo, CA",
-  pages =        "847--854",
-  year =         "1992",
-}
-
-@InProceedings{Moody92b,
-  author =       "J. Moody and J. Utans",
-  editor =       NIPS4ed,
-  booktitle =    NIPS4,
-  title =        "Principled architecture selection for neural networks:
-                 application to corporate bond rating prediction",
-  publisher =    "Morgan Kaufmann",
-  address =      "San Mateo, CA",
-  pages =        "683--690",
-  year =         "1992",
-}
-
-@Article{moody93,
-  author =       "J. Moody and U. Levin and S. Rehfuss",
-  title =        "Predicting the {U.S.} Index of Industrial Production",
-  journal =      "Neural Network World",
-  volume =       "3",
-  number =       "6",
-  pages =        "791--794",
-  year =         "1993",
-}
-
-@InCollection{Moody94,
-  author =       "J. Moody",
-  booktitle =    "From Statistics to Neural Networks: Theory and Pattern
-                 Recognition Applications",
-  title =        "Prediction Risk and Architecture Selection for Neural
-                 Networks",
-  publisher =    "Springer",
-  year =         "1994",
-}
-
-@InCollection{Moody98,
-  author =       "J. Moody",
-  editor =       "G. B. Orr and K.-R. M{\"u}ller",
-  booktitle =    "Neural Networks: Tricks of the Trade",
-  title =        "Forecasting the economy with neural nets: a survey of
-                 challenges",
-  publisher =    "Springer",
-  pages =        "347--372",
-  year =         "1998",
-}
-
-@InProceedings{Moore88,
-  author =       "B. Moore",
-  editor =       "D. Touretzky and G. Hinton and T. Sejnowski",
-  booktitle =    cmss88,
-  title =        "{ART}1 and Pattern Clustering",
-  publisher =    "Morgan Kaufmann, San Mateo",
-  address =      "Pittsburgh 1988",
-  pages =        "174--185",
-  year =         "1988",
-}
-
-@InProceedings{MoosmannF2007,
-  author =       "Frank Moosmann and Bill Triggs and Frederic Jurie",
-  editor =       NIPS19ed,
-  booktitle =    NIPS19,
-  title =        "Fast Discriminative Visual Codebooks using Randomized
-                 Clustering Forests",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  pages =        "985--992",
-  year =         "2007",
-}
-
-@InCollection{More+Wu-1996,
-  author =       "Jorge More and Zhijun Wu",
-  editor =       "G. Di Pillo and F. Giannessi",
-  booktitle =    "Nonlinear Optimization and Applications",
-  title =        "Smoothing techniques for macromolecular global
-                 optimization",
-  publisher =    "Plenum Press",
-  year =         "1996",
-}
-
-@InProceedings{Morgan+Bourlard90b,
-  author =       "N. Morgan and H. Bourlard",
-  editor =       NIPS2ed,
-  booktitle =    NIPS2,
-  title =        "Generalization and parameter estimation in feedforward
-                 nets: some experiments",
-  publisher =    "Morgan Kaufmann",
-  address =      "Denver, CO",
-  pages =        "630--637",
-  year =         "1990",
-}
-
-@InProceedings{Morgan90,
-  author =       "N. Morgan and H. Bourlard",
-  booktitle =    icassp,
-  title =        "Continuous Speech Recognition Using Multilayer
-                 Perceptrons with Hidden {Markov} Models",
-  address =      "Albuquerque, NM",
-  pages =        "413--416",
-  year =         "1990",
-}
-
-@InProceedings{Morgan93,
-  author =       "M. Cohen and H. Franco and N. Morgan and D. Rumelhart
-                 and V. Abrash",
-  editor =       NIPS5ed,
-  booktitle =    NIPS5,
-  title =        "Context-Dependent Multiple Distribution Phonetic
-                 Modeling with {MLP}s",
-  publisher =    "Morgan Kaufmann",
-  address =      "San Mateo, CA",
-  pages =        "649--657",
-  year =         "1993",
-}
-
-@InProceedings{Morgan95,
-  author =       "N. Morgan and Y. Konig and S. L. Wu and H. Bourlard",
-  booktitle =    "Proceedings of IEEE Automatic Speech Recognition
-                 Workshop (Snowbird)",
-  title =        "Transition-based Statistical Training for {ASR}",
-  pages =        "133--134",
-  year =         "1995",
-}
-
-@InProceedings{Morin+Bengio-2005,
-  author =       "Fr\'ed\'eric Morin and Yoshua Bengio",
-  editor =       aistats05ed,
-  booktitle =    aistats05,
-  title =        "Hierarchical Probabilistic Neural Network Language
-                 Model",
-  date =         "Jan 6-8, 2005",
-  location =     "Savannah Hotel, Barbados",
-  pages =        "246--252",
-  year =         "2005",
-}
-
-@Article{Mosesova-2006,
-  author =       "S. A. Mosesova and H. A. Chipman and R. J. MacKay and
-                 S. H. Steiner",
-  title =        "Profile monitoring using mixed effects models",
-  journal =      "Submitted to Technometrics",
-  year =         "2006",
-}
-
-@Article{MosesY1996,
-  author =       "Y. Moses and S. Ullman and S. Edelman",
-  title =        "Generalization to novel images in upright and inverted
-                 faces",
-  journal =      "Perception",
-  volume =       "25",
-  number =       "4",
-  pages =        "443--461",
-  year =         "1996",
-}
-
-@Article{Movellan-2002,
-  author =       "Javier R. Movellan and Paul Mineiro and R. J. Williams",
-  title =        "A Monte-Carlo {EM} approach for partially observable
-                 diffusion processes: theory and applications to neural
-                 networks",
-  journal =      "Neural Computation",
-  volume =       "14",
-  pages =        "1501--1544",
-  year =         "2002",
-}
-
-@TechReport{Movelland+McClelland91,
-  author =       "Javier R. Movellan and James L. McClelland",
-  title =        "Learning Continuous Probability Distributions with the
-                 Contrastive {Hebbian} Algorithm",
-  number =       "PDP.CNS.91.2",
-  institution =  "Carnegie Mellon University, Dept. of Psychology",
-  address =      "Pittsburgh, PA",
-  year =         "1991",
-}
-
-@InCollection{Mozer+Smolensky89,
-  author =       "M. C. Mozer and P. Smolensky",
-  editor =       NIPS1ed,
-  booktitle =    NIPS1,
-  title =        "Skeletonization: {A} technique for trimming the fat
-                 from a network via relevance assessment",
-  publisher =    "Morgan Kaufmann",
-  pages =        "107--115",
-  year =         "1989",
-}
-
-@InProceedings{Mozer-nips92,
-  author =       "M. C. Mozer",
-  editor =       NIPS4ed,
-  booktitle =    NIPS4,
-  title =        "The induction of Multiscale Temporal Structure",
-  publisher =    "Morgan Kaufmann",
-  address =      "San Mateo, CA",
-  pages =        "275--282",
-  year =         "1992",
-}
-
-@Article{mozer-smolensky-89,
-  author =       "M. C. Mozer and P. Smolensky",
-  key =          "Mozer",
-  title =        "Using relevance to reduce network size automatically",
-  journal =      "Connection Science",
-  volume =       "1",
-  number =       "1",
-  pages =        "3--16",
-  year =         "1989",
-}
-
-@Article{Mozer-trnn2000,
-  author =       "M. C. Mozer and R. Wolniewicz and D. B. Grimes and E.
-                 Johnson and H. Kaushansky",
-  title =        "Predicting Subscriber Dissatisfaction and Improving
-                 Retention in the Wireless Telecommunications Industry",
-  journal =      "IEEE Transactions on Neural Networks, special issue on
-                 Data Mining and Knowledge Discovery",
-  volume =       "11",
-  number =       "3",
-  year =         "2000",
-}
-
-@Article{Mozer89,
-  author =       "M. C. Mozer",
-  title =        "A Focused Back-Propagation Algorithm for Temporal
-                 Pattern Recognition",
-  journal =      cs,
-  volume =       "3",
-  pages =        "349--381",
-  year =         "1989",
-}
-
-@InCollection{Mozer93,
-  author =       "M. C. Mozer",
-  editor =       "A. Weigend and N. Gershenfeld",
-  booktitle =    "Predicting the Future and Understanding the Past",
-  title =        "Neural net architectures for temporal sequence
-                 processing",
-  publisher =    "Addison-Wesley",
-  address =      "Redwood City, CA",
-  pages =        "243--264",
-  year =         "1993",
-}
-
-@TechReport{MPIforum,
-  author =       "Jack Dongarra and David Walker and {The Message
-                 Passing Interface Forum}",
-  title =        "{MPI}: {A} Message Passing Interface Standard",
-  number =       "http://www-unix.mcs.anl.gov/mpi",
-  institution =  "University of Tennessee",
-  year =         "1995",
-}
-
-@Article{multidimensional-FGS-83,
-  author =       "J. H. Friedman and E. Grosse and W. Stuetzle",
-  title =        "Multidimensional additive spline approximation",
-  journal =      "SIAM Journal on Scientific and Statistical Computing",
-  volume =       "4",
-  number =       "2",
-  pages =        "291--301",
-  year =         "1983",
-}
-
-@InProceedings{Munro87,
-  author =       "P. Munro",
-  booktitle =    "The Ninth Annual Conference of the Cognitive Science
-                 Society",
-  title =        "A Dual Back-Propagation Scheme for Scalar Reward
-                 Learning",
-  publisher =    "Lawrence Erlbaum, Hillsdale",
-  address =      "Seattle 1987",
-  pages =        "165--176",
-  year =         "1987",
-}
-
-@InProceedings{MurraySal09,
-author=         "Iain Murray and Ruslan Salakhutdinov",
-title=          "Evaluating probabilities under high-dimensional latent variable models",
-editor =        NIPS21ed,
-booktitle=      NIPS21,
-volume=         "21",
-pages =         "1137--1144",
-year=           "2009"
-}
-
-@InProceedings{Murveit93,
-  author =       "H. Murveit and J. Butzberger and V. Digalakis and M.
-                 Weintraub",
-  booktitle =    icassp,
-  title =        "Large-vocabulary dictation using {SRI}'s {DECIPHER}
-                 speech recognition system: Progressive search
-                 techniques",
-  address =      "Minneapolis, Minnesota",
-  pages =        "319--322",
-  year =         "1993",
-}
-
-@Article{Muselli97,
-  author =       "M. Muselli",
-  title =        "On convergence properties of pocket algorithm",
-  journal =      "IEEE Transactions on Neural Networks",
-  volume =       "8",
-  pages =        "623--629",
-  year =         "1997",
-}
-
-@article{Mutch-Lowe-2008,
- author = {Jim Mutch and David G. Lowe}, 
- title = {Object class recognition and localization using sparse features with limited receptive fields}, 
- journal = {International Journal of Computer Vision}, 
- volume = 80, 
- number = 1,
- year = 2008, 
- pages = {45--57},
-}
-
-@Article{myles90multiclass,
-  author =       "J. Myles and D. Hand",
-  title =        "The Multi-Class Measure Problem in Nearest Neighbour
-                 Discrimination Rules",
-  journal =      "Pattern Recognition",
-  volume =       "23",
-  pages =        "1291--1297",
-  year =         "1990",
-}
-
-@Article{Nadal86,
-  author =       "J.-P. Nadal and J.-P. Changeux and G. Toulouse and S.
-                 Dehaene",
-  title =        "Networks of Formal Neurons and Memory Palimpsests",
-  journal =      eul,
-  volume =       "1",
-  pages =        "535--542",
-  year =         "1986",
-}
-
-@Article{Nadaraya64,
-  author =       "E. A. Nadaraya",
-  title =        "On estimating regression",
-  journal =      "Theory of Probability and its Applications",
-  volume =       "9",
-  pages =        "141--142",
-  year =         "1964",
-}
-
-@Article{Nadaraya65,
-  author =       "E. A. Nadaraya",
-  title =        "On nonparametric estimates of density functions and
-                 regression curves",
-  journal =      "Theory of Probability and its Applications",
-  volume =       "10",
-  pages =        "186--190",
-  year =         "1965",
-}
-
-@Article{Nadas85,
-  author =       "Arthur N{\'a}das",
-  title =        "On {Turing's} Formula for Word Probabilities",
-  journal =      "IEEE Transactions on Acoustics, Speech, and Signal
-                 Processing",
-  volume =       "33",
-  number =       "6",
-  pages =        "1415--1417",
-  month =        dec,
-  year =         "1985",
-  copy =         yes,
-}
-
-@Article{Nadas85-small,
-  author =       "Arthur N{\'a}das",
-  title =        "On {Turing's} Formula for Word Probabilities",
-  journal =      "ASSP",
-  volume =       "33",
-  number =       "6",
-  pages =        "1415--1417",
-  month =        dec,
-  year =         "1985",
-  copy =         yes,
-}
-
-@Article{Nadas88,
-  author =       "A. Nadas and D. Nahamoo and M. A. Picheny",
-  title =        "On a model-robust training method for speech
-                 recognition",
-  journal =      "IEEE Transactions on Acoustics, Speech and Signal
-                 Processing",
-  volume =       "ASSP-36",
-  number =       "9",
-  pages =        "1432--1436",
-  year =         "1988",
-}
-
-@Article{Nadeau-Bengio-2003,
-  author =       "Claude Nadeau and Yoshua Bengio",
-  title =        "Inference for the Generalization Error",
-  journal =      "Machine Learning",
-  volume =       "52",
-  number =       "3",
-  pages =        "239--281",
-  year =         "2003",
-}
-
-@Article{Nadeau-Bengio-2003-small,
-  author =       "Claude Nadeau and Yoshua Bengio",
-  title =        "Inference for the Generalization Error",
-  journal =      "Machine Learning",
-  volume =       "52(3)",
-  pages =        "239--281",
-  year =         "2003",
-}
-
-@InProceedings{Nadeau00-nips,
-  author =       "Claude Nadeau and Yoshua Bengio",
-  editor =       NIPS12ed,
-  booktitle =    NIPS12,
-  title =        "Inference for the Generalization Error",
-  publisher =    "MIT Press",
-  pages =        "307--313",
-  year =         "2000",
-}
-
-@InProceedings{Bonneville+al-1998,
-  author =       "M. Bonneville and J. Meunier and Y. Bengio and J. P. Soucy",
-  booktitle =    "SPIE Medical Imaging 1998",
-  title =        "Support Vector Machines for Improving the classification of Brain {PET} Images",
-  address =      "San Diego",
-  year =         "1998",
-}
-
-@TechReport{Nadeau99-TR,
-  author =       "Claude Nadeau and Yoshua Bengio",
-  title =        "Inference for the Generalization Error",
-  institution =  "CIRANO",
-  address =      "Montreal, Quebec, Canada",
-  year =         "1999",
-}
-
-@InProceedings{nag86,
-  author =       "R. Nag and K. H. Wong and F. Fallside",
-  booktitle =    icassp,
-  title =        "Script recognition using hidden {Markov} models",
-  address =      "Tokyo",
-  pages =        "2071--2074",
-  year =         "1986",
-}
-
-@MastersThesis{Nahm-2005,
- author = {E. Nahm},
- title = {Classification models for transactional graph data},
- school = {Department of Mathematics and Statistics, Acadia University},
- year = 2005,
-}
-
-@article{Naka-Rushton-1966a,
- author = {K.I. Naka and W.A.H. Rushton},
- year = 1966,
- title = {{S}-potentials from colour units in the retina of fish (Cyprinidae)},
- journal = {J. Physiol.},
- volume = 185,
- pages = {536--555},
-}
-
-@article{Naka-Rushton-1966b,
- author = {K.I. Naka and W.A.H. Rushton},
- year = 1966,
- title = {An attempt to analyse colour perception by electrophysiology},
- journal = {J. Physiol.},
- volume = 185,
- pages = {556--586},
-}
-
-
-@InProceedings{NakagawaT04,
-  author =       "Tetsuji Nakagawa and Taku Kudoh and Yuji Matsumoto",
-  booktitle =    "Proceedings of the Sixth Natural Language Processing
-                 Pacific Rim Symposium",
-  title =        "Unknown Word Guessing and Part-of-Speech Tagging Using
-                 Support Vector Machines",
-  address =      "Tokyo, Japan",
-  pages =        "325--331",
-  year =         "2001",
-}
-
-@Article{Naradraya70,
-  author =       "E. A. Nadaraya",
-  title =        "Remarks on nonparametric estimates for density
-                 functions and regression curves",
-  journal =      "Theory of Probability and its Applications",
-  volume =       "15",
-  pages =        "134--137",
-  year =         "1970",
-}
-
-@Book{Narendra89,
-  author =       "K. Narendra and M. A. L. Thathachar",
-  title =        "Learning Automata: An Introduction",
-  publisher =    "Prentice-Hall",
-  address =      "Englewood Cliffs",
-  year =         "1989",
-}
-
-@Book{narendra:1989,
-  author =       "K. S. Narendra and M. A. L. Thathachar",
-  title =        "Learning Automata: an introduction",
-  publisher =    "Prentice Hall",
-  year =         "1989",
-}
-
-@Article{Nasrabadi88a,
-  author =       "N. M. Nasrabadi and R. A. King",
-  title =        "Image Coding Using Vector Quantization: {A} Review",
-  journal =      ieeetcomm,
-  volume =       "36",
-  pages =        "957--971",
-  year =         "1988",
-}
-
-@InProceedings{Nasrabadi88b,
-  author =       "N. M. Nasrabadi and Y. Feng",
-  booktitle =    icnn,
-  title =        "Vector Quantization of Images Based upon the Kohonen
-                 Self-Organizing Feature Maps",
-  volume =       "1",
-  publisher =    "IEEE, New York",
-  address =      "San Diego 1988",
-  pages =        "101--108",
-  year =         "1988",
-}
-
-@Article{Nass75,
-  author =       "M. M. Nass and L. N. Cooper",
-  title =        "A Theory for the Development of Feature Detecting
-                 Cells in Visual Cortex",
-  journal =      biocyb,
-  volume =       "19",
-  pages =        "1--18",
-  year =         "1975",
-}
-
-@Article{Naylor88,
-  author =       "J. Naylor and K. P. Li",
-  title =        "Analysis of a Neural Network Algorithm for Vector
-                 Quantization of Speech Parameters",
-  journal =      nnsupp,
-  volume =       "1",
-  pages =        "310",
-  year =         "1988",
-}
-
-@Article{NC:Baldi93,
-  author =       "P. Baldi and Y. Chauvin",
-  title =        "Neural Networks for Fingerprint Recognition",
-  journal =      "Neural Computation",
-  volume =       "5",
-  type =         "Letter",
-  number =       "3",
-  pages =        "402--418",
-  year =         "1993",
-}
-
-@Article{nc:Geman+Bienenstock+Doursat:1992,
-  author =       "S. Geman and E. Bienenstock and R. Doursat",
-  title =        "Neural Networks and the Bias/Variance Dilemma",
-  journal =      "Neural Computation",
-  volume =       "4",
-  type =         "View",
-  number =       "1",
-  pages =        "1--58",
-  year =         "1992",
-}
-
-@Article{nc:Poggio+Girosi:1998,
-  author =       "Tomaso Poggio and Federico Girosi",
-  title =        "A Sparse Representation for Function Approximation",
-  journal =      "Neural Computation",
-  volume =       "10",
-  number =       "6",
-  pages =        "1445--1454",
-  year =         "1998",
-}
-
-@TechReport{Neal-GP97,
-  author =       "Radford M. Neal",
-  title =        "Monte Carlo implementation of {G}aussian process models
-                 for {Bayesian} regression and classification",
-  number =       "9702",
-  institution =  "University of Toronto, Department of Statistics",
-  year =         "1997",
-}
-
-@Article{Neal92,
-  author =       "Radford M. Neal",
-  title =        "Connectionist learning of belief networks",
-  journal =      "Artificial Intelligence",
-  volume =       "56",
-  pages =        "71--113",
-  year =         "1992",
-}
-
-@InProceedings{Neal93a,
-  author =       "Radford M. Neal",
-  editor =       NIPS5ed,
-  booktitle =    NIPS5,
-  title =        "Bayesian learning via stochastic dynamics",
-  address =      "Denver, CO",
-  pages =        "475--482",
-  year =         "1993",
-}
-
-@TechReport{Neal93b,
-  author =       "Radford M. Neal",
-  title =        "Probabilistic inference using {Markov} chain
-                 {Monte-Carlo} methods",
-  number =       "{CRG-TR}-93-1",
-  institution =  "Dept. of Computer Science, University of Toronto",
-  year =         "1993",
-}
-
-@PhdThesis{Neal94,
-  author =       "Radford M. Neal",
-  title =        "Bayesian Learning for Neural Networks",
-  school =       "Dept. of Computer Science, University of Toronto",
-  year =         "1994",
-}
-
-@TechReport{Neal94b,
-  author =       "Radford M. Neal",
-  title =        "Sampling from Multimodal Distributions Using Tempered Transitions",
-  number =       "9421",
-  institution =  "Dept. of Statistics, University of Toronto",
-  year =         "1994",
-}
-
-@InCollection{Neal98,
-  author =       "Radford M. Neal",
-  editor =       "C. M. Bishop",
-  booktitle =    "Neural Networks and Machine Learning",
-  title =        "Assessing relevance determination methods using
-                 {DELVE}",
-  publisher =    "Springer-Verlag",
-  pages =        "97--129",
-  year =         1998,
-}
-
-@Misc{neal98assessing,
-  author =       "Radford M. Neal",
-  title =        "Assessing Relevance Determination Methods Using
-                 {DELVE} Generalization in Neural Networks and Machine
-                 Learning",
-  year =         "1998",
-  text =         "Neal, R. M. (1998). Assessing Relevance Determination
-                 Methods Using DELVE Generalization in Neural Networks
-                 and Machine Learning, C. M. Bishop (editor),
-                 Springer-Verlag.",
-}
-
-@article{Neal-2001,
-  author =      "Radford M. Neal",
-  journal =     "Statistics and Computing",
-  month =       "April",
-  number =      "2",
-  pages =       "125--139",
-  title =       "Annealed importance sampling",
-  url =         "http://dx.doi.org/10.1023/A:1008923215028",
-  volume =      "11",
-  year =        "2001"
-}
-
-@Article{Needleman+Wunsch70,
-  author =       "S. B. Needleman and C. D. Wunsch",
-  title =        "A general method applicable to the search of
-                 similarities in the amino acid sequence of two
-                 proteins",
-  journal =      "Journal of Molecular Biology",
-  volume =       "48",
-  pages =        "443--453",
-  year =         "1970",
-}
-
-@Article{NeweyWest1987,
-  author =       "W. Newey and K. West",
-  title =        "A Simple, Positive Semi-Definite, Heteroscedasticity
-                 and Autocorrelation Consistent Covariance Matrix",
-  journal =      "Econometrica",
-  volume =       "55",
-  pages =        "703--708",
-  year =         "1987",
-}
-
-@InProceedings{Ney+Kneser93,
-  author =       "Hermann Ney and Reinhard Kneser",
-  booktitle =    "European Conference on Speech Communication and
-                 Technology (Eurospeech)",
-  title =        "Improved clustering techniques for class-based
-                 statistical language modelling",
-  address =      "Berlin",
-  pages =        "973--976",
-  year =         "1993",
-}
-
-@Article{Ney92,
-  author =       "H. Ney and D. Mergel and A. Noll and A. Paesler",
-  title =        "Data driven search organization for continuous speech
-                 recognition",
-  journal =      "IEEE Transactions on Signal Processing",
-  volume =       "40",
-  number =       "2",
-  pages =        "272--281",
-  month =        feb,
-  year =         "1992",
-}
-
-@InProceedings{Ng1996,
-  author =       "Hwee Tou Ng and Hian Beng Lee",
-  editor =       "Aravind Joshi and Martha Palmer",
-  booktitle =    "Proceedings of the Thirty-Fourth Annual Meeting of the
-                 Association for Computational Linguistics",
-  title =        "Integrating Multiple Knowledge Sources to Disambiguate
-                 Word Sense: An Exemplar-Based Approach",
-  publisher =    "Morgan Kaufmann Publishers",
-  address =      "San Francisco",
-  pages =        "40--47",
-  year =         "1996",
-  URL =          "citeseer.nj.nec.com/ng96integrating.html",
-}
-
-@InProceedings{Ng1997,
-  author =       "Hwee Tou Ng",
-  booktitle =    SIGLEX97,
-  title =        "Getting Serious about Word Sense Disambiguation",
-  address =      "Washington",
-  pages =        "1--7",
-  year =         "1997",
-}
-
-@InProceedings{Ng2002,
-  author =       "Andrew Y. Ng and Michael I. Jordan and Yair Weiss",
-  editor =       NIPS14ed,
-  booktitle =    NIPS14,
-  title =        "On Spectral Clustering: analysis and an algorithm",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  year =         "2002",
-  original =     "orig/AA35.ps",
-}
-
-@InProceedings{Ng2008,
-  author =       "Honglak Lee and Chaitanya Ekanadham and Andrew Y. Ng",
-  editor =       NIPS20ed,
-  booktitle =    NIPS20,
-  title =        "Sparse deep belief net model for visual area {V2}",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  year =         "2008",
-}
-
-@InProceedings{NgJ02,
-  author =       "Andrew Y. Ng and Michael I. Jordan",
-  booktitle =    NIPS14,
-  editor =       NIPS14ed,
-  title =        {On Discriminative vs. Generative Classifiers: A
-                 comparison of logistic regression and naive Bayes},
-  pages =        "841--848",
-  year =         "2002",
-}
-
-%%Fred: I deprecate the following entry because its key uses the year of the conference rather than the year of the paper.
-@InProceedings{NgJ01,
-  author =       "Andrew Y. Ng and Michael I. Jordan",
-  booktitle =    NIPS14,
-  editor =       NIPS14ed,
-  title =        {On Discriminative vs. Generative Classifiers: A
-                 comparison of logistic regression and naive Bayes},
-  pages =        "841--848",
-  year =         "2002",
-}
-
-@InProceedings{Nie99,
-  author =       "J. Y. Nie and M. Simard and P. Isabelle and R.
-                 Durand",
-  booktitle =    "22nd ACM-SIGIR",
-  title =        "Cross-Language Information Retrieval based on Parallel
-                 Texts and Automatic Mining of Parallel Texts in the
-                 Web",
-  address =      "Berkeley",
-  pages =        "74--81",
-  year =         "1999",
-}
-
-@INPROCEEDINGS{Niebles+Fei-Fei-2007,
-  AUTHOR =       "Niebles, J.C. and Fei-Fei, L.",
-  TITLE =        "A hierarchical model of shape and appearance for human action classification",
-  BOOKTITLE =    cvpr07,
-  YEAR =         "2007",
-}
-
-@Article{Nielsen96,
-  author =       "H. Nielsen and J. Engelbrecht and G. {von Heijne} and
-                 S. Brunak",
-  title =        "Defining a similarity threshold for a functional
-                 protein sequence pattern: the signal peptide cleavage
-                 site",
-  journal =      "Proteins",
-  pages =        "316--320",
-  year =         "1996",
-  volume =       "24",
-}
-
-@Article{Nielsen97,
-  author =       "H. Nielsen and J. Engelbrecht and S. Brunak and G.
-                 {von Heijne}",
-  title =        "Identification of prokaryotic and eukaryotic signal
-                 peptides and prediction of their cleavage sites",
-  journal =      "Prot. Eng.",
-  pages =        "1--6",
-  year =         "1997",
-  volume =       "10",
-}
-
-@InProceedings{Niesler98,
-  author =       "T. R. Niesler and E. W. D. Whittaker and P. C.
-                 Woodland",
-  booktitle =    icassp,
-  title =        "Comparison of part-of-speech and automatically derived
-                 category-based language models for speech recognition",
-  pages =        "177--180",
-  year =         "1998",
-}
-
-@InProceedings{Niles90,
-  author =       "L. T. Niles and H. F. Silverman",
-  booktitle =    icassp,
-  title =        "Combining Hidden {Markov} Models and Neural Network
-                 Classifiers",
-  address =      "Albuquerque, NM",
-  pages =        "417--420",
-  year =         "1990",
-}
-
-@Book{Nilsson-65,
-  author =       "N. J. Nilsson",
-  title =        "Learning Machines",
-  publisher =    "McGraw-Hill",
-  address =      "New York",
-  year =         "1965",
-}
-
-@Book{Nilsson-71,
-  author =       "N. J. Nilsson",
-  title =        "Problem-Solving Methods in Artificial Intelligence",
-  publisher =    "McGraw-Hill",
-  address =      "New York",
-  year =         "1971",
-}
-
-@InProceedings{nips-10:Baxter+Bartlett:1998,
-  author =       "Jonathan Baxter and Peter Bartlett",
-  editor =       NIPS10ed,
-  booktitle =    NIPS10,
-  title =        "The Canonical Distortion Measure in Feature Space and
-                 1-{NN} Classification",
-  publisher =    "MIT Press",
-  year =         "1998",
-}
-
-@InProceedings{nips-10:Holger+Yoshua:1998,
-  author =       "Holger Schwenk and Yoshua Bengio",
-  editor =       NIPS10ed,
-  booktitle =    NIPS10,
-  title =        "Training Methods for Adaptive Boosting of Neural
-                 Networks",
-  publisher =    "MIT Press",
-  pages =        "647--653",
-  year =         "1998",
-}
-
-@InProceedings{nips-6:Perrone:1994,
-  author =       "Michael P. Perrone",
-  editor =       NIPS6ed,
-  booktitle =    NIPS6,
-  title =        "Putting It All Together: Methods for Combining Neural
-                 Networks",
-  publisher =    "Morgan Kaufmann Publishers, Inc.",
-  pages =        "1188--1189",
-  year =         "1994",
-}
-
-@InProceedings{nips-9:Burges+Schoelkopf:1997,
-  author =       "Chris J. C. Burges and B. Sch{\"o}lkopf",
-  editor =       NIPS9ed,
-  booktitle =    NIPS9,
-  title =        "Improving the Accuracy and Speed of Support Vector
-                 Machines",
-  publisher =    "MIT Press",
-  pages =        "375",
-  year =         "1997",
-}
-
-@InProceedings{nips02-LT09,
-  author =       "G. Lebanon and J. Lafferty",
-  editor =       NIPS14ed,
-  booktitle =    NIPS14,
-  title =        "Boosting and Maximum Likelihood for Exponential
-                 Models",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  year =         "2002",
-  original =     "orig/LT09.ps",
-}
-
-@InCollection{NIPS2005-207,
-  author =       "Jian Zhang and Zoubin Ghahramani and Yiming Yang",
-  editor =       NIPS18ed,
-  booktitle =    NIPS18,
-  title =        "Learning Multiple Related Tasks using Latent
-                 Independent Component Analysis",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  pages =        "1587--1594",
-  year =         "2006",
-}
-
-@InCollection{NIPS2007-812-small,
-  author =       "Nicolas Chapados and Yoshua Bengio",
-  booktitle =    "NIPS 20",
-  title =        "Augmented Functional Time Series Representation and
-                 Forecasting with {G}aussian Processes",
-  pages =        "265--272",
-  year =         "2008",
-}
-
-@InCollection{NIPS2007-925-small,
-  author =       "Nicolas {Le Roux} and Yoshua Bengio and Pascal Lamblin
-                 and Marc Joliveau and Balazs Kegl",
-  booktitle =    "NIPS 20",
-  title =        "Learning the 2-{D} Topology of Images",
-  pages =        "841--848",
-  year =         "2008",
-}
-
-@InProceedings{NIPS8:Drucker:AdaBoost-Trees,
-  author =       "Harris Drucker and Corinna Cortes",
-  editor =       NIPS8ed,
-  booktitle =    NIPS8,
-  title =        "Boosting decision trees",
-  publisher =    "MIT Press",
-  pages =        "479--485",
-  year =         "1996",
-}
-
-@InProceedings{NIPS8:Hofmann-Tresp,
-  author =       "Reimar Hofmann and Volker Tresp",
-  editor =       NIPS8ed,
-  booktitle =    NIPS8,
-  title =        "Discovering structure in continuous variables using
-                 {Bayesian} networks",
-  publisher =    "MIT Press",
-  pages =        "500--506",
-  year =         "1996",
-}
-
-@InProceedings{NIPS9:Monti-Cooper,
-  author =       "Stefano Monti and Gregory F. Cooper",
-  editor =       NIPS9ed,
-  booktitle =    NIPS9,
-  title =        "Learning {Bayesian} belief networks with neural
-                 network estimators",
-  publisher =    "MIT Press",
-  pages =        "578--584",
-  year =         "1997",
-}
-
-@Article{Niranjan90,
-  author =       "M. Niranjan and F. Fallside",
-  title =        "Neural Networks and Radial Basis Functions in
-                 Classifying Static Speech Patterns",
-  journal =      cspla,
-  volume =       "4",
-  pages =        "275--289",
-  year =         "1990",
-}
-
-@Article{Nishimori90,
-  author =       "H. Nishimori and T. Nakamura and M. Shiino",
-  title =        "Retrieval of Spatio-Temporal Sequence in Asynchronous
-                 Neural Network",
-  journal =      prA,
-  volume =       "41",
-  pages =        "3346--3354",
-  year =         "1990",
-}
-
-@book{Nixon+Aguado+2007,
-    author = {Nixon, M. S.  and Aguado, A. S. },
-    publisher = {Academic Press},
-    edition = 2,
-    title = {Feature Extraction and Image Processing},
-    year = {2007}
-}
-
-@Article{nonparametric-LZ-95,
-  author =       "G. Lugosi and K. Zeger",
-  title =        "Nonparametric Estimation via Empirical Risk
-                 Minimization",
-  journal =      "IEEE Trans. on Information Theory",
-  volume =       "41",
-  number =       "3",
-  pages =        "677--687",
-  year =         "1995",
-}
-
-@Article{nonparametric-SK-96,
-  author =       "M. Smith and R. Kohn",
-  title =        "Nonparametric regression using {Bayesian} variable
-                 selection",
-  journal =      "J.Econometrics",
-  volume =       "75",
-  pages =        "317--344",
-  year =         "1996",
-}
-
-@InProceedings{nonparametric-W-91,
-  author =       "H. White",
-  booktitle =    "Proceedings of 23rd Symposium on the Interface,
-                 Computer Science and Statistics",
-  title =        "Nonparametric Estimation of Conditional Quantiles
-                 Using Neural Networks",
-  publisher =    "New-York: Springer-Verlag",
-  pages =        "190--199",
-  year =         "1992",
-}
-
-@Article{NordStrom,
-  author =       "T. Nordstrom and B. Svensson",
-  title =        "Using and Designing Massively Parallel Computers for
-                 Artificial Neural Networks",
-  journal =      "Journal of Parallel and Distributed Computing",
-  volume =       "14",
-  number =       "3",
-  pages =        "260--285",
-  year =         "1992",
-  OPTnote =      "",
-}
-
-@Article{Normandin94,
-  author =       "Y. Normandin and R. Cardin and R. {DeMori}",
-  title =        "High-performance connected digit recognition using
-                 maximum mutual information estimation",
-  journal =      "Transactions on Speech and Audio Processing",
-  volume =       "2",
-  number =       "2",
-  pages =        "299--311",
-  year =         "1994",
-}
-
-@InProceedings{Nowlan-nips90,
-  author =       "S. J. Nowlan",
-  editor =       NIPS2ed,
-  booktitle =    NIPS2,
-  title =        "Maximum Likelihood Competitive Learning",
-  publisher =    "Morgan Kaufmann Publishers",
-  address =      "San Mateo, CA",
-  pages =        "574--582",
-  year =         "1990",
-}
-
-@InProceedings{Nowlan-nips92,
-  author =       "S. J. Nowlan and G. E. Hinton",
-  editor =       NIPS4ed,
-  booktitle =    NIPS4,
-  title =        "Adaptive Soft Weight Tying using {G}aussian Mixtures",
-  publisher =    "Morgan Kaufmann",
-  address =      "San Mateo, CA",
-  pages =        "993--1000",
-  year =         "1992",
-}
-
-@PhdThesis{Nowlan-PhD,
-  author =       "S. J. Nowlan",
-  title =        "Soft Competitive Adaptation: Neural Network Learning
-                 Algorithms based on Fitting Statistical Mixtures",
-  type =         "{C}{M}{U}-{C}{S}-91-126",
-  school =       "School of Computer Science, Carnegie Mellon
-                 University",
-  address =      "Pittsburgh, PA",
-  month =        apr # " 14",
-  year =         "1991",
-}
-
-@Article{Nowlan88,
-  author =       "S. J. Nowlan",
-  title =        "Gain Variation in Recurrent Error Propagation
-                 Networks",
-  journal =      cs,
-  volume =       "2",
-  pages =        "305--320",
-  year =         "1988",
-}
-
-@TechReport{Nowlan90,
-  author =       "S. J. Nowlan",
-  key =          "Nowlan",
-  title =        "Competing Experts: {An} experimental investigation of
-                 associative mixture models",
-  type =         "Technical Report",
-  number =       "CRG-TR-90-5",
-  institution =  "University of Toronto",
-  year =         "1990",
-  annote =       "In CRG Library",
-}
-
-@Article{Nowlan92,
-  author =       "S. J. Nowlan and G. E. Hinton",
-  title =        "Simplifying Neural Networks by Soft Weight-Sharing",
-  journal =      "Neural Computation",
-  volume =       "4",
-  type =         "Letter",
-  number =       "4",
-  pages =        "473--493",
-  year =         "1992",
-}
-
-@InProceedings{nsvnijcnn,
-  author =       "Pascal Vincent and Yoshua Bengio",
-  booktitle =    ijcnn,
-  title =        "A Neural Support Vector Network Architecture with
-                 Adaptive Kernels",
-  volume =       "5",
-  pages =        "5187--5192",
-  year =         "2000",
-}
-
-@Book{NumOptBook,
-  author =       "J. Nocedal and S. Wright",
-  title =        "Numerical Optimization",
-  publisher =    "Springer",
-  year =         "2006",
-}
-
-@Article{Nystrom-1928,
-  author =       "E. J. Nystr{\"o}m",
-  title =        "{\"{U}}ber die praktische aufl{\"o}sung von linearen
-                 integralgleichungen mit anwendungen auf
-                 randwertaufgaben der potentialtheorie",
-  journal =      "Commentationes Physico-Mathematicae",
-  volume =       "4",
-  number =       "15",
-  pages =        "1--52",
-  year =         "1928",
-}
-
-@Book{O'Shaughnessy87,
-  author =       "D. O'Shaughnessy",
-  title =        "Speech Communication --- Human and Machine",
-  publisher =    "Addison-Wesley",
-  year =         "1987",
-}
-
-@Article{Oja82,
-  author =       "E. Oja",
-  title =        "A Simplified Neuron Model As a Principal Component
-                 Analyzer",
-  journal =      jmathb,
-  volume =       "15",
-  pages =        "267--273",
-  year =         "1982",
-}
-
-@Article{Oja85,
-  author =       "E. Oja and J. Karhunen",
-  title =        "On Stochastic Approximation of the Eigenvectors and
-                 Eigenvalues of the Expectation of a Random Matrix",
-  journal =      jama,
-  volume =       "106",
-  pages =        "69--84",
-  year =         "1985",
-}
-
-@Article{Oja89,
-  author =       "E. Oja",
-  title =        "Neural Networks, Principal Components, and Subspaces",
-  journal =      "International Journal of Neural Systems",
-  volume =       "1",
-  pages =        "61--68",
-  year =         "1989",
-}
-
-@Article{Olshausen+Field-1996,
-  author =       "Bruno A. Olshausen and David J. Field",
-  title =        {Emergence of simple-cell receptive field properties by learning a sparse code for natural images},
-  journal =      "Nature",
-  volume =       381,
-  pages =        {607--609},
-  year =         "1996",
-}
-
-@Article{Olshausen-97,
-  author =       "B. A. Olshausen and D. J. Field",
-  title =        "Sparse coding with an overcomplete basis set: a
-                 strategy employed by {V}1?",
-  journal =      "Vision Research",
-  volume =       "37",
-  pages =        "3311--3325",
-  year =         "1997",
-  url =          {http://view.ncbi.nlm.nih.gov/pubmed/9425546},
-  keywords = {sparse-coding, v1, vision},
-  month = {December},
-}
-
-@article{olshausen:2005,
-    author = {Bruno Olshausen and David J. Field},
-    title = {How Close are We to Understanding {V1}?},
-    journal = {Neural Computation},
-    volume = {17},
-    pages = {1665--1699},
-    year = {2005},
-}
-
-
-@InProceedings{Omlin-ml92,
-  author =       "C. W. Omlin and C. L. Giles",
-  editor =       "D. Sleeman and P. Edwards",
-  booktitle =    "Machine Learning: Proc. of the Ninth Int. Conference",
-  title =        "Training Second-Order Recurrent Neural Networks using
-                 Hints",
-  publisher =    "Morgan Kaufmann",
-  address =      "San Mateo, CA",
-  year =         "1992",
-}
-
-@InProceedings{Omohundro96,
-  author =       "S. Omohundro",
-  editor =       NIPS8ed,
-  booktitle =    NIPS8,
-  title =        "Family Discovery",
-  publisher =    "MIT Press, Cambridge, MA",
-  year =         "1996",
-}
-
-@InProceedings{Ong-Smola-2003,
-  author =       "C. S. Ong and A. J. Smola",
-  booktitle =    ICML03,
-  editor =       ICML03ed,
-  publisher =    ICML03publ,
-  title =        "Machine learning using hyperkernels",
-  year =         "2003",
-}
-
-@Article{Opper90,
-  author =       "M. Opper and W. Kinzel and J. Kleinz and R. Nehl",
-  title =        "On the Ability of the Optimal Perceptron to
-                 Generalize",
-  journal =      jpa,
-  volume =       "23",
-  pages =        "L581--L586",
-  year =         "1990",
-}
-
-@Article{Orland85,
-  author =       "H. Orland",
-  title =        "Mean-Field Theory for Optimization Problems",
-  journal =      jppl,
-  volume =       "46",
-  pages =        "763--770",
-  year =         "1985",
-}
-
-@InProceedings{ormo-nips99,
-  author =       "D. Ormoneit and T. Hastie",
-  editor =       NIPS12ed,
-  booktitle =    NIPS12,
-  title =        "Optimal Kernel Shapes for Local Linear Regression",
-  publisher =    "MIT Press",
-  year =         "2000",
-}
-
-@Article{Orponen94,
-  author =       "Pekka Orponen",
-  title =        "Computational complexity of neural networks: a
-                 survey",
-  journal =      "Nordic Journal of Computing",
-  volume =       "1",
-  number =       "1",
-  pages =        "94--110",
-  month =        "Spring",
-  year =         "1994",
-  URL =          "citeseer.ist.psu.edu/article/orponen95computational.html",
-}
-
-@Book{Ortega70,
-  author =       "J. M. Ortega and W. C. Rheinboldt",
-  title =        "Iterative Solution of Non-linear Equations in Several
-                 Variables and Systems",
-  publisher =    "Academic Press",
-  address =      "New York",
-  year =         "1970",
-  OPTnote =      "",
-}
-
-@Book{Ortega70a,
-  author =       "J. M. Ortega and W. C. Rheinboldt",
-  title =        "Iterative Solution of Non-linear Equations in Several
-                 Variables and Systems",
-  publisher =    "Academic Press",
-  address =      "New York",
-  year =         "1970",
-}
-
-@InProceedings{Osindero+Hinton-2008,
-  author =       "Simon Osindero and Geoffrey E. Hinton",
-  editor =       NIPS20ed,
-  booktitle =    NIPS20,
-  title =        {Modeling image patches with a directed hierarchy of
-                 {Markov} random fields},
-  publisher =    {MIT Press},
-  address =      {Cambridge, MA},
-  pages =        {1121--1128},
-  year =         "2008",
-}
-
-@InProceedings{Osindero+Hinton-2008-small,
-  author =       "S. Osindero and G. Hinton",
-  booktitle =    "NIPS 20",
-  title =        {Modeling image patches with a directed hierarchy of
-                 {Markov} random fields},
-  year =         "2008",
-}
-
-@Article{Osindero+Welling+Hinton-05,
-  author =       "Simon Osindero and Max Welling and Geoffrey E. Hinton",
-  title =        "Topographic Product Models Applied To Natural Scene
-                 Statistics",
-  journal =      "Neural Computation",
-  volume =       "18",
-  pages =        "381--414",
-  year =         "2005",
-}
-
-@Article{OsinderoS2006,
-  author =       "Simon Osindero and Max Welling and Geoffrey E.
-                 Hinton",
-  title =        "Topographic Product Models Applied to Natural Scene
-                 Statistics",
-  journal =      "Neural Computation",
-  volume =       "18",
-  number =       "2",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA, USA",
-  pages =        "381--414",
-  year =         "2006",
-  ISSN =         "0899-7667",
-}
-
-@Article{OsinderoS2006-small,
-  author =       "Simon Osindero and Max Welling and Geoffrey E. Hinton",
-  title =        "Topographic Product Models Applied to Natural Scene
-                 Statistics",
-  journal =      "Neural Computation",
-  volume =       "18",
-  number =       "2",
-  pages =        "381--414",
-  year =         "2006",
-}
-
-@InProceedings{Ott76,
-  author =       "R. Ott",
-  booktitle =    "Third International Joint Conference on Pattern
-                 Recognition",
-  title =        "Construction of quadratic polynomial classifiers",
-  publisher =    "IEEE, CA",
-  address =      "Coronado, CA",
-  pages =        "161--165",
-  year =         "1976",
-}
-
-@article{OttJ1976b,
-     title = {Some Classification Procedures for Multivariate Binary Data Using Orthogonal Functions},
-     author = {Ott, Jurg and Kronmal, Richard A.},
-     journal = {Journal of the American Statistical Association},
-     volume = {71},
-     number = {354},
-     pages = {391--399},
-     year = {1976},
-     publisher = {American Statistical Association},    
-     copyright = {Copyright © 1976 American Statistical Association},
-    }
-
-
-@InProceedings{Ouimet+Bengio-2005,
-  author =       "Marie Ouimet and Yoshua Bengio",
-  editor =       aistats05ed,
-  booktitle =    aistats05,
-  title =        "Greedy Spectral Embedding",
-  publisher =    "",
-  date =         "Jan 6-8, 2005",
-  location =     "Savannah Hotel, Barbados",
-  pages =        "253--260",
-  year =         "2005",
-}
-
-@InProceedings{Owens89,
-  author =       "A. J. Owens and D. L. Filkin",
-  booktitle =    ijcnn,
-  title =        "Efficient Training of the Back Propagation Network by
-                 Solving a System of Stiff Ordinary Differential
-                 Equations",
-  volume =       "2",
-  publisher =    "IEEE, New York",
-  address =      "Washington 1989",
-  pages =        "381--386",
-  year =         "1989",
-}
-
-@InProceedings{Paccanaro2000,
-  author =       "A. Paccanaro and G. E. Hinton",
-  booktitle =    ijcnn,
-  title =        "Extracting Distributed Representations of Concepts and
-                 Relations from Positive and Negative Propositions",
-  publisher =    "IEEE, New York",
-  address =      "Como, Italy",
-  year =         "2000",
-}
-
-@Article{Packard80,
-  author =       "N. H. Packard and J. P. Crutchfield and J. D. Farmer
-                 and R. S. Shaw",
-  title =        "Geometry from a Time Series",
-  journal =      prl,
-  volume =       "45",
-  pages =        "712--716",
-  year =         "1980",
-}
-
-@misc{Pal+al-2006,
-    author = {Chris Pal and Michael Kelm and Xuerui Wang and Greg Druck and Andrew McCallum},
-    title = {On Discriminative and Semi-Supervised Dimensionality Reduction},
-    year = {2006},
-    note = {Workshop on Novel Applications of Dimensionality Reduction, NIPS'06},
-}
-
-@InCollection{Palmer88,
-  author =       "R. G. Palmer",
-  editor =       "P. W. Anderson and K. J. Arrow and D. Pines",
-  booktitle =    "The Economy As an Evolving Complex System",
-  title =        "Statistical Mechanics Approaches to Complex
-                 Optimization Problems",
-  volume =       "5",
-  publisher =    "Addison-Wesley",
-  address =      "Redwood City",
-  pages =        "177--193",
-  year =         "1988",
-  series =       "SFI Studies in the Sciences of Complexity:
-                 Proceedings",
-}
-
-@InCollection{Palmer89,
-  author =       "R. G. Palmer",
-  editor =       "D. L. Stein",
-  booktitle =    "Lectures in the Sciences of Complexity",
-  title =        "Neural Nets",
-  volume =       "1",
-  publisher =    "Addison-Wesley",
-  address =      "Redwood City",
-  pages =        "439--461",
-  year =         "1989",
-  series =       "SFI Studies in the Sciences of Complexity: Lectures",
-}
-
-@Book{Papadimitriou,
-  author =       "C. H. Papadimitriou and K. Steiglitz",
-  title =        "Combinatorial Optimization: Algorithms and
-                 Complexity",
-  publisher =    "Prentice-Hall",
-  address =      "Englewood Cliffs, NJ",
-  year =         "1982",
-}
-
-@Book{Papadimitriou82,
-  author =       "C. H. Papadimitriou and K. Steiglitz",
-  title =        "Combinatorial Optimization: Algorithms and
-                 Complexity",
-  publisher =    "Prentice-Hall",
-  address =      "Englewood Cliffs",
-  year =         "1982",
-}
-
-@Article{Parga86,
-  author =       "N. Parga and M. A. Virasoro",
-  title =        "The Ultrametric Organization of Memories in a Neural
-                 Network",
-  journal =      jpp,
-  volume =       "47",
-  pages =        "1857--1864",
-  year =         "1986",
-}
-
-@Article{Parisi86,
-  author =       "G. Parisi",
-  title =        "Asymmetric Neural Networks and the Process of
-                 Learning",
-  journal =      jpa,
-  volume =       "19",
-  pages =        "L675--L680",
-  year =         "1986",
-}
-
-@Book{Parisi88,
-  author =       "G. Parisi",
-  title =        "Statistical Field Theory",
-  publisher =    "Addison-Wesley",
-  address =      "Redwood City, CA",
-  year =         "1988",
-}
-
-@Article{Park-nc91,
-  author =       "J. Park and I. W. Sandberg",
-  title =        "Universal Approximation Using Radial-Basis-Function
-                 Networks",
-  journal =      nc,
-  volume =       "3",
-  number =       "2",
-  pages =        "246--257",
-  year =         "1991",
-}
-
-@TechReport{Parker85,
-  author =       "D. B. Parker",
-  title =        "Learning Logic",
-  number =       "TR--47",
-  institution =  "Center for Computational Research in Economics and
-                 Management Science, Massachusetts Institute of
-                 Technology",
-  address =      "Cambridge, MA",
-  year =         "1985",
-}
-
-@InProceedings{Parker87,
-  author =       "D. B. Parker",
-  editor =       "M. Caudill and C. Butler",
-  booktitle =    icnn,
-  title =        "Optimal Algorithms for Adaptive Networks: Second Order
-                 Back Propagation, Second Order Direct Propagation, and
-                 Second Order Hebbian Learning",
-  volume =       "2",
-  publisher =    "IEEE, New York",
-  address =      "San Diego 1987",
-  pages =        "593--600",
-  year =         "1987",
-}
-
-@InProceedings{Parks87,
-  author =       "M. Parks",
-  editor =       "M. Caudill and C. Butler",
-  booktitle =    icnn,
-  title =        "Characterization of the {Boltzmann} Machine Learning
-                 Rate",
-  volume =       "3",
-  publisher =    "IEEE, New York",
-  address =      "San Diego 1987",
-  pages =        "715--719",
-  year =         "1987",
-}
-
-@Article{Parlos94,
-  author =       "A. G. Parlos and J. Muthusami and A. F. Atiya",
-  title =        "Incipient Fault Detection and Identification in
-                 Process Systems using Accelerated Neural Network
-                 Learning",
-  journal =      "Nuclear Technology",
-  volume =       "105",
-  pages =        "145",
-  year =         "1994",
-}
-
-@Article{Parzen62,
-  author =       "Emanuel Parzen",
-  title =        "On the estimation of a probability density function
-                 and mode",
-  journal =      "Annals of Mathematical Statistics",
-  volume =       "33",
-  pages =        "1064--1076",
-  year =         "1962",
-}
-
-@InProceedings{pati93orthogonal,
-  author =       "Y. Pati and R. Rezaiifar and P. Krishnaprasad",
-  booktitle =    "Proceedings of the 27th Annual Asilomar Conference on
-                 Signals, Systems, and Computers",
-  title =        "Orthogonal Matching Pursuit: Recursive Function
-                 Approximation with Applications to Wavelet
-                 Decomposition",
-  pages =        "40--44",
-  month =        nov,
-  year =         "1993",
-}
-
-@InProceedings{Paugam-Moisy-1992,
-  author =       "H{\'e}l{\`e}ne {Paugam-Moisy}",
-  booktitle =    ijcnn,
-  title =        "On the Convergence of a Block-Gradient Algorithm for
-                 Back-Propagation Learning",
-  volume =       "3",
-  publisher =    "IEEE",
-  address =      "New York",
-  pages =        "919--924",
-  year =         "1992",
-}
-
-@InProceedings{Paugam-Moisy-1992b,
-  author =       "H{\'e}l{\`e}ne {Paugam-Moisy}",
-  booktitle =    "CONPAR '92/ VAPP V: Proceedings of the Second Joint
-                 International Conference on Vector and Parallel
-                 Processing",
-  title =        "Optimal Speedup Conditions for a Parallel
-                 Back-Propagation Algorithm",
-  publisher =    "Springer-Verlag",
-  address =      "London, UK",
-  pages =        "719--724",
-  year =         "1992",
-  ISBN =         "3-540-55895-0",
-}
-
-@InCollection{Paugam-Moisy-1993,
-  author =       "H{\'e}l{\`e}ne {Paugam-Moisy}",
-  editor =       "I. Pitas",
-  booktitle =    "Parallel Algorithms for Digital Image Processing,
-                 Computer Vision and Neural Networks",
-  title =        "Parallel Neural Computing Based on Network
-                 Duplicating",
-  publisher =    "John Wiley",
-  pages =        "305--340",
-  year =         "1993",
-}
-
-@inproceedings{Pavlovic-2001,
- author = {Vladimir Pavlovic and James M. Rehg and John MacCormick},
- title = {Learning Switching Linear Models of Human Motion},
-  editor =       NIPS13ed,
-  booktitle =    NIPS13,
-  publisher =    "{MIT} Press",
-  pages =        "981--987",
-  year =         "2001",
-}
- 
-
-@Book{PdpManual,
-  author =       "D. E. Rumelhart and J. L. McClelland",
-  title =        "Exploration in Parallel Distributed Processing",
-  volume =       "3",
-  publisher =    "MIT Press",
-  year =         "1988",
-}
-
-@InProceedings{Pearl-Verma91,
-  author =       "J. Pearl and T. S. Verma",
-  editor =       "J. A. Allen and R. Fikes and E. Sandewall",
-  booktitle =    "Principles of Knowledge Representation and Reasoning:
-                 Proceedings of the Second International Conference",
-  title =        "A theory of inferred causation",
-  publisher =    "Morgan Kaufmann",
-  address =      "San Mateo, CA",
-  pages =        "441--452",
-  year =         "1991",
-}
-
-@Book{Pearl88,
-  author =       "Judea Pearl",
-  title =        "Probabilistic Reasoning in Intelligent Systems:
-                 Networks of Plausible Inference",
-  publisher =    "Morgan Kaufmann",
-  year =         "1988",
-}
-
-@InProceedings{Pearlmutter+Parra-96,
-  author =       "Barak Pearlmutter and L. C. Parra",
-  editor =       "L. Xu",
-  booktitle =    "International Conference On Neural Information
-                 Processing",
-  title =        "A context-sensitive generalization of {ICA}",
-  address =      "Hong-Kong",
-  pages =        {151--157},
-  year =         "1996",
-}
-
-@InProceedings{Pearlmutter86,
-  author =       "B. A. Pearlmutter and G. E. Hinton",
-  editor =       "J. S. Denker",
-  booktitle =    snowbird,
-  title =        "{G}-Maximization: An Unsupervised Learning Procedure
-                 for Discovering Regularities",
-  publisher =    "American Institute of Physics, New York",
-  address =      "Snowbird 1986",
-  pages =        "333--338",
-  year =         "1986",
-}
-
-@InProceedings{Pearlmutter89a,
-  author =       "B. A. Pearlmutter",
-  booktitle =    ijcnn,
-  title =        "Learning State Space Trajectories in Recurrent Neural
-                 Networks",
-  volume =       "2",
-  publisher =    "IEEE, New York",
-  address =      "Washington 1989",
-  pages =        "365--372",
-  year =         "1989",
-}
-
-@Article{Pearlmutter89b,
-  author =       "B. A. Pearlmutter",
-  title =        "Learning State Space Trajectories in Recurrent Neural
-                 Networks",
-  journal =      nc,
-  volume =       "1",
-  pages =        "263--269",
-  year =         "1989",
-}
-
-@article{Pearson-1901,
-    author = {Pearson, K.},
-    citeulike-article-id = {2013414},
-    journal = {Philosophical Magazine},
-    keywords = {pca},
-    series = {6},
-    number = {11},
-    pages = {559--572},
-    posted-at = {2007-11-29 10:41:36},
-    priority = {2},
-    title = {On lines and planes of closest fit to systems of points in space},
-    volume = {2},
-    year = {1901}
-}
-
-@InProceedings{Pedersen2001,
-  author =       "Ted Pedersen",
-  booktitle =    "Proceedings of the Second Annual Meeting of the North
-                 American Chapter of the Association for Computational
-                 Linguistics",
-  title =        "A decision tree of bigrams is an accurate predictor of
-                 word sense",
-  pages =        "79--86",
-  year =         "2001",
-  URL =          "citeseer.nj.nec.com/pedersen01decision.html",
-}
-
-@InProceedings{Peeling86,
-  author =       "S. M. Peeling and R. K. Moore and M. J. Tomlinson",
-  booktitle =    "Proceedings of the 10th Autumn Conference on Speech
-                 and Hearing",
-  title =        "The Multi-Layer Perceptron as a Tool for Speech
-                 Pattern Processing Research",
-  year =         "1986",
-}
-
-@InProceedings{peng04accurate,
-  author =       "F. Peng and A. McCallum",
-  booktitle =    "Proceedings of Human Language Technology Conference /
-                 North American Chapter of the Association for
-                 Computational Linguistics annual meeting",
-  title =        "Accurate information extraction from research papers
-                 using conditional random fields",
-  pages =        "329--336",
-  year =         "2004",
-}
-
-@InProceedings{Pennacchiotti+Pantel-2006,
-  author =       "Marco Pennacchiotti and Patrick Pantel",
-  booktitle =    "Proceedings of the 21st International Conference on
-                 Computational Linguistics and 44th Annual Meeting of
-                 the ACL",
-  title =        "Ontologizing Semantic Relations",
-  address =      "Sydney",
-  pages =        "793--800",
-  year =         "2006",
-}
-
-@Article{Penrose55,
-  author =       "R. Penrose",
-  title =        "A generalized inverse for matrices",
-  journal =      "Proc. Cambridge Philos. Soc.",
-  volume =       "51",
-  pages =        "406--413",
-  year =         "1955",
-}
-
-@InProceedings{Pereira93,
-  author =       "F. Pereira and N. Tishby and L. Lee",
-  booktitle =    "30th Annual Meeting of the Association for
-                 Computational Linguistics",
-  title =        "Distributional Clustering of English Words",
-  address =      "Columbus, Ohio",
-  pages =        "183--190",
-  year =         "1993",
-}
-
-@InProceedings{Pereira94,
-  author =       "F. Pereira and M. Riley and R. Sproat",
-  booktitle =    "ARPA Natural Language Processing Workshop",
-  title =        "Weighted rational transductions and their application
-                 to human language processing",
-  year =         "1994",
-}
-
-@InCollection{Pereira97,
-  author =       "F. C. N. Pereira and M. D. Riley",
-  editor =       "Emmanuel Roche and Yves Schabes",
-  booktitle =    "Finite-State Language Processing",
-  title =        "Speech recognition by composition of weighted finite
-                 automata",
-  publisher =    "MIT Press",
-  address =      "Cambridge, Massachusetts",
-  pages =        "431--453",
-  year =         "1997",
-}
-
-@Article{Peretto84,
-  author =       "P. Peretto",
-  title =        "Collective Properties of Neural Networks: {A}
-                 Statistical Physics Approach",
-  journal =      biocyb,
-  volume =       "50",
-  pages =        "51--62",
-  year =         "1984",
-}
-
-@InProceedings{Peretto86,
-  author =       "P. Peretto and J. J. Niez",
-  editor =       "E. Bienenstock and F. Fogelman-Souli\'e and G.
-                 Weisbuch",
-  booktitle =    "Disordered Systems and Biological Organization",
-  title =        "Collective Properties of Neural Networks",
-  publisher =    "Springer-Verlag, Berlin",
-  address =      "Les Houches 1985",
-  pages =        "171--185",
-  year =         "1986",
-}
-
-@Article{Peretto88,
-  author =       "P. Peretto",
-  title =        "On Learning Rules and Memory Storage Abilities of
-                 Asymmetrical Neural Networks",
-  journal =      jpp,
-  volume =       "49",
-  pages =        "711--726",
-  year =         "1988",
-}
-
-@InProceedings{Perez+Rendell-1996,
-  author =       "Eduardo P\'erez and Larry A. Rendell",
-  booktitle =    ICML96,
-  editor =       ICML96ed,
-  publisher =    ICML96publ,
-  title =        "Learning Despite Concept Variation by Finding
-                 Structure in Attribute-based Data",
-  pages =        "391--399",
-  year =         "1996",
-}
-
-@Article{Perez75,
-  author =       "R. P\'erez and L. Glass and R. Shlaer",
-  title =        "Development of Specificity in the Cat Visual Cortex",
-  journal =      jmathb,
-  volume =       "1",
-  pages =        "275--288",
-  year =         "1975",
-}
-
-@MISC{Perez98markovrandom,
-  author = {Patrick Perez},
-  title = {Markov Random Fields and Images},
-  year = {1998}
-}
-
-@article{PerpinanM2000,
- author = {Miguel {\'A}. Carreira-Perpi{\~n}{\'a}n and Steve Renals},
- title = {Practical Identifiability of Finite Mixtures of Multivariate Bernoulli Distributions},
- journal = {Neural Computation},
- volume = {12},
- number = {1},
- year = {2000},
- pages = {141--152},
- publisher = {MIT Press},
- address = {Cambridge, MA, USA},
-}
-
-@InProceedings{Perpinan+Hinton-2005,
-  author =       "Miguel {\'A}. Carreira-Perpi{\~n}{\'a}n and Geoffrey E. Hinton",
-  editor =       aistats05ed,
-  booktitle =    aistats05,
-  title =        "On Contrastive Divergence Learning",
-  publisher =    "Society for Artificial Intelligence and Statistics",
-  date =         "Jan 6-8, 2005",
-  location =     "Savannah Hotel, Barbados",
-  pages =        "33--40",
-  year =         "2005",
-}
-
-@Article{Personnaz85,
-  author =       "L. Personnaz and I. Guyon and G. Dreyfus",
-  title =        "Information Storage and Retrieval in Spin-Glass-Like
-                 Neural Networks",
-  journal =      jppl,
-  volume =       "46",
-  pages =        "359--365",
-  year =         "1985",
-}
-
-@Article{Personnaz86,
-  author =       "L. Personnaz and I. Guyon and G. Dreyfus",
-  title =        "Collective Computational Properties of Neural
-                 Networks: New Learning Mechanisms",
-  journal =      prA,
-  volume =       "34",
-  pages =        "4217--4228",
-  year =         "1986",
-}
-
-@Article{Peterson2004,
-  author =       "Gail B. Peterson",
-  title =        "A day of great illumination: {B. F.} {Skinner}'s
-                 discovery of shaping",
-  journal =      "Journal of the Experimental Analysis of Behavior",
-  volume =       "82",
-  number =       "3",
-  pages =        "317--328",
-  year =         "2004",
-}
-
-@Article{Peterson87,
-  author =       "C. Peterson and J. R. Anderson",
-  title =        "A Mean Field Theory Learning Algorithm for Neural
-                 Networks",
-  journal =      cs,
-  volume =       "1",
-  pages =        "995--1019",
-  year =         "1987",
-}
-
-@Article{Peterson89,
-  author =       "C. Peterson and B. S{\"o}derberg",
-  title =        "A New Method for Mapping Optimization Problems onto
-                 Neural Networks",
-  journal =      ijns,
-  volume =       "1",
-  pages =        "3--22",
-  year =         "1989",
-}
-
-@Article{Peterson90,
-  author =       "C. Peterson and S. Redfield and J. D. Keeler and E.
-                 Hartman",
-  title =        "An Optoelectronic Architecture for Multilayer Learning
-                 in a Single Photorefractive Crystal",
-  journal =      nc,
-  volume =       "2",
-  pages =        "25--34",
-  year =         "1990",
-}
-
-@PhdThesis{PhD:Perrone,
-  author =       "Michael P. Perrone",
-  title =        "Improving Regression Estimation: Averaging Methods for
-                 Variance Reduction with Extensions to General Convex
-                 Measure Optimization",
-  school =       "Brown University, Institute for Brain and Neural
-                 Systems",
-  month =        may,
-  year =         "1993",
-}
-
-@Book{Piaget1952,
-  author =       "Jean Piaget",
-  title =        "The origins of intelligence in children",
-  publisher =    "International Universities Press",
-  address =      "New York",
-  year =         "1952",
-}
-
-@Article{Pineda87,
-  author =       "F. J. Pineda",
-  title =        "Generalization of Back-Propagation to Recurrent Neural
-                 Networks",
-  journal =      prl,
-  volume =       "59",
-  pages =        "2229--2232",
-  year =         "1987",
-}
-
-@Article{Pineda88,
-  author =       "F. J. Pineda",
-  title =        "Dynamics and Architecture for Neural Computation",
-  journal =      jcomp,
-  volume =       "4",
-  pages =        "216--245",
-  year =         "1988",
-}
-
-@InProceedings{Pineda88-nips,
-  author =       "F. Pineda",
-  editor =       nips87ed,
-  booktitle =    nips87,
-  title =        "Generalization of Backpropagation to Recurrent and
-                 Higher Order Neural Networks",
-  organization = "American Institute of Physics",
-  address =      "New York, NY",
-  pages =        "602--611",
-  year =         "1988",
-}
-
-@Article{Pineda89,
-  author =       "F. J. Pineda",
-  title =        "Recurrent Back-Propagation and the Dynamical Approach
-                 to Adaptive Neural Computation",
-  journal =      nc,
-  volume =       "1",
-  pages =        "161--172",
-  year =         "1989",
-}
-
-@InCollection{PINN,
-  author =       "P. Frasconi and M. Gori and A. Tesi",
-  editor =       "O. Omidvar",
-  booktitle =    "Progress in Neural Networks",
-  title =        "Successes and Failures of Backpropagation: {A}
-                 Theoretical Investigation",
-  volume =       "5",
-  publisher =    "Ablex Publishing",
-  year =         "1993",
-}
-
-@article{Pinto08,
-  author = {Pinto, Nicolas and Cox, David D. and DiCarlo, James J.},
-  journal = {PLoS Comput Biol},
-  publisher = {Public Library of Science},
-  title = {Why is Real-World Visual Object Recognition Hard?},
-  year = {2008},
-  month = jan,
-  volume = {4},
-  number = {1},
-  pages = {e27},
-}
-
-@inproceedings{Pinto-DiCarlo-2008,
- author = {Nicolas Pinto and James {DiCarlo} and David Cox},
- title = {Establishing Good Benchmarks and Baselines for Face Recognition},
- booktitle = {ECCV 2008 Faces in 'Real-Life' Images Workshop},
- year = 2008,
- address = {Marseille, France},
- editor = {Erik Learned-Miller and Andras Ferencz and Fr{\'e}d{\'e}ric Jurie},
- audience = {internationale},
- URL = {http://hal.inria.fr/inria-00326732/en/},
-}
-
-@article{Pinto-2009,
-  author = {Pinto, Nicolas and Doukhan, David and DiCarlo, James J. and Cox, David D.},
-  journal = {PLoS Comput Biol},
-  publisher = {Public Library of Science},
-  title = {A High-Throughput Screening Approach to Discovering Good Forms of Biologically Inspired Visual Representation},
-  year = {2009},
-  month = nov,
-  volume = {5},
-  number = {11},
-  pages = {e1000579},
-}
-
-@InCollection{Platt2000,
-  author =       "J. Platt",
-  editor =       "A. Smola and P. Bartlett and B. Sch{\"o}lkopf and D.
-                 Schuurmans",
-  booktitle =    "Advances in Large Margin Classifiers",
-  title =        "Probabilities for support vector machines",
-  publisher =    "MIT Press",
-  year =         "2000",
-}
-
-@Article{Platt91,
-  author =       "J. Platt",
-  title =        "A Resource-Allocating Network for Function
-                 Interpolation",
-  journal =      "Neural Computation",
-  volume =       "3",
-  type =         "Letter",
-  number =       "2",
-  pages =        "213--225",
-  year =         "1991",
-}
-
-@InProceedings{Platt94,
-  author =       "R. Wolf and J. Platt",
-  editor =       NIPS6ed,
-  booktitle =    NIPS6,
-  title =        "Postal address block location using a convolutional
-                 locator network",
-  pages =        "745--752",
-  year =         "1994",
-}
-
-@Article{Plaut-csl87,
-  author =       "D. C. Plaut and G. E. Hinton",
-  title =        "Learning Sets of Filters Using Back-propagation",
-  journal =      cspla,
-  volume =       "2",
-  pages =        "35--61",
-  year =         "1987",
-}
-
-@TechReport{Plaut86,
-  author =       "D. Plaut and S. Nowlan and G. Hinton",
-  title =        "Experiments on Learning by Back-Propagation",
-  number =       "CMU--CS--86--126",
-  institution =  "Department of Computer Science, Carnegie Mellon
-                 University",
-  address =      "Pittsburgh, PA",
-  year =         "1986",
-}
-
-@Article{PLS-Frank-Friedman,
-  author =       "Ildiko E. Frank and Jerome H. Friedman",
-  title =        "A statistical view of some chemometrics regression
-                 tools",
-  journal =      "Technometrics",
-  volume =       "35",
-  number =       "2",
-  pages =        "109--148",
-  year =         "1993",
-}
-
-@Article{Podder-2006,
-  author =       "M. Podder and W. J. Welch and R. H. Zamar and S. J.
-                 Tebbutt",
-  title =        "Dynamic Variable Selection in {SNP} Genotype
-                 Autocalling from {APEX} Microarray Data",
-  journal =      "BMC Bioinformatics",
-  note =         "In revision",
-  year =         "2006",
-}
-
-@Article{Poggio-ieee90,
-  author =       "T. Poggio and F. Girosi",
-  title =        "Networks for Approximation and Learning",
-  journal =      ieeeproc,
-  volume =       "78",
-  number =       "9",
-  pages =        "1481--1497",
-  year =         "1990",
-}
-
-@Article{Poggio75,
-  author =       "T. Poggio",
-  title =        "On Optimal NonLinear Associative Recall",
-  journal =      biocyb,
-  volume =       "19",
-  pages =        "201",
-  year =         "1975",
-}
-
-@Article{Poggio85,
-  author =       "T. Poggio and V. Torre and C. Koch",
-  title =        "Computational Vision and Regularization Theory",
-  journal =      "Nature",
-  volume =       "317",
-  number =       "6035",
-  pages =        "314--319",
-  month =        sep,
-  year =         "1985",
-}
-
-@TechReport{Poggio89,
-  author =       "T. Poggio and F. Girosi",
-  title =        "A theory of networks for approximation and learning",
-  number =       "1140",
-  institution =  "MIT AI Laboratory",
-  address =      "Cambridge, MA",
-  year =         "1989",
-}
-
-@Article{Poggio90,
-  author =       "T. Poggio and F. Girosi",
-  title =        "Regularization Algorithms for Learning That Are
-                 Equivalent to Multilayer Networks",
-  journal =      science,
-  volume =       "247",
-  pages =        "978--982",
-  year =         "1990",
-}
-
-@Article{Pollack90,
-  author =       "Jordan B. Pollack",
-  title =        "Recursive Distributed Representations",
-  journal =      "Artificial Intelligence",
-  volume =       "46",
-  number =       "1",
-  pages =        "77--105",
-  year =         "1990",
-}
-
-@Article{Pollack91,
-  author =       "Jordan B. Pollack",
-  title =        "The Induction of Dynamical Recognizers",
-  journal =      mlearn,
-  volume =       "7",
-  number =       "2",
-  pages =        "196--227",
-  year =         "1991",
-}
-
-@Book{Pollard84,
-  author =       "D. Pollard",
-  title =        "Convergence of stochastic processes",
-  publisher =    "Springer-Verlag",
-  address =      "New York, NY",
-  year =         "1984",
-}
-
-@InProceedings{Pollit91,
-  author =       "M. D. Pollit and J. Peck",
-  booktitle =    "Proc. 2nd Canadian Conf. on Computer Applications in
-                 the Mineral Industry",
-  title =        "Recent advances in lithological recognition based on
-                 rotary blasthole drill responses",
-  address =      "Vancouver, Canada",
-  year =         "1991",
-}
-
-@InProceedings{Pomerleau89,
-  author =       "D. A. Pomerleau",
-  editor =       NIPS1ed,
-  booktitle =    NIPS1,
-  title =        "{ALVINN}: An Autonomous Land Vehicle in a Neural
-                 Network",
-  publisher =    "Morgan Kaufmann, San Mateo",
-  address =      "Denver, CO",
-  pages =        "305--313",
-  year =         "1989",
-}
-
-@TechReport{Pontil98,
-  author =       "M. Pontil and A. Verri",
-  title =        "Properties of Support Vector Machines",
-  number =       "AI Memo 1612",
-  institution =  "MIT",
-  year =         "1998",
-}
-
-@InProceedings{Poritz88,
-  author =       "A. B. Poritz",
-  booktitle =    "Proc. Int. Conf. Acoustics, Speech, and Signal
-                 Processing",
-  title =        "Hidden {Markov} models: a guided tour",
-  pages =        "7--13",
-  year =         "1988",
-}
-
-@InProceedings{Poston,
-  author =       "T. Poston and C. Lee and Y. Choie and Y. Kwon",
-  booktitle =    "Proc. of the IEEE-IJCNN91",
-  title =        "Local minima and Backpropagation",
-  address =      "Seattle, WA",
-  pages =        "173--176",
-  year =         "1991",
-}
-
-@InProceedings{Poston-ijcnn91,
-  author =       "T. Poston and C. Lee and Y. Choie and Y. Kwon",
-  booktitle =    ijcnn,
-  title =        "Local Minima and Backpropagation",
-  publisher =    "IEEE Press",
-  address =      "Seattle WA",
-  pages =        "173--176",
-  year =         "1991",
-}
-
-@Article{Poterba+Summers,
-  author =       "J. M. Poterba and L. H. Summers",
-  title =        "Mean Reversion in Stock Prices",
-  journal =      "Journal of Financial Economics",
-  volume =       "22",
-  pages =        "27--59",
-  year =         "1988",
-}
-
-@Article{potvin:1995:orsajc,
-  author =       "J.-Y. Potvin and S. Bengio",
-  title =        "The Vehicle Routing Problem with Time Windows - Part
-                 {II}: Genetic Search",
-  journal =      "{ORSA} Journal on Computing",
-  year =         "1995",
-}
-
-@InCollection{powell87radial,
-  author =       "M. J. D. Powell",
-  editor =       "J. C. Mason and M. G. Cox",
-  booktitle =    "Algorithms for Approximation of Functions and Data",
-  title =        "Radial basis functions for multivariable
-                 interpolation: {A} review",
-  publisher =    "Oxford University Press",
-  pages =        "143--167",
-  year =         "1987",
-  text =         "M. J. D. Powell. Radial basis functions for
-                 multivariable interpolation: A review. In J. C. Mason
-                 and M. G. Cox, editors, Algorithms for Approximation of
-                 Functions and Data, pages 143--167. Oxford University
-                 Press, 1987.",
-}
-
-@InProceedings{Pratt+Kamm91,
-  author =       "L. Y. Pratt and C. A. Kamm",
-  booktitle =    ijcnn,
-  title =        "Improving a phoneme classification neural network
-                 through problem decomposition",
-  volume =       "2",
-  publisher =    "IEEE Press",
-  address =      "Seattle WA",
-  pages =        "821--826",
-  year =         "1991",
-}
-
-@InProceedings{pratt93,
-  author =       "Lorien Y. Pratt",
-  editor =       NIPS5ed,
-  booktitle =    NIPS5,
-  title =        "Discriminability-Based Transfer between Neural
-                 Networks",
-  publisher =    "Morgan Kaufmann",
-  address =      "San Mateo, CA",
-  pages =        "204--211",
-  year =         "1993",
-}
-
-@Article{Presnell93,
-  author =       "S. R. Presnell and F. E. Cohen",
-  title =        "Artificial neural networks for pattern recognition in
-                 biochemical sequences",
-  journal =      "Ann. Rev. Biophys. Biomol. Struct.",
-  volume =       "22",
-  pages =        "283--298",
-  year =         "1993",
-}
-
-@Book{Press86,
-  author =       "W. H. Press and B. P. Flannery and S. A. Teukolsky and
-                 W. T. Vetterling",
-  title =        "Numerical Recipes",
-  publisher =    "Cambridge University Press",
-  address =      "Cambridge",
-  year =         "1986",
-}
-
-@Book{Press92,
-  author =       "W. H. Press and S. A. Teukolsky and W. T. Vetterling
-                 and B. P. Flannery",
-  title =        "Numerical Recipes in {C}: The art of scientific
-                 computing (2nd ed.)",
-  publisher =    "Cambridge University Press",
-  address =      "Cambridge",
-  year =         "1992",
-}
-
-@article{Priebe2005,
- author = {C. E. Priebe and J. M. Conroy and D. J. Marchette and Y. Park},
- title = {Scan Statistics on {Enron} Graphs},
- journal = {Computational and Mathematical Organization Theory},
- volume = 11,
- number = 3,
- pages = {229--247},
- month = oct,
- year = 2005,
- publisher = {Springer},
-}
-
-@Book{Priestley81,
-  author =       "M. B. Priestley",
-  title =        "Spectral Analysis and Time Series, Vol.1: Univariate
-                 Series",
-  publisher =    "Academic Press",
-  year =         "1981",
-}
-
-@Article{Principe92,
-  author =       "B. {de Vries} and J. C. Principe",
-  title =        "The gamma model -- {A} new neural net model for
-                 temporal processing",
-  journal =      nn,
-  volume =       "5",
-  pages =        "565--576",
-  year =         "1992",
-  OPTnote =      "",
-}
-
-@Article{Psa88a,
-  author =       "D. Psaltis and C. H. Park and J. Hong",
-  title =        "Higher Order Associative Memories and Their Optical
-                 Implementations",
-  journal =      "Neural Networks",
-  volume =       "1",
-  number =       "2",
-  pages =        "149--163",
-  year =         "1988",
-}
-
-@InProceedings{Psaltis89,
-  author =       "D. Psaltis and D. Brady and K. Hsu",
-  booktitle =    ijcnn,
-  title =        "Learning in optical neural computers",
-  volume =       "2",
-  address =      "Washington D.C.",
-  pages =        "72--75",
-  year =         "1989",
-}
-
-@TechReport{publication-an,
-  author =       "Tomaso Poggio and Federico Girosi",
-  title =        "An Equivalence Between Sparse Approximation and
-                 Support Vector Machines",
-}
-
-@TechReport{publication-notes,
-  author =       "Tomaso Poggio and Federico Girosi",
-  title =        "Notes on {PCA}, Regularization, Sparsity and Support
-                 Vector Machines",
-}
-
-@Article{Qian+Sejnowski88,
-  author =       "N. Qian and T. J. Sejnowski",
-  title =        "Predicting the secondary structure of globular
-                 proteins using neural network models",
-  journal =      "Journal of Molecular Biology",
-  volume =       "202",
-  pages =        "865--884",
-  year =         "1988",
-}
-
-@Article{Qian88a,
-  author =       "N. Qian and T. J. Sejnowski",
-  title =        "Predicting the Secondary Structure of Globular
-                 Proteins Using Neural Network Models",
-  journal =      jmolecb,
-  volume =       "202",
-  pages =        "865--884",
-  year =         "1988",
-}
-
-@InProceedings{Qian88b,
-  author =       "N. Qian and T. J. Sejnowski",
-  editor =       "D. Touretzky and G. Hinton and T. Sejnowski",
-  booktitle =    cmss88,
-  title =        "Learning to Solve Random-Dot Stereograms of Dense
-                 Transparent Surfaces with Recurrent Back-Propagation",
-  publisher =    "Morgan Kaufmann, San Mateo",
-  address =      "Pittsburgh 1988",
-  pages =        "435--443",
-  year =         "1988",
-}
-
-@Article{quantiles-nc-2002,
-  author =       "Ichiro Takeuchi and Yoshua Bengio and Takafumi
-                 Kanamori",
-  title =        "Robust Regression with Asymmetric Heavy-Tail Noise Distributions",
-  journal =      "Neural Computation",
-  volume =       "14",
-  number =       "10",
-  pages =        "2469--2496",
-  year =         "2002",
-}
-
-@TechReport{quantiles-TR,
-  author =       "Ichiro Takeuchi and Yoshua Bengio and Takafumi
-                 Kanamori",
-  title =        "Robust Regression with Asymmetric Heavy-Tail Noise",
-  number =       "1198",
-  institution =  "Dept. IRO, Universit\'e de Montr\'eal",
-  year =         "2001",
-}
-
-@Article{Quinlan+Rivest89,
-  author =       "J. Ross Quinlan and Ronald L. Rivest",
-  title =        "Inferring Decision Trees Using the Minimum Description
-                 Length Principle",
-  journal =      "Information and Computation",
-  volume =       "80",
-  pages =        "227--248",
-  year =         "1989",
-}
-
-@Article{Quinlan86,
-  author =       "J. Ross Quinlan",
-  title =        "Induction of Decision Trees",
-  journal =      "Machine Learning",
-  volume =       "1",
-  number =       "1",
-  pages =        "81--106",
-  year =         "1986",
-}
-
-@Book{Quinlan93,
-  author =       "J. Ross Quinlan",
-  title =        "{C4}.5: Programs for Machine Learning",
-  publisher =    "Morgan Kaufmann",
-  year =         "1993",
-}
-
-@Book{Rabiner+Gold75,
-  author =       "L. R. Rabiner and B. Gold",
-  title =        "Theory and application of digital signal processing",
-  publisher =    "Prentice-Hall",
-  year =         "1975",
-}
-
-@Article{Rabiner85,
-  author =       "L. R. Rabiner and S. E. Levinson",
-  title =        "A speaker-independent, syntax-directed, connected word
-                 recognition system based on hidden {Markov} models and
-                 level building",
-  journal =      ieeetassp,
-  volume =       "33",
-  number =       "3",
-  pages =        "561--573",
-  year =         "1985",
-}
-
-@Article{Rabiner86,
-  author =       "L. R. Rabiner and B. H. Juang",
-  title =        "An Introduction to Hidden {Markov} Models",
-  journal =      ieeeassp,
-  volume =       "3",
-  number =       "1",
-  pages =        "4--16",
-  month =        jan,
-  year =         "1986",
-}
-
-@Article{Rabiner89,
-  author =       "L. R. Rabiner",
-  title =        "A Tutorial on Hidden {Markov} Models and Selected
-                 Applications in Speech Recognition",
-  journal =      "Proceedings of the IEEE",
-  volume =       "77",
-  number =       "2",
-  pages =        "257--286",
-  year =         "1989",
-}
-
-@Article{Raetsch-2002,
-  author =       "Gunnar R{\"a}tsch and Ayhan Demiriz and Kristin P. Bennett",
-  title =        "Sparse Regression Ensembles in Infinite and Finite
-                 Hypothesis Spaces",
-  journal =      "Machine Learning",
-  volume =       "48",
-  number =       "1--3",
-  pages =        "189--218",
-  publisher =    "Kluwer Academic Publishers",
-  year =         "2002",
-}
-
-@InCollection{Raftery1996,
-  author =       "A. Raftery",
-  editor =       "W. R. Gilks and S. Richardson and D. J. Spiegelhalter",
-  booktitle =    "Markov Chain Monte Carlo in Practice",
-  title =        "Hypothesis Testing and Model Selection",
-  publisher =    "Chapman and Hall",
-  pages =        "163--188",
-  year =         "1996",
-}
-
-
-@inproceedings{RaginskyM2008,
-  author    = {Maxim Raginsky and
-               Svetlana Lazebnik and
-               Rebecca Willett and
-               Jorge Silva},
-  title     = {Near-minimax recursive density estimation on the binary
-               hypercube},
-  editor    = NIPS20ed,
-  booktitle = NIPS20,
-  year      = {2008},
-  pages     = {1305--1312},
-}
-
-@INPROCEEDINGS{RainaR2003,
-    author = {Rajat Raina and Yirong Shen and Andrew Y. Ng and Andrew McCallum},
-    title = {Classification with hybrid generative/discriminative models},
-    editor = NIPS16ed,
-    booktitle = NIPS16,
-    year = {2003},
-    publisher = {MIT Press}
-}
-
-@Misc{raina+ng+koller-workshop-2005,
-  author =       "Rajat Raina and Andrew Y. Ng and Daphne Koller",
-  title =        "Transfer Learning by Constructing Informative Priors",
-  howpublished = "'Inductive Transfer: 10 Years Later' NIPS Workshop",
-  year =         "2005",
-  OPTkey =       "",
-}
-
-@InProceedings{RainaR2007,
-  author =       "Rajat Raina and Alexis Battle and Honglak Lee and
-                 Benjamin Packer and Andrew Y. Ng",
-  booktitle =    ICML07,
-  editor =       ICML07ed,
-  publisher =    ICML07publ,
-  title =        "Self-taught learning: transfer learning from unlabeled
-                 data",
-  pages =        "759--766",
-  year =         "2007",
-  bibsource =    "DBLP, http://dblp.uni-trier.de",
-  ee =           "http://doi.acm.org/10.1145/1273496.1273592",
-}
-
-@InProceedings{RainaR2007-small,
-  author =       "R. Raina and A. Battle and H. Lee and B. Packer and A.
-                 Y. Ng",
-  booktitle =    "ICML 2007",
-  title =        "Self-taught learning: transfer learning from unlabeled
-                 data",
-  year =         "2007",
-}
-
-@inproceedings{RainaICML09,
-  author = {Raina, Rajat and Madhavan, Anand and Ng, Andrew Y.},
-  title = {Large-scale deep unsupervised learning using graphics processors},
-  booktitle = ICML09,
-  editor =  ICML09ed,
-  publisher = ICML09publ,
-  year = {2009},
-  isbn = {978-1-60558-516-1},
-  pages = {873--880},
-  location = {Montreal, Quebec, Canada},
-  doi = {10.1145/1553374.1553486},
-  address = {New York, NY, USA},
-}
-
-@InProceedings{Ramanujam88,
-  author =       "J. Ramanujam and P. Sadayappan",
-  booktitle =    icnn,
-  title =        "Optimization by Neural Networks",
-  volume =       "2",
-  publisher =    "IEEE, New York",
-  address =      "San Diego 1988",
-  pages =        "325--332",
-  year =         "1988",
-}
-
-@InProceedings{ranzato-07,
-  author =       "{Marc'Aurelio} Ranzato and Christopher Poultney and
-                 Sumit Chopra and Yann {LeCun}",
-  editor =       NIPS19ed,
-  booktitle =    NIPS19,
-  title =        "Efficient Learning of Sparse Representations with an
-                 Energy-Based Model",
-  publisher =    "MIT Press",
-  pages = {1137--1144},
-  year =         "2007",
-}
-
-@InProceedings{ranzato-07-small,
-  author =       "M. Ranzato and C. Poultney and
-                 S. Chopra and Y. {LeCun}",
-  booktitle =    "NIPS 19",
-  title =        "Efficient Learning of Sparse Representations with an
-                 Energy-Based Model",
-  year =         "2007",
-}
-
-@InProceedings{ranzato-07-short,
-  author =       "M. Ranzato and C. Poultney and
-                 S. Chopra and Y. {LeCun}",
-  booktitle =    "Adv. Neural Inf. Proc. Sys. 19",
-  title =        "Efficient Learning of Sparse Representations with an
-                 Energy-Based Model",
-  pages = {1137--1144},
-  year =         "2007",
-}
-
-# Please do NOT use this citation as it is a duplicate of ranzato-07
-@InCollection{ranzato-06,
-  author =       "{Marc'Aurelio} Ranzato and Christopher Poultney and
-                 Sumit Chopra and Yann {LeCun}",
-  editor =       NIPS19ed,
-  booktitle =    NIPS19,
-  title =        "Efficient Learning of Sparse Representations with an
-                 Energy-Based Model",
-  publisher =    "{MIT} Press",
-  pages =        "1137--1144",
-  year =         "2007",
-}
-
-# Please do NOT use this citation as it is a duplicate of ranzato-07-small
-@InCollection{ranzato-06-small,
-  author =       "M. Ranzato and C. Poultney and
-                 S. Chopra and Y. {LeCun}",
-  booktitle =    "NIPS 19",
-  title =        "Efficient Learning of Sparse Representations with an
-                 Energy-Based Model",
-  year =         "2007",
-}
-
-
-@InProceedings{ranzato-08,
-  author =       "{Marc'Aurelio} Ranzato and Y-Lan Boureau and Yann
-                 {LeCun}",
-  editor =       NIPS20ed,
-  booktitle =    NIPS20,
-  title =        "Sparse feature learning for deep belief networks",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  pages =        "1185--1192",
-  year =         "2008",
-}
-  %url =          "http://www.cs.nyu.edu/~ranzato/publications/ranzato-nips07.pdf",
-
-@InProceedings{ranzato-08-small,
-  author =       "M. Ranzato and Y. Boureau and Y. {LeCun}",
-  booktitle =    "NIPS 20",
-  title =        "Sparse feature learning for deep belief networks",
-  year =         "2008",
-}
-
-@InProceedings{ranzato-08-short,
-  author =       "M. Ranzato and Y. Boureau and Y. {LeCun}",
-  booktitle =    "Adv. Neural Inf. Proc. Sys. 20",
-  title =        "Sparse feature learning for deep belief networks",
-  year =         "2008",
-  pages = {1185--1192},
-}
-
-@InProceedings{ranzato-cvpr-07,
-  author =       "{Marc'Aurelio} Ranzato and {Fu-Jie} Huang and {Y-Lan}
-                 Boureau and Yann {LeCun}",
-  booktitle =    cvpr07,
-  title =        "Unsupervised Learning of Invariant Feature Hierarchies
-                 with Applications to Object Recognition",
-  publisher =    "IEEE Press",
-  year =         "2007",
-  original =     "orig/ranzato-cvpr-07.pdf",
-}
-
-@InProceedings{ranzato-cvpr-07-small,
-  author =       "{Marc'Aurelio} Ranzato and {Fu-Jie} Huang and {Y-Lan}
-                 Boureau and Yann {LeCun}",
-  booktitle =    "CVPR'07",
-  title =        "Unsupervised Learning of Invariant Feature Hierarchies
-                 with Applications to Object Recognition",
-  year =         "2007",
-  original =     "orig/ranzato-cvpr-07.pdf",
-}
-
-@InProceedings{Ranzato-icdar07,
-  author =       "{Marc'Aurelio} Ranzato and Yann {LeCun}",
-  booktitle =    ICDAR07,
-  title =        "A Sparse and Locally Shift Invariant Feature Extractor
-                 Applied to Document Images",
-  year =         "2007",
-  isbn =         {0-7695-2822-8},
-  pages =        {1213--1217},
-  publisher =    {IEEE Computer Society},
-  address =      {Washington, DC, USA},
-
-}
-
-@InProceedings{ranzato-unsup-07,
-  author =       "{Marc'Aurelio} Ranzato and {Y-Lan} Boureau and Sumit
-                 Chopra and Yann {LeCun}",
-  booktitle =    aistats07,
-  title =        "A Unified Energy-Based Framework for Unsupervised
-                 Learning",
-  publisher =    "Omnipress",
-  date =         "March 21-24, 2007",
-  address =      "San Juan, Puerto Rico",
-  year =         "2007",
-}
-
-@InProceedings{Rao+Ruderman-99,
-  author =       "R. P. N. Rao and D. L. Ruderman",
-  editor =       NIPS11ed,
-  booktitle =    NIPS11,
-  title =        "Learning {Lie} Groups for Invariant Visual
-                 Perception",
-  publisher =    "MIT Press, Cambridge, MA",
-  pages =        "810--816",
-  year =         "1999",
-}
-
-@Book{Rao71,
-  author =       "C. R. Rao and S. K. Mitra",
-  title =        "Generalized Inverse of Matrices and Its Applications",
-  publisher =    "Wiley",
-  address =      "New York",
-  year =         "1971",
-}
-
-@Book{Rashevsky38,
-  author =       "N. Rashevsky",
-  title =        "Mathematical Biophysics",
-  publisher =    "University of Chicago Press",
-  address =      "Chicago",
-  year =         "1938",
-}
-
-@InProceedings{RasmussenC2000,
-  author =       "Carl Rasmussen",
-  editor =       NIPS12ed,
-  booktitle =    NIPS12,
-  title =        "The Infinite {G}aussian Mixture Model",
-  year =         "2000",
-}
-
-@Misc{Rasmussen2001,
-  author =       "Carl Edward Rasmussen",
-  title =        "Conjugate gradient for Matlab",
-  year =         "2001",
-  note =         "http://www.kyb.tuebingen.mpg.de/bs/people/carl/code/minimize/",
-}
-
-@Article{Ratnaparkhi99,
-  author =       "A. Ratnaparkhi",
-  title =        "Learning to parse natural language with maximum
-                 entropy models",
-  journal =      "Machine Learning",
-  volume =       "34",
-  number =       "1--3",
-  pages =        "151--175",
-  year =         "1999",
-}
-
-@Article{Rauch63,
-  author =       "H. E. Rauch",
-  title =        "Solutions to the linear smoothing problem",
-  journal =      "IEEE Transactions on Automatic Control",
-  volume =       "8",
-  pages =        "371--372",
-  year =         "1963",
-}
-
-@Article{Refenes-94,
-  author =       "A. N. Refenes",
-  title =        "Stock Performance Modeling Using Neural Networks: a
-                 Comparative Study with Regression Models",
-  journal =      "Neural Networks",
-  volume =       "7",
-  number =       "2",
-  pages =        "375--388",
-  year =         "1994",
-}
-
-@Article{regression-KB-78,
-  author =       "R. Koenker and G. Bassett Jr.",
-  title =        "Regression Quantiles",
-  journal =      "Econometrica",
-  volume =       "46",
-  number =       "1",
-  pages =        "33--50",
-  year =         "1978",
-}
-@inproceedings{reid:1989,
-    title = {Rapid Training of Higher-Order Neural Networks for Invariant Pattern
-        Recognition},
-    author = {Reid, M. B. and Spirkovska, L. and Ochoa, E.},
-    booktitle = ijcnn,
-    month   = jun,
-    year    = {1989},
-    address = {Washington, DC, USA},
-}
-
-@InCollection{Rescorla72,
-  author =       "R. A. Rescorla and A. R. Wagner",
-  editor =       "A. H. Black and W. F. Prokasy",
-  booktitle =    "Classical Conditioning II: Current Research and
-                 Theory",
-  title =        "A Theory of Pavlovian Conditioning: The Effectiveness
-                 of Reinforcement and Nonreinforcement",
-  publisher =    "Appleton-Century-Crofts",
-  address =      "New York",
-  pages =        "64--69",
-  year =         "1972",
-}
-
-@InProceedings{Resnik-2002,
-  author =       "Mona Diab and Philip Resnik",
-  booktitle =    "40th Annual Meeting of the {ACL}",
-  title =        "An unsupervised method for word sense tagging using
-                 parallel corpora",
-  year =         "2002",
-}
-
-@Article{Resnik-99,
-  author =       "Philip Resnik",
-  title =        "Semantic similarity in a taxonomy: an
-                 information-based measure and its application to
-                 problems of ambiguity in natural language",
-  journal =      "Journal of Artificial Intelligence Research",
-  volume =       "11",
-  pages =        "95--130",
-  year =         "1999",
-}
-
-@InProceedings{Resnik-99-web,
-  author =       "P. Resnik",
-  booktitle =    "37th Annual Meeting of the Association for
-                 Computational Linguistics (ACL'99)",
-  title =        "Mining the Web for Bilingual Text",
-  address =      "College Park, Maryland",
-  month =        jun,
-  year =         "1999",
-}
-
-@article{Rhodes-2008,
- author = {Paul Rhodes},
- title = {Recoding Patterns of Sensory Input: Higher-Order Features and the Function of Nonlinear Dendritic Trees},
- journal = {Neural Computation},
- volume = 20,
- number=8,
- pages = {2000--2036},
- year = 2008,
-}
-
-@Article{RicLip91,
-  author =       "Michael D. Richard and Richard P. Lippmann",
-  title =        "Neural Network Classifiers Estimate {Bayesian}
-                 a-posteriori Probabilities",
-  journal =      "Neural Computation",
-  volume =       "3",
-  pages =        "461--483",
-  year =         "1991",
-  abstract =     "Theoretical argumentation under which circumstances
-                 nets can estimate correctly and what this means for
-                 network engineering methodology. Experimental
-                 evaluations with different cost functions (mean squared
-                 error, cross entropy, normalized likelihood) and
-                 network types (multi layer perceptron, radial basis
-                 function, high order polynomial) show how accuracy
-                 degrades with insufficient data or inadequate network
-                 size. Discusses practical consequences. Contains
-                 references to work on other cost functions (e.g.
-                 information measures)",
-  class =        "nn, learning, theory",
-}
-
-@InProceedings{Ricotti88,
-  author =       "L. P. Ricotti and S. Ragazzini and G. Martinelli",
-  booktitle =    icnn,
-  title =        "Learning of Word Stress in a Sub-Optimal Second Order
-                 Back-Propagation Neural Network",
-  volume =       "1",
-  publisher =    "IEEE, New York",
-  address =      "San Diego 1988",
-  pages =        "355--361",
-  year =         "1988",
-}
-
-@Article{Riedel88,
-  author =       "U. Riedel and R. K{\"u}hn and J. L. van Hemmen",
-  title =        "Temporal Sequences and Chaos in Neural Nets",
-  journal =      prA,
-  volume =       "38",
-  pages =        "1105--1108",
-  year =         "1988",
-}
-
-@Article{Riis96,
-  author =       "S. K. Riis and A. Krogh",
-  title =        "Improving prediction of protein secondary structure
-                 using structured neural networks and multiple sequence
-                 alignments",
-  journal =      "J. Comput. Biol.",
-  volume =       "3",
-  pages =        "163--183",
-  year =         "1996",
-}
-
-@Article{RiisKrogh1996,
-  author =       "S. Riis and A. Krogh",
-  title =        "Improving prediction of protein secondary structure
-                 using structured neural networks and multiple sequence
-                 alignments",
-  journal =      "Journal of Computational Biology",
-  volume =       "3",
-  pages =        "163--183",
-  year =         "1996",
-}
-
-@TechReport{Riley94,
-  author =       "M. D. Riley and F. C. N. Pereira",
-  title =        "Weighted-finite-automata tools with applications to
-                 speech and language processing",
-  number =       "Technical Memorandum 11222-931130-28TM",
-  institution =  "AT\&T Bell Laboratories",
-  year =         "1994",
-}
-
-@article{Rissanen79, 
- author = {J.J. Rissanen and G.G. Langdon Jr.},
- title = {Arithmetic coding},
- journal = {IBM Journal of Research and Development},
- volume = 23, 
- number = 2,
- pages = {149--162},
- year = 1979,
-}
-
-@Article{rissanen83,
-  author =       "J.J. Rissanen",
-  title =        "A universal data compression system",
-  journal =      "IEEE Transactions on Information Theory",
-  volume =       "29",
-  pages =        "656--664",
-  year =         "1983",
-}
-
-@Article{Rissanen86,
-  author =       "J. Rissanen",
-  title =        "Stochastic complexity and modeling",
-  journal =      "Annals of Statistics",
-  volume =       "14",
-  pages =        "1080--1100",
-  year =         "1986",
-}
-
-@Book{RissanenBook,
-  author =       "J. Rissanen",
-  title =        "Stochastic Complexity in Statistical Inquiry",
-  publisher =    "World Scientific",
-  address =      "Singapore",
-  year =         "1990",
-}
-
-@Article{Ritter86,
-  author =       "H. Ritter and K. Schulten",
-  title =        "On the Stationary State of Kohonen's Self-Organizing
-                 Sensory Mapping",
-  journal =      biocyb,
-  volume =       "54",
-  pages =        "99--106",
-  year =         "1986",
-}
-
-@InProceedings{Ritter88a,
-  author =       "H. Ritter and K. Schulten",
-  editor =       "R. Eckmiller and Ch. von der Malsburg",
-  booktitle =    "Neural Computers",
-  title =        "Extending Kohonen's Self-Organizing Mapping Algorithm
-                 to Learn Ballistic Movements",
-  publisher =    "Springer-Verlag, Berlin",
-  address =      "Neuss 1987",
-  pages =        "393--406",
-  year =         "1988",
-}
-
-@Article{Ritter88b,
-  author =       "H. Ritter and K. Schulten",
-  title =        "Convergence Properties of Kohonen's Topology
-                 Conserving Maps: Fluctuations, Stability, and Dimension
-                 Selection",
-  journal =      biocyb,
-  volume =       "60",
-  pages =        "59--71",
-  year =         "1988",
-}
-
-@InProceedings{Ritter88c,
-  author =       "H. Ritter and K. Schulten",
-  booktitle =    icnn,
-  title =        "Kohonen's Self-Organizing Maps: Exploring Their
-                 Computational Capabilities",
-  volume =       "1",
-  publisher =    "IEEE, New York",
-  address =      "San Diego 1988",
-  pages =        "109--116",
-  year =         "1988",
-}
-
-@Book{Robert-1999,
-  author =       "Christian P. Robert and George Casella",
-  title =        "Monte Carlo Statistical Methods",
-  publisher =    "Springer",
-  year =         "1999",
-}
-
-@TechReport{Robinson+Fallside90,
-  author =       "A. J. Robinson and F. Fallside",
-  key =          "Robinson",
-  title =        "Phoneme recognition from the {TIMIT} database using
-                 recurrent error propagation networks",
-  type =         "Technical Report",
-  number =       "{CUED/F-INFENG/TR.42}",
-  institution =  "Cambridge University Engineering Department",
-  year =         "1990",
-}
-
-@Article{Robinson+Fallside91,
-  author =       "A. J. Robinson and F. Fallside",
-  title =        "A recurrent error propagation network speech
-                 recognition system",
-  journal =      "Computer Speech and Language",
-  volume =       "5",
-  number =       "3",
-  pages =        "259--274",
-  month =        jul,
-  year =         "1991",
-}
-
-@InProceedings{Robinson88,
-  author =       "A. J. Robinson and F. Fallside",
-  editor =       nips87ed,
-  booktitle =    nips87,
-  title =        "Static and Dynamic Error Propagation Networks with
-                 Application to Speech Coding",
-  publisher =    "American Institute of Physics, New York",
-  address =      "Denver, CO",
-  pages =        "632--641",
-  year =         "1988",
-}
-
-@Article{Robinson91,
-  author =       "T. Robinson and F. Fallside",
-  title =        "Recurrent Error Propagation Network Speech Recognition
-                 System",
-  journal =      cspla,
-  volume =       "5",
-  number =       "3",
-  pages =        "259--274",
-  month =        jul,
-  year =         "1991",
-}
-
-@InProceedings{Robinson92-icassp,
-  author =       "T. Robinson",
-  booktitle =    icassp,
-  title =        "A Real-Time Recurrent Error Propagation Network Word
-                 Recognition System",
-  volume =       "I",
-  pages =        "617--620",
-  year =         "1992",
-}
-
-@Article{robust-H-73,
-  author =       "P. J. Huber",
-  title =        "Robust regression: Asymptotics, Conjectures and
-                 {Monte} {Carlo}",
-  journal =      "Ann. Stat.",
-  volume =       "1",
-  pages =        "799--821",
-  year =         "1973",
-}
-
-@Book{robust-H-82,
-  author =       "P. J. Huber",
-  title =        "Robust Statistics",
-  publisher =    "John Wiley \& Sons Inc.",
-  year =         "1982",
-}
-
-@Book{robust-HRRS-86,
-  author =       "F. R. Hampel and E. M. Ronchetti and P. J. Rousseeuw
-                 and W. A. Stahel",
-  title =        "Robust Statistics, The Approach based on Influence
-                 Functions",
-  publisher =    "John Wiley \& Sons",
-  year =         "1986",
-}
-
-@TechReport{robust-RAD-00,
-  author =       "P. J. Rousseeuw and S. V. Aelst and K. V. Driessen",
-  title =        "Robust Multivariate Regression",
-  institution =  "University of Antwerp",
-  year =         "2000",
-}
-
-@Book{robust-RL-87,
-  author =       "P. J. Rousseeuw and A. M. Leroy",
-  title =        "Robust Regression and Outlier Detection",
-  publisher =    "John Wiley \& Sons Inc.",
-  year =         "1987",
-}
-
-@InProceedings{Rohwer-nips90,
-  author =       "R. Rohwer",
-  editor =       NIPS2ed,
-  booktitle =    NIPS2,
-  title =        "The `Moving Targets' Training Algorithm",
-  publisher =    "Morgan Kaufmann",
-  address =      "San Mateo, CA",
-  pages =        "558--565",
-  year =         "1990",
-}
-
-@InProceedings{Rohwer87,
-  author =       "R. Rohwer and B. Forrest",
-  editor =       "M. Caudill and C. Butler",
-  booktitle =    icnn,
-  title =        "Training Time-Dependence in Neural Networks",
-  volume =       "2",
-  publisher =    "IEEE, New York",
-  address =      "San Diego 1987",
-  pages =        "701--708",
-  year =         "1987",
-}
-
-@InProceedings{Rohwer90,
-  author =       "R. Rohwer",
-  editor =       NIPS2ed,
-  booktitle =    NIPS2,
-  title =        "The ``Moving Targets'' Training Algorithm",
-  publisher =    "Morgan Kaufmann, San Mateo",
-  address =      "Denver, CO",
-  pages =        "558--565",
-  year =         "1990",
-}
-
-@article{Rohde+Plaut-99,
- author = {D.L.T. Rohde and D.C. Plaut},
- title = {Language acquisition in the absence of explicit negative evidence: {H}ow important is starting small?},
- journal = {Cognition}, 
- volume = 72,
- pages = {67--109},
- year = 1999
-}
-
-@PhdThesis{Romeo89,
-  author =       "F. I. Romeo",
-  title =        "Simulated Annealing: Theory and Applications to Layout
-                 Problems",
-  school =       "University of California at Berkeley",
-  year =         "1989",
-  note =         "Memorandum UCB/ERL--M89/29",
-}
-
-@InProceedings{Romer+Frey2003,
-  author =       "R. Rosales and B. Frey",
-  booktitle =    UAI03,
-  title =        "Learning Generative Models of Affinity Matrices",
-  publisher =    "Morgan Kaufmann Publishers",
-  address =      "San Francisco, CA",
-  pages =        "485--492",
-  year =         "2003",
-}
-
-@InProceedings{Ron94,
-  author =       "D. Ron and Y. Singer and N. Tishby",
-  editor =       NIPS6ed,
-  booktitle =    NIPS6,
-  title =        "The power of amnesia",
-  publisher =    "Morgan Kaufmann",
-  address =      "San Mateo, CA",
-  pages =        "176--183",
-  year =         "1994",
-}
-
-@Article{Ron96,
-  author =       "D. Ron and Y. Singer and N. Tishby",
-  title =        "The power of amnesia: Learning Probabilistic Automata
-                 with Variable Memory Length",
-  journal =      "Machine Learning",
-  volume =       "25",
-  number =       "2--3",
-  pages =        "117--149",
-  year =         "1996",
-}
-
-@Article{Ron98,
-  author =       "Dana Ron and Yoram Singer and Naftali Tishby",
-  title =        "On the Learnability and Usage of Acyclic Probabilistic
-                 Finite Automata",
-  journal =      "Journal of Computer and System Sciences",
-  volume =       "56",
-  number =       "2",
-  pages =        "133--152",
-  year =         "1998",
-}
-
-@InProceedings{Roscheisen-nips92,
-  author =       "M. R{\"o}scheisen and R. Hofmann and V. Tresp",
-  editor =       NIPS4ed,
-  booktitle =    NIPS4,
-  title =        "Neural Control for Rolling Mills: Incorporating Domain
-                 Theories to Overcome Data Deficiency",
-  publisher =    "Morgan Kaufmann",
-  address =      "San Mateo, CA",
-  pages =        "659--666",
-  year =         "1992",
-}
-
-@Book{Rose85,
-  editor =       "D. Rose and V. G. Dobson",
-  title =        "Models of the Visual Cortex",
-  publisher =    "Wiley",
-  address =      "Chichester",
-  year =         "1985",
-}
-
-@Book{Rosenberg-1997,
-  author =       "S. Rosenberg",
-  title =        "The Laplacian on a Riemannian Manifold",
-  publisher =    "Cambridge University Press",
-  address =      "Cambridge, UK",
-  year =         "1997",
-}
-
-@InCollection{Rosenberg88,
-  author =       "C. R. Rosenberg and G. Blelloch",
-  editor =       "D. Waltz and J. Feldman",
-  booktitle =    "Connectionist Models and their Implications",
-  title =        "An Implementation of Network Learning on the
-                 Connection Machine",
-  publisher =    "Ablex Pub. Corp",
-  address =      "Norwood, NJ",
-  year =         "1988",
-}
-
-@TechReport{Rosenblatt57,
-  author =       "Frank Rosenblatt",
-  title =        "The Perceptron --- a perceiving and recognizing
-                 automaton",
-  number =       "85-460-1",
-  institution =  "Cornell Aeronautical Laboratory",
-  address =      "Ithaca, N.Y.",
-  year =         "1957",
-}
-
-@article{Rosenblatt-1958,
-    author = {Frank Rosenblatt},
-    title = {The perceptron: A probabilistic model for information storage and organization in the brain},
-    journal = {Psychological Review},
-    year = {1958},
-    volume = {65},
-    number = {6},
-    pages = {386--408},
-}
-
-@Book{Rosenblatt62,
-  author =       "Frank Rosenblatt",
-  title =        "Principles of Neurodynamics",
-  publisher =    "Spartan",
-  address =      "New York",
-  year =         "1962",
-}
-
-@Article{rosenfeld02whole,
-  author =       "Ronald Rosenfeld and Stanley F. Chen and Xiaojin Zhu",
-  title =        "Whole-Sentence Exponential Language Models: {A}
-                 Vehicle For Linguistic-Statistical Integration",
-  journal =      CSL,
-  volume =       "15",
-  number =       "1",
-  pages =        "55--73",
-  year =         "2001",
-  URL =          "citeseer.nj.nec.com/448532.html",
-}
-
-@Article{Rosenfeld2000,
-  author =       "Ronald Rosenfeld",
-  title =        "Two decades of Statistical Language Modeling: Where Do
-                 We Go From Here?",
-  journal =      "Proceedings of the {IEEE}",
-  volume =       "88",
-  number =       "8",
-  pages =        "1270--1278",
-  year =         "2000",
-}
-
-@InProceedings{Rosipal2003,
-  author =       "R. Rosipal and L. J. Trejo and B. Matthews",
-  booktitle =    ICML03,
-  editor =       ICML03ed,
-  publisher =    ICML03publ,
-  title =        "Kernel {PLS}-{SVC} for Linear and Nonlinear
-                 Classification",
-  year =         "2003",
-}
-
-@PhdThesis{Rossen89,
-  author =       "M. L. Rossen",
-  title =        "Speech Syllable Recognition with a Neural Network",
-  school =       "Brown University",
-  year =         "1989",
-}
-
-@Article{Rost93,
-  author =       "B. Rost and C. Sander",
-  title =        "Improved prediction of protein secondary structure by
-                 use of sequence profiles and neural networks",
-  journal =      "Proc. Nat. Ac. Sci. USA",
-  volume =       "90",
-  pages =        "7558--7562",
-  year =         "1993",
-}
-
-@Article{Rost94,
-  author =       "B. Rost and C. Sander",
-  title =        "Combining evolutionary information and neural networks
-                 to predict protein secondary structure",
-  journal =      "Proteins",
-  volume =       "19",
-  pages =        "55--72",
-  year =         "1994",
-}
-
-@InProceedings{RothBlack2005,
-  author =       "Stefan Roth and Michael J. Black",
-  booktitle =    cvpr05,
-  title =        "Fields of Experts: a framework for learning image
-                 priors",
-  volume =       "2",
-  pages =        "860--867",
-  year =         "2005",
-}
-
-@InProceedings{Roweis+Saul+Hinton-2002,
-  author =       "S. Roweis and L. Saul and G. Hinton",
-  editor =       NIPS14ed,
-  booktitle =    NIPS14,
-  title =        "Global coordination of local linear models",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  year =         "2002",
-}
-
-% DEPRECATED, USE THE ONE BELOW
-@Article{roweis00lle,
-  author =       "Sam Roweis and Lawrence K. Saul",
-  title =        "Nonlinear dimensionality reduction by locally linear
-                 embedding",
-  journal =      "Science",
-  volume =       "290",
-  number =       "5500",
-  pages =        "2323--2326",
-  month =        dec,
-  year =         "2000",
-}
-
-@Article{Roweis2000-lle,
-  author =       "Sam Roweis and Lawrence K. Saul",
-  title =        "Nonlinear dimensionality reduction by locally linear
-                 embedding",
-  journal =      "Science",
-  volume =       "290",
-  number =       "5500",
-  pages =        "2323--2326",
-  month =        dec,
-  year =         "2000",
-}
-
-@TechReport{roweis97unifying,
-  author =       "Sam Roweis and Zoubin Ghahramani",
-  title =        "A Unifying Review of Linear {G}aussian Models",
-  address =      "6 King's College Road, Toronto M5S 3H5, Canada",
-  year =         "1997",
-  URL =          "citeseer.nj.nec.com/article/roweis97unifying.html",
-}
-
-@InProceedings{roweis98em,
-  author =       "Sam Roweis",
-  editor =       NIPS10ed,
-  booktitle =    NIPS10,
-  title =        "{EM} Algorithms for {PCA} and {SPCA}",
-  volume =       "10",
-  publisher =    "{MIT} Press",
-  year =         "1998",
-  URL =          "citeseer.nj.nec.com/roweis98em.html",
-}
-
-@InProceedings{RoweisNCA2005,
-  author =       "Jacob Goldberger and Sam Roweis and Geoffrey E. Hinton and Ruslan
-                 Salakhutdinov",
-  editor =       NIPS17ed,
-  booktitle =    NIPS17,
-  title =        "Neighbourhood Components Analysis",
-  publisher =    "{MIT} Press",
-  year =         "2005",
-}
-
-@Book{Rubinstein1981,
-  author =       "Reuven Y. Rubinstein",
-  title =        "Simulation and the Monte Carlo Method",
-  publisher =    "John Wiley \& Sons",
-  year =         "1981",
-}
-
-@Article{Rubner89,
-  author =       "J. Rubner and P. Tavan",
-  title =        "A Self-Organizing Network for Principal-Component
-                 Analysis",
-  journal =      eul,
-  volume =       "10",
-  pages =        "693--698",
-  year =         "1989",
-}
-
-
-@Article{Rubner90,
-  author =       "J. Rubner and K. Schulten",
-  title =        "Development of Feature Detectors by
-                 Self-Organization",
-  journal =      biocyb,
-  volume =       "62",
-  pages =        "193--199",
-  year =         "1990",
-}
-
-@Article{Rumelhart85,
-  author =       "D. E. Rumelhart and D. Zipser",
-  title =        "Feature Discovery by Competitive Learning",
-  journal =      cogsci,
-  volume =       "9",
-  pages =        "75--112",
-  year =         "1985",
-  note =         "Reprinted in \cite[chapter 5]{Rumelhart86a}",
-}
-
-@Book{Rumelhart86a,
-  author =       "D. E. Rumelhart and J. L. McClelland and the PDP
-                 Research Group",
-  title =        "Parallel Distributed Processing: Explorations in the
-                 Microstructure of Cognition",
-  volume =       "1",
-  publisher =    "MIT Press",
-  address =      "Cambridge",
-  year =         "1986",
-}
-
-@Article{Rumelhart86b,
-  author =       "David E. Rumelhart and Geoffrey E. Hinton and Ronald J. Williams",
-  title =        "Learning Representations by Back-Propagating Errors",
-  journal =      "Nature",
-  volume =       "323",
-  pages =        "533--536",
-  year =         "1986",
-}
-
-@InCollection{Rumelhart86c,
-  author =       "D. E. Rumelhart and G. E. Hinton and R. J. Williams",
-  editor =       "D. E. Rumelhart and J. L. McClelland",
-  booktitle =    pdp,
-  title =        "Learning Internal Representations by Error
-                 Propagation",
-  chapter =      "8",
-  volume =       "1",
-  publisher =    "MIT Press",
-  address =      "Cambridge",
-  pages =        "318--362",
-  year =         "1986",
-}
-
-@InProceedings{Russ+Geoff-nips-2007,
-  author =       "Ruslan Salakhutdinov and Geoffrey E. Hinton",
-  editor =       NIPS20ed,
-  booktitle =    NIPS20,
-  title =        "Using Deep Belief Nets to Learn Covariance Kernels for
-                 {Gaussian} Processes",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  pages =        "1249--1256",
-  year =         "2008",
-}
-  %url =          "http://www.csri.utoronto.ca/~hinton/absps/dbngp.pdf",
-
-@InProceedings{Russ+Geoff-nips-2007-small,
-  author =       "Ruslan Salakhutdinov and Geoffrey E. Hinton",
-  booktitle =    "NIPS 20",
-  title =        "Using {D}eep {B}elief {N}ets to Learn Covariance Kernels for
-                 {G}aussian Processes",
-  year =         "2008",
-}
-
-@InProceedings{Russ+Geoff-nips-2007-short,
-  author =       "R. Salakhutdinov and G.E. Hinton",
-  booktitle =    "Adv. Neural Inf. Proc. Sys. 20",
-  title =        "Using {D}eep {B}elief {N}ets to Learn Covariance Kernels for
-                 {G}aussian Processes",
-  pages = {1249--1256},
-  year =         "2008",
-}
-
-@article{rust:2005,
-    author      = {Nicole Rust and Odelia Schwartz and J. Anthony Movshon and Eero Simoncelli},
-    title       = {Spatiotemporal Elements of Macaque {V1} Receptive Fields},
-    journal     = {Neuron},
-    volume      = {46},
-    number      = {6},
-    pages       = {945--956},
-    year        = {2005}
-}
-@article{rust:2006,
-    author = {Nicole C. Rust and Valerio Mante and Eero P. Simoncelli and J.
-        Anthony Movshon},
-    year = {2006},
-    title = {How {MT} Cells Analyze the Motion of Visual Patterns},
-    journal = {Nature Neuroscience},
-    volume = {9},
-    number = {11},
-    pages = {1421--1431},
-}
-
-@Article{RYsed98,
-  author =       "Eric Sven Ristad and Peter N. Yianilos",
-  title =        "Learning String Edit Distance",
-  journal =      "IEEE Transactions on Pattern Analysis and Machine
-                 Intelligence",
-  volume =       "20",
-  number =       "5",
-  pages =        "522--532",
-  month =        may,
-  year =         "1998",
-}
-
-@Book{Saad-1996,
-  author =       "Y. Saad",
-  title =        "Iterative Methods for Sparse Linear Systems",
-  publisher =    "{PWS} Publishing Company",
-  address =      "Boston, MA",
-  year =         "1996",
-}
-
-@TechReport{Saad90a,
-  author =       "D. Saad and E. Marom",
-  title =        "Learning by Choice of Internal Representations --- An
-                 Energy Minimization Approach",
-  type =         "Preprint",
-  institution =  "Faculty of Engineering, Tel Aviv University",
-  address =      "Ramat-Aviv, Israel",
-  year =         "1990",
-}
-
-@TechReport{Saad90b,
-  author =       "D. Saad and E. Marom",
-  title =        "Training Feed Forward Nets with Binary Weights via a
-                 Modified {CHIR} Algorithm",
-  type =         "Preprint",
-  institution =  "Faculty of Engineering, Tel Aviv University",
-  address =      "Ramat-Aviv, Israel",
-  year =         "1990",
-}
-
-@Book{SaadOnlineLearning1999,
-  editor =       "David Saad",
-  title =        "On-Line Learning in Neural Networks",
-  publisher =    "Cambridge University Press",
-  year =         "1999",
-}
-
-@Article{Sachs+Young80,
-  author =       "M. B. Sachs and E. D. Young",
-  title =        "Effects of nonlinearities on speech encoding in the
-                 auditory nerve",
-  journal =      jasa,
-  volume =       "68",
-  number =       "3",
-  pages =        "858--875",
-  year =         "1980",
-}
-
-@Article{Sakoe78,
-  author =       "H. Sakoe and C. Chiba",
-  title =        "Dynamic Programming Algorithm Optimization for Spoken
-                 Word Recognition",
-  journal =      ieeetassp,
-  volume =       "26",
-  number =       "1",
-  pages =        "43--49",
-  month =        feb,
-  year =         "1978",
-}
-
-@InProceedings{Salakhutdinov-2010,
-    author = {Ruslan Salakhutdinov},
-     title = {Learning in {M}arkov Random Fields using Tempered Transitions},
-      year = {2010},
-  crossref = {NIPS22}
-}
-
-@InProceedings{Salakhutdinov+Hinton2007,
-  author =       "Ruslan Salakhutdinov and Geoffrey E. Hinton",
-  booktitle =    "Proceedings of the 2007 Workshop on Information
-                 Retrieval and applications of Graphical Models (SIGIR
-                 2007)",
-  title =        "Semantic Hashing",
-  year =         "2007",
-  publisher  =   "Elsevier",
-  address = {Amsterdam},
-}
-
-@InProceedings{Salakhutdinov+Hinton2007-small,
-  author =       "Ruslan Salakhutdinov and Geoffrey E. Hinton",
-  booktitle =    "SIGIR",
-  title =        "Semantic Hashing",
-  year =         "2007",
-}
-
-@InProceedings{SalakhutdinovR2007,
-  author =       "Ruslan Salakhutdinov and Geoffrey E. Hinton",
-  booktitle =    aistats07,
-  title =        "Learning a Nonlinear Embedding by Preserving Class
-                 Neighbourhood Structure",
-  publisher =    "Omnipress",
-  date =         "March 21-24, 2007",
-  address =      "San Juan, Porto Rico",
-  year =         "2007",
-}
-
-@InProceedings{SalakhutdinovR2007-small,
-  author =       "Ruslan Salakhutdinov and Geoffrey E. Hinton",
-  booktitle =    aistats07-small,
-  title =        "Learning a Nonlinear Embedding by Preserving Class
-                 Neighbourhood Structure",
-  year =         "2007",
-}
-
-@InProceedings{SalakhutdinovR2007-short,
-  author =       "R. Salakhutdinov and G.E. Hinton",
-  booktitle =    {AI \& Stat.'2007},
-  title =        "Learning a Nonlinear Embedding by Preserving Class
-                 Neighbourhood Structure",
-  year =         "2007",
-}
-
-@InProceedings{SalakhutdinovR2007b,
-  author =       "Ruslan Salakhutdinov and Andriy Mnih and Geoffrey E.
-                 Hinton",
-  booktitle =    ICML07,
-  editor =       ICML07ed,
-  publisher =    ICML07publ,
-  title =        "Restricted {Boltzmann} machines for collaborative
-                 filtering",
-  address =      "New York, NY, USA",
-  pages =        "791--798",
-  year =         "2007",
-  location =     "Corvalis, Oregon",
-}
-
-@InProceedings{SalakhutdinovR2007b-small,
-  author =       "Ruslan Salakhutdinov and Andriy Mnih and Geoffrey E. Hinton",
-  booktitle =    "ICML 2007",
-  title =        "Restricted {Boltzmann} machines for collaborative
-                 filtering",
-  year =         "2007",
-}
-
-@InProceedings{SalakhutdinovR2007b-short,
-  author =       "R. Salakhutdinov and A. Mnih and G.E. Hinton",
-  booktitle =    "Int. Conf. Mach. Learn. 2007",
-  title =        "Restricted {Boltzmann} machines for collaborative
-                 filtering",
-  pages =        "791--798",
-  year =         "2007",
-}
-
-
-@InProceedings{Salakhutdinov+Murray-2008,
-    title =     "On the Quantitative Analysis of Deep Belief Networks",
-    author =    "Ruslan Salakhutdinov and Iain Murray",
-    booktitle = ICML08,
-    editor =    ICML08ed,
-    publisher = ICML08publ,
-    pages =     "872--879",
-    year =      "2008",
-    volume =    "25",
-}
-
-@InProceedings{Salakhutdinov+Hinton-2009,
-  author =       "Ruslan Salakhutdinov and Geoffrey E. Hinton",
-  booktitle =    aistats09,
-  title =        "Deep {Boltzmann} Machines",
-  year =         "2009",
-  volume =       5,
-  location =     "Clearwater (Florida), USA",
-  date =         "April 16-18, 2009",
-  pages =        "448--455",
-}
-
-@Article{Salamon88,
-  author =       "P. Salamon and J. D. Nulton and J. Robinson and J.
-                 Petersen and G. Ruppeiner and L. Liao",
-  title =        "Simulated Annealing with Constant Thermodynamic
-                 Speed",
-  journal =      cpc,
-  volume =       "49",
-  pages =        "423--428",
-  year =         "1988",
-}
-
-@Article{Salton+Buckley88,
-  author =       "G. Salton and C. Buckley",
-  title =        "Term weighting approaches in automatic text
-                 retrieval",
-  journal =      "Information Processing and Management",
-  volume =       "24",
-  number =       "5",
-  pages =        "513--523",
-  year =         "1988",
-}
-
-@Article{Sanger89a,
-  author =       "T. D. Sanger",
-  title =        "Optimal Unsupervised Learning in a Single-Layer Linear
-                 Feedforward Neural Network",
-  journal =      nn,
-  volume =       "2",
-  pages =        "459--473",
-  year =         "1989",
-}
-
-@InProceedings{Sanger89b,
-  author =       "T. D. Sanger",
-  editor =       NIPS1ed,
-  booktitle =    NIPS1,
-  title =        "An Optimality Principle for Unsupervised Learning",
-  publisher =    "Morgan Kaufmann, San Mateo",
-  address =      "Denver, CO",
-  pages =        "11--19",
-  year =         "1989",
-}
-
-@article{Sanger-1994,
- author = {Terence D. Sanger},
- title = {Neural network learning control of robot manipulators 
-      using gradually increasing task difficulty},
- journal = {{IEEE} Transactions on Robotics and Automation},
- volume = 10,
- number = 3,
- year = 1994,
-}
-
-@article{Sanger-1994-small,
- author = {Terence D. Sanger},
- title = {Neural network learning control of robot manipulators 
-      using gradually increasing task difficulty},
- journal = {{IEEE} Trans. on Robotics and Automation},
- volume = 10,
- number = 3,
- year = 1994,
-}
-
-@InProceedings{sarawagi03,
-  author =       "Sunita Sarawagi and Soumen Chakrabarti and Shantanu
-                 Godbole",
-  booktitle =    "KDD '03: Proceedings of the ninth ACM SIGKDD
-                 international conference on Knowledge discovery and
-                 data mining",
-  title =        "Cross-training: learning probabilistic mappings
-                 between topics",
-  publisher =    "ACM Press",
-  address =      "New York, NY, USA",
-  pages =        "177--186",
-  year =         "2003",
-  location =     "Washington, D.C.",
-}
-
-@article{Sarkar-Moore-2005,
- author = {P. Sarkar and A. Moore},
- title = {Dynamic social network analysis using latent space models},
- journal = {{SIGKDD} Explorations},
- volume = 7,
- number = 2,
- pages = {31--40},
- year = 2005,
-}
-
-@Article{Sato90,
-  author =       "M. Sato",
-  title =        "A Real Time Learning Algorithm for Recurrent Analog
-                 Neural Networks",
-  journal =      biocyb,
-  volume =       "62",
-  pages =        "237--241",
-  year =         "1990",
-}
-
-@Article{Saul+96,
-  author =       "Lawrence K. Saul and Tommi Jaakkola and Michael I. Jordan",
-  title =        "Mean field theory for sigmoid belief networks",
-  journal =      "Journal of Artificial Intelligence Research",
-  volume =       "4",
-  pages =        "61--76",
-  year =         "1996",
-}
-
-@Article{Saul+Roweis-2002,
-  author =       "L. Saul and S. Roweis",
-  title =        "Think globally, fit locally: unsupervised learning of
-                 low dimensional manifolds",
-  journal =      jmlr,
-  volume =       "4",
-  number =       "",
-  pages =        "119--155",
-  month =        "",
-  year =         "2002",
-}
-
-@InProceedings{Saul95,
-  author =       "Lawrence K. Saul and Michael I. Jordan",
-  editor =       NIPS7ed,
-  booktitle =    NIPS7,
-  title =        {Boltzmann Chains and Hidden Markov Models},
-  publisher =    "MIT Press, Cambridge, MA",
-  pages =        "435--442",
-  year =         "1995",
-}
-
-@InProceedings{Saul96,
-  author =       "Lawrence K. Saul and Michael I. Jordan",
-  editor =       NIPS8ed,
-  booktitle =    NIPS8,
-  title =        "Exploiting tractable substructures in intractable
-                 networks",
-  publisher =    "MIT Press, Cambridge, MA",
-  year =         "1996",
-}
-
-@InProceedings{SaulJordan97,
-  author =       "Lawrence K. Saul and Michael I. Jordan",
-  editor =       NIPS9ed,
-  booktitle =    NIPS9,
-  title =        "A variational model for model-based interpolation",
-  publisher =    "MIT Press",
-  pages =        "375",
-  year =         "1997",
-}
-
-@Article{Saund-1989,
-  author =       "Eric Saund",
-  title =        "Dimensionality-reduction using connectionist
-                 networks",
-  journal =      "{IEEE} Transactions on Pattern Analysis and Machine
-                 Intelligence",
-  volume =       "11",
-  number =       "3",
-  pages =        "304--314",
-  year =         "1989",
-}
-
-@InCollection{Scalettar88,
-  author =       "R. Scalettar and A. Zee",
-  editor =       "D. Waltz and J. A. Feldman",
-  booktitle =    "Connectionist Models and Their Implications: Readings
-                 from Cognitive Science",
-  title =        "Emergence of Grandmother Memory in Feed Forward
-                 Networks: Learning with Noise and Forgetfulness",
-  publisher =    "Ablex",
-  address =      "Norwood",
-  pages =        "309--332",
-  year =         "1988",
-}
-
-@Article{schapire-90,
-  author =       "Robert E. Schapire",
-  title =        "The strength of weak learnability",
-  journal =      "Machine Learning",
-  volume =       "5",
-  number =       "2",
-  pages =        "197--227",
-  year =         "1990",
-}
-
-@Article{Schapire-margin98,
-  author =       "Robert E. Schapire and Yoav Freund and Peter Bartlett
-                 and Wee Sun Lee",
-  title =        "Boosting the margin: {A} new explanation for the
-                 effectiveness of voting methods",
-  journal =      "The Annals of Statistics",
-  volume =       "26",
-  number =       "5",
-  pages =        "1651--1686",
-  year =         "1998",
-}
-
-@InProceedings{schapire99theoretical,
-  author =       "Robert E. Schapire",
-  booktitle =    "Algorithmic Learning Theory, 10th International
-                 Conference, {ALT} '99, Tokyo, Japan, December 1999,
-                 Proceedings",
-  title =        "Theoretical Views of Boosting and Applications",
-  volume =       "1720",
-  publisher =    "Springer",
-  pages =        "13--25",
-  year =         "1999",
-  URL =          "http:citeseer.ist.psu.edu/article/schapire99theoretical.html",
-}
-
-@InProceedings{SchapireSinger98,
-  author =       "R. E. Schapire and Y. Singer",
-  booktitle =    "Proceedings of the 11th Annual Conference on
-                 Computational Learning Theory",
-  title =        "Improved Boosting Algorithms Using Confidence Rated
-                 Predictions",
-  year =         "1998",
-}
-
-@Book{SchBurSmo99,
-  author =       "B. {Sch\"olkopf} and C. J. C. Burges and A. J. Smola",
-  title =        "Advances in Kernel Methods --- Support Vector
-                 Learning",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  year =         "1999",
-}
-
-@InProceedings{ScheinA2001,
-  author =       "Andrew I. Schein and Alexandrin Popescul and Lyle H.
-                 Ungar and David M. Pennock",
-  booktitle =    "Workshop on Recommender Systems at SIGIR",
-  title =        "Generative Models for Cold-Start Recommendations",
-  year =         "2001",
-}
-
-@InProceedings{ScheinA2002,
-  author =       "Andrew I. Schein and Alexandrin Popescul and Lyle H.
-                 Ungar and David M. Pennock",
-  booktitle =    "SIGIR '02",
-  title =        "Methods and metrics for cold-start recommendations",
-  publisher =    "ACM Press",
-  address =      "New York, NY, USA",
-  pages =        "253--260",
-  year =         "2002",
-}
-
-@InCollection{Scheines94,
-  author =       "R. Scheines",
-  editor =       "P. Cheeseman and R. W. Oldford",
-  booktitle =    "Selecting Models from Data: Artificial Intelligence
-                 and Statistics {IV}",
-  title =        "Inferring causal structure among unmeasured
-                 variables",
-  publisher =    "Springer-Verlag",
-  pages =        "197--204",
-  year =         "1994",
-}
-
-@InProceedings{Schenkel93,
-  author =       "M. Schenkel and H. Weissman and I. Guyon and C. Nohl
-                 and D. Henderson",
-  editor =       NIPS5ed,
-  booktitle =    NIPS5,
-  title =        "Recognition-Based Segmentation of On-Line Hand-Printed
-                 Words",
-  address =      "Denver, CO",
-  pages =        "723--730",
-  year =         "1993",
-}
-
-@Article{schenkel95,
-  author =       "M. Schenkel and I. Guyon and D. Henderson",
-  title =        "On-line Cursive Script Recognition using Time Delay
-                 Neural Networks and Hidden {Markov} Models",
-  journal =      "{Machine} {Vision} and {Applications}",
-  publisher =    "Springer Verlag",
-  pages =        "215--223",
-  year =         "1995",
-}
-
-@InProceedings{SchGra03,
-  author =       "Nicol N. Schraudolph and Thore Graepel",
-  editor =       "Christopher M. Bishop and Brendan J. Frey",
-  booktitle =    "Proc.\ 9th Intl.\ Workshop Artificial Intelligence and
-                 Statistics (AIstats)",
-  title =        "Combining Conjugate Direction Methods with Stochastic
-                 Approximation of Gradients",
-  publisher =    "Society for Artificial Intelligence and Statistics",
-  address =      "Key West, Florida",
-  pages =        "7--13",
-  year =         "2003",
-  ISBN =         "0-9727358-0-1",
-  abstract =     "The method of conjugate directions provides a very
-                 effective way to optimize large, deterministic systems
-                 by gradient descent. In its standard form, however, it
-                 is not amenable to stochastic approximation of the
-                 gradient. Here we explore ideas from conjugate gradient
-                 in the stochastic (online) setting, using fast
-                 Hessian-gradient products to set up low-dimensional
-                 Krylov subspaces within individual mini-batches. In our
-                 benchmark experiments the resulting online learning
-                 algorithms converge orders of magnitude faster than
-                 ordinary stochastic gradient descent.",
-}
-
-@Article{Schmidhuber92,
-  author =       "J{\"u}rgen Schmidhuber",
-  title =        "Learning Complex, Extended Sequences using the
-                 Principle of History Compression",
-  journal =      nc,
-  volume =       "4",
-  number =       "2",
-  pages =        "234--242",
-  year =         "1992",
-}
-
-@Article{Schmidhuber96,
-  author =       "J{\"u}rgen Schmidhuber",
-  title =        "Sequential Neural Text Compression",
-  journal =      "IEEE Transactions on Neural Networks",
-  volume =       "7",
-  number =       "1",
-  pages =        "142--146",
-  year =         "1996",
-}
-
-@InCollection{Schmidt-2006,
-  author =       "Volker Schmidt",
-  booktitle =    "Lecture Notes, Summer 2006",
-  title =        {Markov Chains and Monte-Carlo Simulation},
-  address =      "Ulm University, Department of Stochastics",
-  year =         "2006",
-  URL =          "http://www.mathematik.uni-ulm.de/stochastik/lehre/ss06/markov/skript-engl/skript-engl.htm",
-}
-
-@Article{Schmitt-2002,
-  author =       "M. Schmitt",
-  title =        "Descartes' Rule of Signs for Radial Basis Function
-                 Neural Networks",
-  journal =      "Neural Computation",
-  volume =       "14",
-  number =       "12",
-  pages =        "2997--3011",
-  year =         "2002",
-}
-
-@Article{Schneider-2001,
-  author =       "Tapio Schneider",
-  title =        "Analysis of Incomplete Climate Data: Estimation of
-                 Mean Values and Covariance Matrices and Imputation of
-                 Missing Values",
-  journal =      "Journal of Climate",
-  volume =       "14",
-  pages =        "853--871",
-  year =         "2001",
-}
-
-@article{Schneidman+al-2003,
-    address = {Department of Molecular Biology, Princeton University, Princeton, New Jersey 08544, USA.},
-    author = {Schneidman, E.  and Bialek, W.  and Berry, M. J. },
-    issn = {1529-2401},
-    journal = {Journal of Neuroscience},
-    month = {December},
-    number = {37},
-    pages = {11539--11553},
-    title = {Synergy, redundancy, and independence in population codes},
-    url = {http://www.jneurosci.org/cgi/content/abstract/23/37/11539},
-    volume = {23},
-    year = {2003}
-}
-    
-
-@Article{schoelkopf97comparing,
-  author =       "B. Sch{\"o}lkopf and K. Sung and C. Burges and F.
-                 Girosi and P. Niyogi and T. Poggio and V. Vapnik",
-  title =        "Comparing support vector machines with {G}aussian
-                 kernels to radial basis function classifiers",
-  journal =      "IEEE Transactions on Signal Processing",
-  volume =       "45",
-  pages =        "2758--2765",
-  year =         "1997",
-  text =         "Sch{\"o}lkopf, B., Sung, K., Burges, C., Girosi, F.,
-                 Niyogi, P., Poggio, T., and Vapnik, V.: Comparing
-                 support vector machines with {G}aussian kernels to radial
-                 basis function classifiers. IEEE Transactions on Signal
-                 Processing, 45 (1997) 2758-2765.",
-}
-
-@Book{Scholkopf02-book,
-  author =       "B. Sch{\"o}lkopf and A. J. Smola",
-  title =        "Learning with Kernels: Support Vector Machines,
-                 Regularization, Optimization and Beyond",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  year =         "2002",
-}
-
-@TechReport{Scholkopf96,
-  author =       "B. Sch{\"o}lkopf and A. Smola and K.-R. M{\"u}ller",
-  title =        "Nonlinear Component Analysis as a Kernel Eigenvalue
-                 Problem",
-  number =       "44",
-  institution =  "Max Planck Institute for Biological Cybernetics,
-                 Tübingen, Germany",
-  year =         "1996",
-}
-
-@Article{Scholkopf98,
-  author =       "B. Sch{\"o}lkopf and A. Smola and K.-R. M{\"u}ller",
-  title =        "Nonlinear component analysis as a kernel eigenvalue
-                 problem",
-  journal =      "Neural Computation",
-  volume =       "10",
-  pages =        "1299--1319",
-  year =         "1998",
-}
-
-@Book{Scholkopf98-book,
-  author =       "B. Sch{\"o}lkopf and C. J. C. Burges and A. J. Smola",
-  title =        "Advances in kernel methods: support vector learning",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  year =         "1998",
-}
-
-@Article{Scholkopf99,
-  author =       "B. Sch{\"o}lkopf and S. Mika and C. Burges and P.
-                 Knirsch and K.-R. M{\"u}ller and G. R{\"a}tsch and A.
-                 Smola",
-  title =        "Input Space Versus Feature Space in Kernel-Based Methods",
-  journal =      "IEEE Trans. Neural Networks",
-  volume =       "10",
-  number =       "5",
-  pages =        "1000--1017",
-  year =         "1999",
-}
-
-@Article{Schraudolph02,
-  author =       "Nicol N. Schraudolph",
-  title =        "Fast Curvature Matrix-Vector Products for Second-Order
-                 Gradient Descent",
-  journal =      "Neural Computation",
-  volume =       "14",
-  number =       "7",
-  pages =        "1723--1738",
-  year =         "2002",
-}
-
-@InProceedings{Schraudolph99,
-  author =       "Nicol N. Schraudolph",
-  booktitle =    "Proceedings of the 9th International Conference on
-                 Artificial Neural Networks",
-  title =        "Local gain adaptation in stochastic gradient descent",
-  pages =        "569--574",
-  year =         "1999",
-}
-
-@InProceedings{Schutze92,
-  author =       "Hinrich Sch{\"u}tze",
-  booktitle =    "Supercomputing'92",
-  title =        "Dimensions of Meaning",
-  address =      "Minneapolis MN",
-  pages =        "787--796",
-  year =         "1992",
-}
-
-@InProceedings{Schutze93,
-  author =       "H. Schutze",
-  editor =       NIPS5ed,
-  booktitle =    NIPS5,
-  title =        "Word space",
-  publisher =    "Morgan Kaufmann",
-  address =      "San Mateo CA",
-  pages =        "895--902",
-  year =         "1993",
-}
-
-@Misc{Schuurmans1999,
-  author =       "Dale Schuurmans",
-  title =        "Greedy importance sampling: {A} new Monte Carlo
-                 inference method",
-  year =         "1999",
-  URL =          "citeseer.nj.nec.com/25013.html",
-}
-
-@InProceedings{Schuurmans2000,
-  author =       "Dale Schuurmans and Finnegan Southey",
-  title =        "Monte Carlo inference via greedy importance sampling",
-  pages =        "523--532",
-  year =         "2000",
-  URL =          "citeseer.nj.nec.com/281712.html",
-}
-
-@Article{Schuurmans2001,
-  author =       "D. Schuurmans and F. Southey",
-  title =        "Metric-based methods for adaptive model selection and
-                 regularization",
-  journal =      "Machine Learning",
-  volume =       "48",
-  number =       "1",
-  pages =        "51--84",
-  year =         "2002",
-}
-
-@InProceedings{Schuurmans97,
-  author =       "D. Schuurmans",
-  booktitle =    "Proceedings of the National Conference on Artificial
-                 Intelligence (AAAI-97)",
-  title =        "A new metric-based approach to model selection",
-  pages =        "552--558",
-  year =         "1997",
-}
-
-@Article{Schwartz90,
-  author =       "D. B. Schwartz and V. K. Samalam and S. A. Solla and
-                 J. S. Denker",
-  title =        "Exhaustive Learning",
-  journal =      nc,
-  volume =       "2",
-  pages =        "371--382",
-  year =         "1990",
-}
-
-@Article{Schwenk+Bengio00,
-  author =       "Holger Schwenk and Yoshua Bengio",
-  title =        "Boosting Neural Networks",
-  journal =      "Neural Computation",
-  volume =       "12",
-  number =       "8",
-  pages =        "1869--1887",
-  year =         "2000",
-}
-
-@InProceedings{Schwenk+Gauvain-2005,
-  author =       "Holger Schwenk and Jean-Luc Gauvain",
-  booktitle =    "Interspeech",
-  title =        "Building continuous space language models for
-                 transcribing European languages",
-  pages =        "737--740",
-  year =         "2005",
-}
-
-@InProceedings{Schwenk+Gauvain2002,
-  author =       "H. Schwenk and J-L. Gauvain",
-  booktitle =    icassp,
-  title =        "Connectionist Language Modeling for Large Vocabulary
-                 Continuous Speech Recognition",
-  address =      "Orlando, Florida",
-  pages =        "765--768",
-  year =         "2002",
-}
-
-@InProceedings{Schwenk+Gauvain2002-short,
-  author =       "H. Schwenk and J-L. Gauvain",
-  booktitle =    {Int. Conf. Acoust. Speech \& Sig. Proc.},
-  title =        "Connectionist Language Modeling for Large Vocabulary
-                 Continuous Speech Recognition",
-  address =      "Orlando, Florida",
-  pages =        "765--768",
-  year =         "2002",
-}
-
-@InProceedings{Schwenk05C,
-  author =       "Holger Schwenk and Jean-Luc Gauvain",
-  booktitle =    "Joint Human Language Technology Conference and
-                 Conference on Empirical Methods in Natural Language
-                 Processing (EMNLP)",
-  title =        "Training Neural Network Language Models On Very Large
-                 Corpora",
-  address =      "Vancouver",
-  pages =        "201--208",
-  month =        oct,
-  year =         "2005",
-  URL =          "ftp://tlp.limsi.fr/public/emnlp05.pdf",
-}
-
-@InProceedings{Schwenk05C-small,
-  author =       "Holger Schwenk and Jean-Luc Gauvain",
-  booktitle =    "EMNLP'2005",
-  title =        "Training Neural Network Language Models On Very Large
-                 Corpora",
-  pages =        "201--208",
-  year =         "2005",
-}
-
-@TechReport{Schwenk:2001:tr,
-  author =       "Holger Schwenk",
-  title =        "Language Modeling in the Continuous Domain",
-  number =       "2001-20",
-  institution =  "LIMSI-CNRS, Orsay, France",
-  year =         "2001",
-  date =         "dec 2001",
-}
-
-@InProceedings{Schwenk:2002:icassp,
-  author =       "Holger Schwenk and Jean-Luc Gauvain",
-  booktitle =    icassp,
-  title =        "Connectionist Language Modeling for Large Vocabulary
-                 Continuous Speech Recognition",
-  volume =       "1",
-  pages =        "765--768",
-  year =         "2002",
-}
-
-@InProceedings{Schwenk:2003:sspr,
-  author =       "Holger Schwenk and Jean-Luc Gauvain",
-  booktitle =    "ISCA \& IEEE Workshop on Spontaneous Speech Processing
-                 and Recognition",
-  title =        "{Using Continuous Space Language Models for
-                 Conversational Speech Recognition}",
-  address =      "Tokyo",
-  month =        apr,
-  year =         "2003",
-}
-
-@InProceedings{Schwenk:2004:icslp,
-  author =       "Holger Schwenk and Jean-Luc Gauvain",
-  booktitle =    icslp,
-  title =        "Using a Continuous Space Language Model for
-                 Conversational Speech Recognition",
-  year =         "2004",
-  note =         "submitted",
-}
-
-@InProceedings{Schwenk:2004:ijcnn,
-  author =       "Holger Schwenk",
-  booktitle =    ijcnn,
-  title =        "Efficient Training of Large Neural Networks for
-                 Language Modeling",
-  volume =       "4",
-  pages =        "3050--3064",
-  year =         "2004",
-}
-
-@InProceedings{SchYuGue07,
-  author =       "Nicol N. Schraudolph and Jin Yu and Simon G{\"u}nter",
-  booktitle =    "Proc.\ 11th Intl.\ Conf.\ Artificial Intelligence and
-                 Statistics (AIstats)",
-  title =        "A Stochastic Quasi-{Newton} Method for Online Convex
-                 Optimization",
-  publisher =    "Society for Artificial Intelligence and Statistics",
-  address =      "San Juan, Puerto Rico",
-  pages =        "433--440",
-  year =         "2007",
-  ISBN =         "0-9727358-2-8",
-}
-
-@InProceedings{Scofield88,
-  author =       "C. L. Scofield",
-  booktitle =    icnn,
-  title =        "Learning Internal Representations in the Coulomb
-                 Energy Network",
-  volume =       "1",
-  publisher =    "IEEE, New York",
-  address =      "San Diego 1988",
-  pages =        "271--276",
-  year =         "1988",
-}
-
-@InProceedings{Scott+al-2003,
-  author =       "Scott S. L. Piao and Paul Rayson and Dawn Archer and
-                 Andrew Wilson and Tony McEnery",
-  booktitle =    "Proceedings of the ACL 2003 workshop on Multiword
-                 expressions",
-  title =        "Extracting multiword expressions with a semantic
-                 tagger",
-  publisher =    "Association for Computational Linguistics",
-  address =      "Morristown, NJ, USA",
-  pages =        "49--56",
-  year =         "2003",
-}
-
-@Book{Scott92,
-  author =       "D. W. Scott",
-  title =        "Multivariate Density Estimation: Theory, Practice, and
-                 Visualization",
-  publisher =    "Wiley",
-  address =      "New York",
-  year =         "1992",
-}
-
-@Article{ScST95,
-  author =       "A. Schaerf and S. Yoav and M. Tennenholtz",
-  title =        "Adaptive load balancing: a study in multi-agent
-                 learning",
-  journal =      "Journal of Artificial Intelligence Research",
-  volume =       "2",
-  pages =        "475--500",
-  year =         "1995",
-}
-
-@Article{Scudder65,
-  author = 	 "{Henry J. Scudder, III}",
-  title = 	 {Probability of Error of Some Adaptive Pattern-Recognition Machines},
-  journal = 	 {IEEE Transactions on Information Theory},
-  year = 	 1965,
-  volume =	 11,
-  pages =	 {363-371}
-}
-
-@TechReport{Seeger-2005,
-  author =       "Matthias Seeger",
-  title =        "Low Rank Updates for the {Cholesky} Decomposition",
-  institution =  "Department of EECS, University of California at
-                 Berkeley",
-  year =         "2005",
-}
-
-@InProceedings{Seeger-Williams-Lawrence-2003,
-  author =       "M. Seeger and C. Williams and N. Lawrence",
-  booktitle =    "Workshop on AI and Statistics",
-  title =        "Fast Forward Selection to Speed Up Sparse {G}aussian
-                 Process Regression",
-  volume =       "9",
-  year =         "2003",
-}
-
-@TechReport{Seeger2001,
-  author =       "M. Seeger",
-  title =        "Learning with labeled and unlabeled data",
-  institution =  "Edinburgh University",
-  year =         "2001",
-}
-
-@InProceedings{seidl91p1,
-  author =       "D. R. Seidl and D. Lorenz",
-  booktitle =    ijcnn,
-  title =        "A structure by which a recurrent neural network can
-                 approximate a nonlinear dynamic system",
-  volume =       "2",
-  pages =        "709--714",
-  month =        jul,
-  year =         "1991",
-}
-
-@TechReport{Sejnowski+Rosenberg86,
-  author =       "T. J. Sejnowski and C. R. Rosenberg",
-  key =          "Sejnowski",
-  title =        "{\em NETtalk: A parallel network that learns to read
-                 aloud}",
-  type =         "Technical Report 86-01",
-  institution =  "Department of Electrical Engineering and Computer
-                 Science, Johns Hopkins University, Baltimore, MD.",
-  year =         "1986",
-}
-
-@Article{Sejnowski86,
-  author =       "T. J. Sejnowski and P. K. Kienker and G. Hinton",
-  title =        "Learning Symmetry Groups with Hidden Units: Beyond the
-                 Perceptron",
-  journal =      physicaD,
-  volume =       "22",
-  pages =        "260--275",
-  year =         "1986",
-}
-
-@Article{Sejnowski87,
-  author =       "T. J. Sejnowski and C. R. Rosenberg",
-  title =        "Parallel Networks that Learn to Pronounce English
-                 Text",
-  journal =      cs,
-  volume =       "1",
-  pages =        "145--168",
-  year =         "1987",
-}
-
-@InProceedings{Seneff84,
-  author =       "S. Seneff",
-  booktitle =    icassp,
-  title =        "Pitch and spectral estimation of speech based on an
-                 auditory synchrony model",
-  pages =        "",
-  year =         "1984",
-}
-
-@TechReport{Seneff85,
-  author =       "S. Seneff",
-  title =        "Pitch and spectral estimation of speech based on an
-                 auditory synchrony model",
-  number =       "RLE Technical report no. 504",
-  institution =  "LRE",
-  address =      "Cambridge, MA: MIT Press",
-  year =         "1985",
-}
-
-@InProceedings{Seneff86,
-  author =       "S. Seneff",
-  booktitle =    icassp,
-  title =        "A computational model for the peripheral auditory
-                 system: application to speech recognition research",
-  pages =        "1983--1986",
-  year =         "1986",
-}
-
-@Article{Seneff88,
-  author =       "S. Seneff",
-  title =        "A joint synchrony/mean-rate model of auditory speech
-                 processing",
-  journal =      "Journal of Phonetics",
-  volume =       "16",
-  pages =        "55--76",
-  year =         "1988",
-}
-
-@Book{Seneta-81,
-  author =       "E. Seneta",
-  title =        "Nonnegative Matrices and {Markov} Chains",
-  publisher =    "Springer",
-  address =      "New York",
-  year =         "1981",
-}
-
-@Article{senseval-2000,
-  author =       "Adam Kilgarrif and Joseph Rosenzweig",
-  title =        "Framework and results for English {SENSEVAL}",
-  journal =      "Computers and the Humanities: special issue on
-                 {SENSEVAL}",
-  volume =       "34",
-  pages =        "15--48",
-  year =         "2000",
-}
-
-@Article{Serbedzija-1996,
-  author =       "Nikola B. {\v{S}}erbed{\v{z}}ija",
-  title =        "Simulating Artificial Neural Networks on Parallel
-                 Architectures",
-  journal =      "Computer",
-  volume =       "29",
-  number =       "3",
-  publisher =    "IEEE Computer Society Press",
-  address =      "Los Alamitos, CA, USA",
-  pages =        "56--63",
-  year =         "1996",
-  ISSN =         "0018-9162",
-  doi =          "http://dx.doi.org/10.1109/2.485893",
-}
-
-@Article{Serre2007,
-  author =       "T. Serre and G. Kreiman and M. Kouh and C. Cadieu and
-                 U. Knoblich and T. Poggio",
-  title =        "A quantitative theory of immediate visual
-                 recognition",
-  journal =      "Progress in Brain Research, Computational
-                 Neuroscience: Theoretical Insights into Brain
-                 Function",
-  volume =       "165",
-  pages =        "33--56",
-  year =         "2007",
-}
-
-@article{Serre2007-small,
-  author  = {T. Serre and G. Kreiman and M. Kouh and C. Cadieu and U. Knoblich and T. Poggio},
-  title   = {A quantitative theory of immediate visual recognition},
-  journal = {Progress in Brain Res., Comput. Neurosc.},
-  volume  = {165},
-  pages   = {33--56},
-  year    = {2007},
-}
-
-@article{Serre-Wolf-2007,
-  author = {Thomas Serre and Lior Wolf and Stanley Bileschi and Maximilian Riesenhuber and Tomaso Poggio},
-  title = {Robust Object Recognition with Cortex-Like Mechanisms},
-  journal = {IEEE Trans. Pattern Anal. Mach. Intell.},
-  volume = {29},
-  number = {3},
-  year = {2007},
-  issn = {0162-8828},
-  pages = {411--426},
-  doi = {10.1109/TPAMI.2007.56},
-  publisher = {IEEE Computer Society},
-  address = {Washington, DC, USA},
-}
-
-
-@INPROCEEDINGS{SeungS1998,
-    author = {H. Sebastian Seung},
-    title = {Learning continuous attractors in recurrent networks},
-    editor =       NIPS10ed,
-    booktitle =    NIPS10,
-    year = {1998},
-    pages = {654--660},
-    publisher = {MIT Press}
-}
-
-@INPROCEEDINGS{Jain-Seung-08,
-    author = {Viren Jain and H. Sebastian Seung},
-    title = {Natural Image Denoising with Convolutional Networks},
-    editor =       NIPS21ed,
-    booktitle =    NIPS21,
-    year = {2008},
-}
-
-@inproceedings{Sha+Saul-2005,
-    author = {Fei Sha and Lawrence K. Saul},
-    title = {Analysis and extension of spectral methods for nonlinear dimensionality reduction},
-    booktitle = {Proceedings of the 22nd International Conference on Machine Learning},
-    year = {2005},
-    isbn = {1-59593-180-5},
-    pages = {784--791},
-    location = {Bonn, Germany},
-    doi = {10.1145/1102351.1102450},
-    publisher = {ACM},
-    address = {New York, NY},
-}
-
-@article{Shannon-1949,
-  author  = "C. E. Shannon",
-  title   = "Communication in the presence of noise",
-  journal = "{Proceedings of the Institute of Radio Engineers}",
-  volume  = "37",
-  number  = "1",
-  pages   = "10--21",
-  year    = "1949",
-}
-
-@article{shapiro00lift,
-  author    = {Gregory Piatetsky-Shapiro and Sam Steingold},
-  title     = {Measuring lift quality in database marketing},
-  journal   = {SIGKDD Explor. Newsl.},
-  volume    = {2},
-  number    = {2},
-  pages     = {76--80},
-  year      = {2000},
-  publisher = {ACM Press},
-  address   = {New York, NY, USA},
-  issn      = {1931-0145},
-}
-
-@InProceedings{shardanand95,
-  author =       "Upendra Shardanand and Pattie Maes",
-  booktitle =    "CHI '95: Proceedings of the SIGCHI conference on Human
-                 factors in computing systems",
-  title =        "Social information filtering: algorithms for
-                 automating ``word of mouth''",
-  publisher =    "ACM Press/Addison-Wesley Publishing Co.",
-  pages =        "210--217",
-  year =         "1995",
-  location =     "Denver, Colorado, United States",
-}
-
-@article{Sharma-2000,
-  author  = "J. Sharma and A. Angelucci and M. Sur",
-  title   = "Induction of Visual Orientation Modules in Auditory Cortex",
-  journal = "Nature",
-  volume  = "404",
-  pages   = "841--847",
-  year    = "2000",
-}
-
-@article{Sharpe-64,
-  author  = {W. F. Sharpe},
-  title   = {Capital Asset Prices: {A} Theory of Market Equilibrium under Conditions of Risk},
-  journal = {Journal of Finance},
-  volume  = {19},
-  pages   = {425--442},
-  year    = {1964},
-}
-
-@article{Sharpe-66,
-  author  = {W. F. Sharpe},
-  title   = {Mutual Fund Performance},
-  journal = {Journal of Business},
-  volume  = {39},
-  number  = {1},
-  pages   = {119--138},
-  year    = {1966},
-}
-
-@InProceedings{Shaw+Jebara-2007,
-  author =       "Blake Shaw and Tony Jebara",
-  booktitle =    aistats07,
-  title =        "Minimum Volume Embedding",
-  publisher =    "Omnipress",
-  date =         "March 21-24, 2007",
-  address =      "San Juan, Puerto Rico",
-  year =         "2007",
-}
-
-@inproceedings{Shawe-Taylor+Cristianini+Kandola-2002,
-  author    = {J. Shawe-Taylor and N. Cristianini and J. Kandola},
-  title     = {On the concentration of spectral properties},
-  editor    = NIPS14ed,
-  booktitle = NIPS14,
-  publisher = {{MIT} Press},
-  year      = {2002},
-}
-
-@inproceedings{Shawe-Taylor+Williams-2003,
-  author    = {J. Shawe-Taylor and C. K. I. Williams},
-  title     = {The Stability of Kernel Principal Components Analysis and its Relation to the Process Eigenspectrum},
-  editor    = NIPS15ed,
-  booktitle = NIPS15,
-  publisher = {{MIT} Press},
-  year      = {2003},
-}
-
-@article{Shawe-Taylor98,
-  author  = {John Shawe-Taylor and Peter Bartlett and Robert Williamson and Martin Anthony},
-  title   = {Structural Risk Minimization over Data-Dependent Hierarchies},
-  journal = {IEEE Transactions on Information Theory},
-  volume  = {44},
-  number  = {5},
-  pages   = {1926--1940},
-  year    = {1998},
-}
-
-@article{Sherrington75,
-  author  = {D. Sherrington and S. Kirkpatrick},
-  title   = {Solvable Model of a Spin Glass},
-  journal = prl,
-  volume  = {35},
-  pages   = {1792--1796},
-  year    = {1975},
-}
-
-@Article{Shi+Malik-2000,
-  author =       "Jianbo Shi and Jitendra Malik",
-  title =        "Normalized Cuts and Image Segmentation",
-  journal =      "IEEE Transactions on Pattern Analysis and Machine
-                 Intelligence (PAMI)",
-  volume =       "22",
-  number =       "8",
-  pages =        "888--905",
-  year =         "2000",
-}
-
-@inproceedings{Shi+Malik-97,
-  author    = {J. Shi and J. Malik},
-  title     = {Normalized cuts and image segmentation},
-  booktitle = cvpr97,
-  pages     = {731--737},
-  year      = {1997},
-}
-
-@inproceedings{Shimohara88,
-  author    = {K. Shimohara and T. Uchiyama and Y. Tokunaga},
-  title     = {Back-Propagation Networks for Event-Driven Temporal Sequence Processing},
-  booktitle = icnn,
-  volume    = {1},
-  pages     = {665--672},
-  publisher = {IEEE, New York},
-  address   = {San Diego 1988},
-  year      = {1988},
-}
-
-@inproceedings{Shimohata+al-1997,
-  author    = {Sayori Shimohata and Toshiyuki Sugio and Junji Nagata},
-  title     = {Retrieving Collocations by Co-occurrences and Word Order Constraints},
-  booktitle = {Proceedings of the 35th Conference of the Association for Computational Linguistics},
-  address   = {Madrid},
-  pages     = {476--481},
-  year      = {1997},
-}
-
-@inproceedings{shin:1991,
-  author   = "Yoan Shin and Joydeep Ghosh",
-  title    = "The Pi-Sigma Network: An Efficient Higher-Order Neural Network for Pattern Classification and Function Approximation",
-  crossref = "IJCNN:1991",
-}
-@proceedings{IJCNN:1991,
-  title     = "International Joint Conference on Neural Networks ({IJCNN})",
-  booktitle = ijcnn,
-  address   = "Seattle, Washington, USA",
-  year      = "1991",
-}
-
-@article{ShmulevichI2002,
-  author  = "Ilya Shmulevich and Wei Zhang",
-  title   = "Binary analysis and optimization-based normalization of gene expression data",
-  journal = "Bioinformatics",
-  volume  = "18",
-  number  = "4",
-  pages   = "555--565",
-  year    = "2002",
-}
-
-@article{short81optimal,
-  author  = {R. D. Short and K. Fukunaga},
-  title   = {The optimal distance measure for nearest neighbor classification},
-  journal = {IEEE Transactions on Information Theory},
-  volume  = {27},
-  pages   = {622--627},
-  year    = {1981},
-}
-
-@inproceedings{ShrikiO2001,
-  author    = {Oren Shriki and Haim Sompolinsky and Daniel D. Lee},
-  title     = {An Information Maximization Approach to Overcomplete and Recurrent Representations},
-  editor    = NIPS13ed,
-  booktitle = NIPS13,
-  publisher = {{MIT} Press},
-  pages     = {933--938},
-  year      = {2001},
-}
-
-@inproceedings{ShrikiO2001-small,
-  author    = {Oren Shriki and Haim Sompolinsky and Daniel D. Lee},
-  title     = {An Information Maximization Approach to Overcomplete and Recurrent Representations},
-  booktitle = {NIPS 13},
-  year      = {2001},
-}
-
-@article{Shumway82,
-  author  = {R. H. Shumway and D. S. Stoffer},
-  title   = {An approach to time series smoothing and forecasting using the {EM} algorithm},
-  journal = {Journal of Time Series Analysis},
-  volume  = {3},
-  number  = {4},
-  pages   = {253--264},
-  year    = {1982},
-}
-
-@article{Shumway91,
-  author  = {R. H. Shumway and D. S. Stoffer},
-  title   = {Dynamic linear models with switching},
-  journal = {J. Amer. Stat. Assoc.},
-  volume  = {86},
-  pages   = {763--769},
-  year    = {1991},
-}
-
-@Article{Sichel91,
-  author =       "D. E. Sichel",
-  title =        "Business cycle duration dependence: a parametric
-                 approach",
-  journal =      "Review of Economics and Statistics",
-  volume =       "73",
-  pages =        "254--260",
-  year =         "1991",
-}
-
-@TechReport{Siegelmann92,
-  author =       "H. T. Siegelmann and E. D. Sontag",
-  title =        "Neural Networks with Real Weights: Analog Computational
-                 Complexity",
-  number =       "SYCON-92-05",
-  institution =  "Rutgers Center for Systems and Control",
-  address =      "New Brunswick, NJ",
-  month =        sep,
-  year =         "1992",
-}
-
-@inproceedings{Sietsma88,
-  author    = {J. Sietsma and R. J. F. Dow},
-  title     = {Neural Net Pruning---Why and How},
-  booktitle = icnn,
-  volume    = {1},
-  pages     = {325--333},
-  publisher = {IEEE, New York},
-  address   = {San Diego 1988},
-  year      = {1988},
-}
-
-@inproceedings{silver95,
-  author    = {Daniel L. Silver and Robert E. Mercer},
-  title     = {Toward a Model of Consolidation: The Retention and Transfer of Neural Net Task Knowledge},
-  booktitle = {Proceedings of the INNS World Congress on Neural Networks},
-  volume    = {3},
-  pages     = {164--169},
-  address   = {Washington, DC},
-  month     = jul,
-  year      = {1995},
-}
-
-@article{silver96,
-  author  = {Daniel L. Silver and Robert E. Mercer},
-  title   = {The Parallel Transfer of Task Knowledge Using Dynamic Learning Rates Based on a Measure of Relatedness},
-  journal = {Connection Science, Special issue on Transfer in Inductive Systems},
-  volume  = {8},
-  number  = {2},
-  pages   = {277--294},
-  year    = {1996},
-}
-
-@techreport{silver97,
-  author      = {Daniel L. Silver and Robert E. Mercer and Gilbert A. Hurwitz},
-  title       = {The Functional Transfer of Knowledge for Coronary Artery Disease Diagnosis},
-  institution = {Department of Computer Science, University of Western Ontario},
-  number      = {513},
-  month       = jan,
-  year        = {1997},
-}
-
-@InCollection{Silverman-encyc86,
-  author =       "B. W. Silverman",
-  editor =       "N. L. Johnson and S. Kotz",
-  booktitle =    "Encyclopedia of Statistical Sciences",
-  title =        "Penalized Likelihood",
-  volume =       "6",
-  publisher =    "Wiley",
-  address =      "New York",
-  pages =        "664--667",
-  year =         "1986",
-}
-
-@book{Silverman86,
-  author    = {Bernard W. Silverman},
-  title     = {Density Estimation for Statistics and Data Analysis},
-  publisher = {Chapman and Hall},
-  address   = {London},
-  year      = {1986},
-}
-
-@inproceedings{Silverman88,
-  author    = {R. H. Silverman and A. S. Noetzel},
-  title     = {Time-Sequential Self-Organization of Hierarchical Neural Networks},
-  editor    = nips87ed,
-  booktitle = nips87,
-  publisher = {American Institute of Physics, New York},
-  address   = {Denver, CO},
-  pages     = {709--714},
-  year      = {1988},
-}
-
-@InProceedings{simard-03,
-  author =       "P. Y. Simard and D. Steinkraus and J. C. Platt",
-  booktitle =    ICDAR03,
-  title =        "Best Practices for Convolutional Neural Networks
-                 Applied to Visual Document Analysis",
-  year =         "2003",
-  isbn =         {0-7695-1960-1},
-  pages =        {958},
-  publisher =    {IEEE Computer Society},
-  address =      {Washington, DC, USA},
-  doi =          "10.1109/ICDAR.2003.1227801",
-}
-
-@InProceedings{Simard89,
-  author =       "P. Y. Simard and M. B. Ottaway and D. H. Ballard",
-  editor =       "D. Touretzky and G. Hinton and T. Sejnowski",
-  booktitle =    cmss88,
-  title =        "Analysis of Recurrent Backpropagation",
-  publisher =    "Morgan Kaufmann, San Mateo",
-  address =      "Pittsburgh 1988",
-  pages =        "103--112",
-  year =         "1989",
-}
-
-@InProceedings{Simard92,
-  author =       "Patrice Simard and Bernard Victorri and Yann LeCun
-                 and John Denker",
-  editor =       NIPS4ed,
-  booktitle =    NIPS4,
-  title =        "Tangent Prop --- {A} formalism for specifying selected
-                 invariances in an adaptive network",
-  publisher =    "Morgan Kaufmann",
-  address =      "San Mateo, CA",
-  pages =        "895--903",
-  year =         "1992",
-}
-
-@inproceedings{Simard93,
-  author    = {P. Y. Simard and Y. {LeCun} and J. Denker},
-  title     = {Efficient pattern recognition using a new transformation distance},
-  editor    = NIPS5ed,
-  booktitle = NIPS5,
-  publisher = {Morgan Kaufmann, San Mateo},
-  pages     = {50--58},
-  year      = {1993},
-}
-
-@Article{Simard98,
-  author =       "P. Y. Simard and Y. A. {LeCun} and J. S. Denker and B.
-                 Victorri",
-  title =        "Transformation Invariance in Pattern Recognition ---
-                 Tangent Distance and Tangent Propagation",
-  journal =      "Lecture Notes in Computer Science",
-  volume =       "1524",
-  pages =        "239--274",
-  year =         "1998",
-  CODEN =        "LNCSD9",
-  ISSN =         "0302-9743",
-  bibdate =      "Tue Jan 5 08:21:58 1999",
-  acknowledgement = ack-nhfb,
-}
-
-@inproceedings{Simard-nips92,
-  author    = {P. Simard and Y. {LeCun}},
-  title     = {Reverse {TDNN}: An Architecture for Trajectory Generation},
-  editor    = NIPS4ed,
-  booktitle = NIPS4,
-  publisher = {Morgan Kaufmann, San Mateo},
-  address   = {Denver, CO},
-  pages     = {579--588},
-  year      = {1992},
-}
-
-@phdthesis{Simard-PhD,
-  author  = {P. Y. Simard},
-  title   = {Learning State Space Dynamics in Recurrent Networks},
-  school  = {University of Rochester},
-  address = {Rochester, NY},
-  note    = {Tech. Rep. 383},
-  year    = {1991},
-}
-
-@article{Simic90,
-  author  = {P. D. Simic},
-  title   = {Statistical Mechanics As the Underlying Theory of ``Elastic'' and ``Neural'' Optimizations},
-  journal = network,
-  volume  = {1},
-  pages   = {89--103},
-  year    = {1990},
-}
-
-@article{Simoncelli+al-1992,
-    author = "Eero P. Simoncelli and William T. Freeman and Edward H. Adelson and David J. Heeger",
-    title = "Shiftable Multi-scale Transforms",
-    journal = "IEEE Transactions on Information Theory",
-    volume = "38",
-    number = "2",
-    pages = "587--607",
-    year = "1992",
-    publisher = "The IEEE Computer Society",
-}
-
-@inproceedings{Simoncelli97,
-  author    = {E. P. Simoncelli},
-  title     = {Statistical Models for Images: Compression, Restoration and Synthesis},
-  booktitle = {Proc. 31st Asilomar Conference on Signals, Systems and Computers},
-  publisher = {IEEE},
-  year      = {1997},
-}
-
-@InProceedings{Simoncelli99,
-  author =       "E. P. Simoncelli",
-  booktitle =    "Proc. SPIE, 44th Annual Meeting",
-  title =        "Modeling the Joint Statistics of Images in the Wavelet
-                 Domain",
-  volume =       "3813",
-  publisher =    "SPIE",
-  year =         "1999",
-}
-
-@article{Sinex+Geisler83,
-  author  = {D. G. Sinex and C. D. Geisler},
-  title   = {Response of auditory nerve fibers to consonant-vowel syllables},
-  journal = jasa,
-  volume  = {73},
-  number  = {2},
-  pages   = {602--615},
-  year    = {1983},
-}
-
-@article{Singer,
-  author  = {A. Singer},
-  title   = {Implementations of Artificial Neural Networks on the Connection Machine},
-  journal = {Parallel Computing},
-  volume  = {14},
-  pages   = {305--315},
-  year    = {1990},
-  OPTnote = {},
-}
-
-@inproceedings{Singer-1990,
-  author    = {Alexander Singer},
-  title     = {Exploiting the Inherent Parallelism of Artificial Neural Networks to Achieve 1300 Million Interconnects per Second},
-  booktitle = {Proceedings of the International Neural Networks Conference},
-  pages     = {656--660},
-  year      = {1990},
-}
-
-@inproceedings{singer00leveraged,
-  author    = {Y. Singer},
-  title     = {Leveraged vector machines},
-  editor    = NIPS12ed,
-  booktitle = NIPS12,
-  publisher = {MIT Press},
-  address   = {Cambridge, MA},
-  pages     = {610--616},
-  year      = {2000},
-}
-
-@InProceedings{Singer96,
-  author =       "Y. Singer",
-  editor =       NIPS8ed,
-  booktitle =    NIPS8,
-  title =        "Adaptive Mixtures of Probabilistic Transducers",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  year =         "1996",
-}
-
-@article{Singer97,
-  author  = {Y. Singer},
-  title   = {Adaptive Mixtures of Probabilistic Transducers},
-  journal = {Neural Computation},
-  volume  = {9},
-  number  = {8},
-  year    = {1997},
-}
-
-@InProceedings{singer:1996:nips,
-  author =       "Y. Singer",
-  editor =       NIPS8ed,
-  booktitle =    NIPS8,
-  title =        "Adaptive Mixtures of Probabilistic Transducers",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  year =         "1996",
-}
-
-@inproceedings{Singh92,
-  author    = {S. P. Singh},
-  title     = {Reinforcement learning with a hierarchy of abstract models},
-  booktitle = {Proceedings of the 10th National Conference on Artificial Intelligence},
-  publisher = {MIT/AAAI Press},
-  pages     = {202--207},
-  year      = {1992},
-}
-
-@inproceedings{SinkkonenJ2002,
-  author    = {Janne Sinkkonen and Samuel Kaski and Janne Nikkil{\"{a}}},
-  title     = {Discriminative Clustering: Optimal Contingency Tables by Learning Metrics},
-  booktitle = ECML02,
-  publisher = {Springer-Verlag},
-  address   = {London, UK},
-  pages     = {418--430},
-  year      = {2002},
-  isbn      = {3-540-44036-4},
-}
-
-@TechReport{Sirat90,
-  author =       "J.-A. Sirat and J.-P. Nadal",
-  title =        "Neural Trees: {A} New Tool for Classification",
-  type =         "Preprint",
-  institution =  "Laboratoires d'Electronique Philips",
-  address =      "Limeil-Br{\'e}vannes, France",
-  year =         "1990",
-}
-
-@inproceedings{SiroshJ1994,
-  author    = {Joseph Sirosh and Risto Miikkulainen},
-  title     = {Ocular Dominance and Patterned Lateral Connections in a Self-Organizing Model of the Primary Visual Cortex},
-  editor    = NIPS6ed,
-  booktitle = NIPS6,
-  publisher = {Morgan Kaufmann},
-  pages     = {109--116},
-  year      = {1994},
-}
-
-@inproceedings{SiroshJ1994-small,
-  author    = {J. Sirosh and R. Miikkulainen},
-  title     = {Ocular Dominance and Patterned Lateral Connections in a Self-Organizing Model of the Primary Visual Cortex},
-  booktitle = {NIPS 6},
-  year      = {1994},
-}
-
-@inproceedings{Sivilotti87,
-  author    = {M. A. Sivilotti and M. A. Mahowald and C. A. Mead},
-  title     = {Real-Time Visual Computations Using Analog {CMOS} Processing Arrays},
-  editor    = {P. Losleben},
-  booktitle = {Advanced Research in VLSI: Proceedings of the 1987 Stanford Conference},
-  publisher = {MIT Press, Cambridge},
-  pages     = {295--312},
-  year      = {1987},
-}
-
-@TechReport{Sjoberg92,
-  author =       "Jonas Sj{\"o}berg and Lennart Ljung",
-  title =        "Overtraining, Regularization, and Searching for
-                 Minimum in Neural Networks",
-  institution =  "Link{\"o}ping University",
-  address =      "S-581 83 Link{\"o}ping, Sweden",
-  year =         "1992",
-}
-
-@article{Sjoberg95,
-  title={Overtraining, regularization and searching for a minimum, with application to neural networks},
-  author={Sj{\"o}berg, J. and Ljung, L.},
-  journal={International Journal of Control},
-  volume={62},
-  number={6},
-  pages={1391--1407},
-  year={1995},
-  publisher={Taylor \& Francis}
-}
-
-@article{Skinner1958,
-  author  = {Burrhus F. Skinner},
-  title   = {Reinforcement Today},
-  journal = {American Psychologist},
-  volume  = {13},
-  pages   = {94--99},
-  year    = {1958},
-}
-
-@phdthesis{Small1980,
-  author = {Steven L. Small},
-  title  = {Word Expert Parsing: {A} Theory of Distributed Word-Based Natural Language Understanding},
-  school = {University of Maryland},
-  year   = {1980},
-}
-
-@article{smilde97,
-  author  = {A. K. Smilde},
-  title   = {Comments on multilinear {PLS}},
-  journal = {Journal of Chemometrics},
-  volume  = {11},
-  pages   = {367--377},
-  year    = {1997},
-}
-
-@Article{Smith+Waterman81,
-  author =       "T. F. Smith and M. S. Waterman",
-  title =        "Identification of common molecular subsequences",
-  journal =      "Journal of Molecular Biology",
-  volume =       "147",
-  pages =        "195--197",
-  year =         "1981",
-}
-
-@article{Smith95,
-  author  = {S. P. Smith},
-  title   = {Differentiation of the Cholesky algorithm},
-  journal = {Journal of Computational and Graphical Statistics},
-  volume  = {4},
-  pages   = {134--147},
-  year    = {1995},
-}
-
-@inproceedings{smola00sparsegreedy,
-  author    = {A. J. Smola and B. Sch{\"o}lkopf},
-  title     = {Sparse greedy matrix approximation for machine learning},
-  editor    = {P. Langley},
-  booktitle = {International Conference on Machine Learning},
-  publisher = {Morgan Kaufmann},
-  address   = {San Francisco},
-  pages     = {911--918},
-  year      = {2000},
-}
-
-@inproceedings{Smola2000sparsegreedy,
-  author    = {A. J. Smola and P. Bartlett},
-  title     = {Sparse Greedy {G}aussian Process Regression},
-  editor    = NIPS13ed,
-  booktitle = NIPS13,
-  year      = {2001},
-}
-
-@InProceedings{Smola99semiparametricSVM,
-  author =       "A. J. Smola and T. Friess and B. Sch{\"o}lkopf",
-  editor =       NIPS11ed,
-  booktitle =    NIPS11,
-  title =        "Semiparametric Support Vector and Linear Programming
-                 Machines",
-  publisher =    "MIT Press",
-  pages =        "585--591",
-  year =         "1999",
-  OPTaddress =   "Cambridge, MA",
-}
-
-@incollection{Smolensky86,
-  author    = {Paul Smolensky},
-  title     = {Information Processing in Dynamical Systems: Foundations of Harmony Theory},
-  editor    = {D. E. Rumelhart and J. L. McClelland},
-  booktitle = pdp,
-  volume    = {1},
-  chapter   = {6},
-  pages     = {194--281},
-  publisher = {MIT Press},
-  address   = {Cambridge},
-  year      = {1986},
-}
-
-@Article{Smyth94,
-  author =       "P. Smyth",
-  title =        {Hidden {Markov} models for fault detection in dynamic
-                 systems},
-  journal =      "Pattern Recognition",
-  volume =       "27",
-  number =       "1",
-  pages =        "149--164",
-  year =         "1994",
-}
-
-@Article{Smyth97,
-  author =       "P. Smyth and D. Heckerman and M. I. Jordan",
-  title =        {Probabilistic independence networks for hidden {Markov}
-                 probability models},
-  journal =      "Neural Computation",
-  volume =       "9",
-  number =       "2",
-  pages =        "227--269",
-  year =         "1997",
-}
-
-@InProceedings{Smyth97-nips,
-  author =       "P. Smyth",
-  editor =       NIPS9ed,
-  booktitle =    NIPS9,
-  title =        {Clustering sequences with hidden {Markov} models},
-  publisher =    "MIT Press",
-  year =         "1997",
-}
-
-@Article{Smyth98,
-  author =       "P. Smyth",
-  title =        {Belief Networks, Hidden {Markov} Models, and {Markov}
-                 Random Fields: a Unifying View},
-  journal =      "Pattern Recognition Letters",
-  year =         "1998",
-}
-
-@techreport{Snapp+Venkatesh-1998,
-  author      = {Robert R. Snapp and Santosh S. Venkatesh},
-  title       = {Asymptotic derivation of the finite-sample risk of the k nearest neighbor classifier},
-  institution = {Department of Computer Science, University of Vermont},
-  number      = {UVM-CS-1998-0101},
-  year        = {1998},
-}
-
-@incollection{SNE-nips15,
-  author    = {G. E. Hinton and S. Roweis},
-  title     = {Stochastic Neighbor Embedding},
-  editor    = NIPS15ed,
-  booktitle = NIPS15,
-  publisher = {MIT Press},
-  address   = {Cambridge, MA},
-  year      = {2003},
-}
-
-@inproceedings{Snow+al-2006,
-  author    = {Rion Snow and Daniel Jurafsky and Andrew Y. Ng},
-  title     = {Semantic taxonomy induction from heterogenous evidence},
-  booktitle = {Proceedings of COLING/ACL 2006},
-  year      = {2006},
-}
-
-@book{SocietyNeuro-2006,
-  author  = {{Society for Neuroscience}},
-  title   = {Brain Facts: A Primer on the Brain and Nervous System},
-  edition = {Fifth},
-  note    = {{http://sfn.org}},
-  year    = {2006},
-}
-
-@article{Soffer86,
-  author  = {B. H. Soffer and G. J. Dunning and Y. Owechko and E. Marom},
-  title   = {Associative Holographic Memory with Feedback Using Phase-Conjugate Mirrors},
-  journal = optlett,
-  volume  = {11},
-  pages   = {118--120},
-  year    = {1986},
-}
-
-@article{Sola94,
-  author  = {M. Sola and J. Driffill},
-  title   = {Testing the term structure of interest rates using a stationary vector autoregression with regime switching},
-  journal = {Journal of Economic Dynamics and Control},
-  volume  = {18},
-  pages   = {601--628},
-  year    = {1994},
-}
-
-@article{Solla88,
-  author  = {S. A. Solla and E. Levin and M. Fleisher},
-  title   = {Accelerated Learning in Layered Neural Networks},
-  journal = cs,
-  volume  = {2},
-  pages   = {625--639},
-  year    = {1988},
-}
-
-@inproceedings{Solla89,
-  author    = {S. A. Solla},
-  title     = {Learning and Generalization in Layered Neural Networks: The Contiguity Problem},
-  editor    = {L. Personnaz and G. Dreyfus},
-  booktitle = {Neural Networks from Models to Applications},
-  publisher = {I.D.S.E.T., Paris},
-  address   = {Paris 1988},
-  pages     = {168--177},
-  year      = {1989},
-}
-
-@article{Solomonoff64,
-  author  = {Ray J. Solomonoff},
-  title   = {A formal theory of inductive inference},
-  journal = {Information and Control},
-  volume  = {7},
-  pages   = {1--22, 224--254},
-  year    = {1964},
-}
-
-@article{Sompolinsky86,
-  author  = {H. Sompolinsky and I. Kanter},
-  title   = {Temporal Association in Asymmetric Neural Networks},
-  journal = prl,
-  volume  = {57},
-  pages   = {2861--2864},
-  year    = {1986},
-}
-
-@InProceedings{Sompolinsky87,
-  author =       "H. Sompolinsky",
-  editor =       "J. L. van Hemmen and I. Morgenstern",
-  booktitle =    "Heidelberg Colloquium on Glassy Dynamics",
-  title =        "The Theory of Neural Networks: The {Hebb} Rules and
-                 Beyond",
-  publisher =    "Springer-Verlag, Berlin",
-  address =      "Heidelberg 1986",
-  pages =        "485--527",
-  year =         "1987",
-}
-
-@article{Sompolinsky88,
-  author  = {H. Sompolinsky and A. Crisanti and H. J. Sommers},
-  title   = {Chaos in Random Neural Networks},
-  journal = prl,
-  volume  = {61},
-  pages   = {259--262},
-  year    = {1988},
-}
-
-@Article{Sondik73,
-  author =       "R. D. Smallwood and E. J. Sondik",
-  title =        "The optimal control of partially observable {Markov}
-                 processes over the finite horizon",
-  journal =      "Operations Research",
-  volume =       "21",
-  pages =        "1071--1088",
-  year =         "1973",
-}
-
-@Article{Sondik78,
-  author =       "E. J. Sondik",
-  title =        "The optimal control of partially observable {Markov}
-                 processes over the infinite horizon: discounted case",
-  journal =      "Operations Research",
-  volume =       "26",
-  pages =        "282--304",
-  year =         "1978",
-}
-
-@article{Song+al-2008a,
-    author = {Yangqiu Song and Feiping Nie and Changshui Zhang},
-    title = {Semi-Supervised Sub-Manifold Discriminant Analysis},
-    journal = {Pattern Recognition Letters},
-    year = 2008,
-}
-
-@article{Song+al-2008b,
-  author  = "Yangqiu Song and Feiping Nie and Changshui Zhang and Shiming Xiang",
-  title   = "A Unified Framework for Semi-Supervised Dimensionality Reduction",
-  journal = "Pattern Recognition",
-  volume  = "41",
-  number  = "9",
-  pages   = "2789--2799",
-  year    = "2008",
-}
-
-@incollection{Song+al-2008c,
-  author    = "Le Song and Alex Smola and Karsten Borgwardt and Arthur Gretton",
-  title     = "Colored Maximum Variance Unfolding",
-  editor    = NIPS20ed,
-  booktitle = NIPS20,
-  publisher = "MIT Press",
-  address   = "Cambridge, MA",
-  pages     = "1385--1392",
-  year      = "2008",
-}
-
-@Article{Sontag-cs89,
-  author =       "E. D. Sontag and H. J. Sussmann",
-  title =        "Backpropagation Can Give Rise to Spurious Local Minima
-                 Even for Networks without Hidden Layers",
-  journal =      "Complex Systems",
-  volume =       "3",
-  pages =        "91--106",
-  year =         "1989",
-}
-
-@InProceedings{Sontag-ijcnn89,
-  author =       "E. D. Sontag and H. J. Sussmann",
-  booktitle =    ijcnn,
-  title =        "Backpropagation Separates when Perceptrons Do",
-  publisher =    "IEEE Press",
-  address =      "Washington DC",
-  year =         "1989",
-  OPTpages =     "639--642",
-}
-
-@techreport{sontag92t1,
-  author      = {E. D. Sontag},
-  title       = {Systems Combining Linearity and Saturations and Relations to Neural Networks},
-  institution = {Rutgers Center for Systems and Control},
-  number      = {SYCON--92--01},
-  year        = {1992},
-}
-
-@Article{Soukoulis83,
-  author =       "C. M. Soukoulis and K. Levin and G. S. Grest",
-  title =        "Irreversibility and Metastability in Spin-Glasses.
-                 {I}. {Ising} Model",
-  journal =      prB,
-  volume =       "28",
-  pages =        "1495--1509",
-  year =         "1983",
-}
-
-@article{Specht90,
-  author  = {D. F. Specht},
-  title   = {Probabilistic Neural Networks},
-  journal = nn,
-  volume  = {3},
-  pages   = {109--118},
-  year    = {1990},
-}
-
-@article{Specht91,
-  author  = {D. F. Specht},
-  title   = {A General Regression Neural Network},
-  journal = {IEEE Trans. Neural Networks},
-  volume  = {2},
-  number  = {6},
-  pages   = {568--576},
-  month   = nov,
-  year    = {1991},
-}
-
-@Article{Spiegelhalter93,
-  author =       "D. J. Spiegelhalter and A. P. Dawid and S. L.
-                 Lauritzen and R. G. Cowell",
-  title =        "Bayesian Analysis in Expert Systems",
-  journal =      "Statistical Science",
-  volume =       "8",
-  pages =        "219--283",
-  year =         "1993",
-}
-
-@InProceedings{Spielman-96,
-  author =       "D. Spielman and S. Teng",
-  booktitle =    "Proceedings of the 37th Annual Symposium on
-                 Foundations of Computer Science",
-  title =        "Spectral partitioning works: planar graphs and finite
-                 element meshes",
-  year =         "1996",
-}
-
-@TechReport{Spielman-96b,
-  author =       "Daniel A. Spielman and Shang-Hua Teng",
-  title =        "Spectral Partitioning Works: Planar Graphs and Finite
-                 Element Meshes",
-  number =       "UCB CSD-96-898",
-  institution =  "U.C. Berkeley",
-  year =         "1996",
-}
-
-@ARTICLE{spirkovska:1990,
-    author={Spirkovska, L. and Reid, M. B.},
-    title={Connectivity Strategies for Higher-Order Neural Networks Applied to
-        Pattern Recognition},
-    journal=ijcnn,
-    year={1990},
-    month={June},
-    volume={1},
-    number={},
-    pages={21--26},
-    keywords={computerised pattern recognition, neural netsconnection
-        strategies, higher-order neural networks, interconnections, pattern
-            recognition, pattern-recognition, regional connectivity},
-    doi={10.1109/IJCNN.1990.137538},
-    ISSN={}, 
-}
-
-
-@Book{Spirtes-book93,
-  author =       "P. Spirtes and C. Glymour and R. Scheines",
-  title =        "Causation, Prediction, and Search",
-  publisher =    "Springer-Verlag, New York",
-  year =         "1993",
-}
-
-@Article{Spirtes-Glymour91,
-  author =       "P. Spirtes and C. Glymour",
-  title =        "An algorithm for fast recovery of sparse causal
-                 graphs",
-  journal =      "Social Science Computing Reviews",
-  volume =       "9",
-  number =       "1",
-  pages =        "62--72",
-  year =         "1991",
-}
-
-@InProceedings{Srebro-Jaakkola,
-  author =       "N. Srebro and T. Jaakkola",
-  booktitle =    ICML03,
-  editor =       ICML03ed,
-  publisher =    ICML03publ,
-  title =        "Weighted Low-Rank Approximations",
-  address =      "Washington, D.C.",
-  pages =        "720--727",
-  year =         "2003",
-}
-
-@Book{SSL-Book-2006,
-  author =       "Olivier Chapelle and Bernhard. Sch{\"{o}}lkopf and Alexander Zien",
-  title =        "Semi-Supervised Learning",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  year =         "2006",
-}
-
-@Article{Steels2003,
-  author =       "L. Steels",
-  title =        "Evolving grounded communication for robots",
-  journal =      "Trends in Cognitive Science",
-  volume =       "7",
-  number =       "7",
-  pages =        "308--312",
-  month =        jul,
-  year =         "2003",
-  URL =          "http://www.csl.sony.fr/downloads/papers/2003/steels-03c.pdf",
-}
-
-@Article{Steinbuch61,
-  author =       "K. Steinbuch",
-  title =        "Die Lernmatrix",
-  journal =      kyb,
-  volume =       "1",
-  pages =        "36--45",
-  year =         "1961",
-}
-
-@Article{SteinhausH1956,
-  author = 	 {Hugo Steinhaus},
-  title = 	 {Sur la division des corps mat\'eriels en parties},
-  journal = 	 {Bulletin L'Acad\'emie Polonaise des Sciences},
-  year = 	 {1956},
-  volume = 	 {4},
-  pages = 	 {801-804},
-}
-
-@InCollection{Stevens+Blumstein81,
-  author =       "K. N. Stevens and S. E. Blumstein",
-  editor =       "P. D. Eimas and J. L. Miller",
-  booktitle =    "Perspectives on the study of speech",
-  title =        "The search for invariant acoustic correlates of
-                 phonetic features",
-  publisher =    "Lawrence Erlbaum ass.",
-  pages =        "1--38",
-  year =         "1981",
-}
-
-@InCollection{Stevens75,
-  author =       "K. N. Stevens",
-  editor =       "G. Fant and M. A. Tatham",
-  booktitle =    "Auditory analysis and perception of speech",
-  title =        "The potential role of properties detectors in the
-                 perception of consonants",
-  publisher =    "Academic Press, London",
-  pages =        "303--330",
-  year =         "1975",
-}
-
-@Article{Stevenson90,
-  author =       "M. Stevenson and R. Winter and B. Widrow",
-  title =        "Sensitivity of Feedforward Neural Networks to Weight
-                 Errors",
-  journal =      "IEEE. Trans. on Neural Networks",
-  volume =       "1",
-  number =       "1",
-  pages =        "71--80",
-  month =        mar,
-  year =         "1990",
-  keywords =     "neural network fault tolerance robustness reliability
-                 adaline weight errors",
-}
-
-@Book{Stewart-1998,
-  author =       "G. W. Stewart",
-  title =        "Matrix Algorithms, Volume {I}: Basic Decompositions",
-  publisher =    "SIAM",
-  address =      "Philadelphia",
-  year =         "1998",
-}
-
-@Book{Stewart73,
-  author =       "G. W. Stewart",
-  title =        "Introduction to matrix computations",
-  publisher =    "Academic Press",
-  year =         "1973",
-}
-
-@InProceedings{Stinchcombe+White89,
-  author =       "M. Stinchcombe and H. White",
-  booktitle =    ijcnn,
-  title =        "Universal approximation using feedforward networks
-                 with non-sigmoid hidden layer activation function",
-  publisher =    "IEEE",
-  address =      "Washington DC",
-  pages =        "613--617",
-  year =         "1989",
-}
-
-@TechReport{Stokbro90,
-  author =       "K. Stokbro and D. K. Umberger and J. A. Hertz",
-  title =        "Exploiting Neurons with Localized Receptive Fields to
-                 Learn Chaos",
-  type =         "Preprint",
-  number =       "90/28 S",
-  institution =  "Nordita",
-  address =      "Copenhagen, Denmark",
-  year =         "1990",
-}
-
-@InProceedings{Stolcke-ICSLP02,
-  author =       "A. Stolcke",
-  booktitle =    "Proceedings of the International Conference on
-                 Statistical Language Processing",
-  title =        "{SRILM} - An extensible language modeling toolkit",
-  address =      "Denver, Colorado",
-  year =         "2002",
-}
-
-@InProceedings{Stolcke93,
-  author =       "A. Stolcke and S. Omohundro",
-  editor =       NIPS5ed,
-  booktitle =    NIPS5,
-  title =        "Hidden {Markov} model induction by {Bayesian} model
-                 merging",
-  publisher =    "Morgan Kaufmann",
-  address =      "San Mateo, CA",
-  pages =        "11--18",
-  year =         "1993",
-}
-
-@TechReport{Stolcke94a,
-  author =       "A. Stolcke and S. M. Omohundro",
-  title =        "Best-first Model Merging for Hidden {Markov} Model
-                 Induction",
-  number =       "TR-94-003",
-  institution =  "International Computer Science Institute",
-  address =      "Berkeley, CA",
-  month =        jan,
-  year =         "1994",
-}
-
-@TechReport{Stolcke94b,
-  author =       "A. Stolcke and J. Segal",
-  title =        "Precise n-gram Probabilities from Stochastic
-                 Context-free Grammars",
-  number =       "TR-94-007",
-  institution =  "International Computer Science Institute",
-  address =      "Berkeley, CA",
-  month =        jan,
-  year =         "1994",
-}
-
-@Article{Stone-80,
-  author =       "C. J. Stone",
-  title =        "Optimal rates of convergence for nonparametric
-                 estimators",
-  journal =      "Annals of Statistics",
-  volume =       "8",
-  number =       "6",
-  pages =        "1348--1360",
-  year =         "1980",
-}
-
-@Article{Stormo82,
-  author =       "G. D. Stormo and T. D. Schneider and L. Gold and A.
-                 Ehrenfeucht",
-  title =        "Use of the perceptron algorithm to distinguish
-                 translational initiation sites in {\it {E}. {Coli}}",
-  journal =      "Nucleic Acid Research",
-  volume =       "10",
-  pages =        "2997--3010",
-  year =         "1982",
-}
-
-@InProceedings{Stornetta88,
-  author =       "W. S. Stornetta and T. Hogg and B. A. Huberman",
-  editor =       nips87ed,
-  booktitle =    nips87,
-  title =        "A Dynamical Approach to Temporal Pattern Processing",
-  publisher =    "American Institute of Physics, New York",
-  address =      "Denver, CO",
-  pages =        "750--759",
-  year =         "1988",
-}
-
-@Book{Strang80,
-  author =       "G. Strang",
-  title =        "Linear Algebra and Its Applications",
-  publisher =    "Academic Press",
-  address =      "New York",
-  year =         "1980",
-}
-
-@PhdThesis{Suaudeau94,
-  author =       "N. Suaudeau",
-  title =        "Un mod\`ele probabiliste pour int\'egrer la dimension
-                 temporelle dans un syst\`eme de reconnaissance
-                 automatique de la parole",
-  school =       "Universit\'e de Rennes I",
-  address =      "France",
-  year =         "1994",
-}
-
-@Article{suddarth91,
-  author =       "Steven C. Suddarth and Alistair D. C. Holden",
-  title =        "Symbolic-neural systems and the use of hints for
-                 developing complex systems",
-  journal =      "Int. J. Man-Mach. Stud.",
-  volume =       "35",
-  number =       "3",
-  publisher =    "Academic Press Ltd.",
-  address =      "London, UK",
-  pages =        "291--311",
-  year =         "1991",
-}
-
-@article{Sudderth-2007,
- author = {Erik B. Sudderth and Antonio Torralba and William T. Freeman and Alan S. Willsky},
- title = {Describing visual scenes using transformed objects and parts},
- journal = {Int. Journal of Computer Vision},
- volume = 77,
- publisher = {Springer},
- pages = "291--330",
- year = "2007",
-}
-
-@article{Sugiyama-2007,
-    author = {Masashi Sugiyama},
-    title = {Dimensionality reduction of multimodal labeled data by local {F}isher discriminant analysis},
-    journal = jmlr,
-    year = {2007},
-    volume = {8},
-    pages = {1027--1061}
-}
-
-@InProceedings{Sun-ijcnn90,
-  author =       "G. Z. Sun and H. H. Chen and Y. C. Lee and C. L
-                 Giles",
-  booktitle =    ijcnn,
-  title =        "Recurrent Neural Networks, Hidden {Markov} Models and
-                 Stochastic Grammars",
-  volume =       "I",
-  address =      "San Diego CA",
-  pages =        "729--734",
-  year =         "1990",
-}
-
-@Book{Sundararajan+Saratchandran-1998,
-  author =       "N. Sundararajan and P. Saratchandran",
-  title =        "Parallel Architectures for Artificial Neural Networks:
-                 Paradigms and Implementations",
-  publisher =    "IEEE Computer Society Press",
-  address =      "Los Alamitos, CA",
-  year =         "1998",
-  ISBN =         "0-8186-8399-6",
-}
-
-@InProceedings{Sutskever+Hinton-2007,
-  author =       "Ilya Sutskever and Geoffrey E. Hinton",
-  booktitle =    aistats07,
-  title =        "Learning Multilevel Distributed Representations for
-                 High-Dimensional Sequences",
-  publisher =    "Omnipress",
-  date =         "March 21-24, 2007",
-  address =      "San Juan, Porto Rico",
-  year =         "2007",
-}
-
-@Article{Sutskever+Hinton-2008,
-  author =       "Ilya Sutskever and Geoffrey E. Hinton",
-  title =        "Deep Narrow Sigmoid Belief Networks are Universal
-                 Approximators",
-  journal =      "Neural Computation",
-  volume =       "to appear",
-  year =         "2008",
-}
-
-@Book{Sutton+Barto-98,
-  author =       "Richard Sutton and Andrew Barto",
-  title =        "Reinforcement Learning: An Introduction",
-  publisher =    "MIT Press",
-  year =         "1998",
-}
-
-@InCollection{sutton06introduction,
-  author =       "Charles Sutton and Andrew McCallum",
-  editor =       "Lise Getoor and Ben Taskar",
-  booktitle =    "Introduction to Statistical Relational Learning",
-  title =        "An Introduction to Conditional Random Fields for
-                 Relational Learning",
-  publisher =    "MIT Press",
-  year =         "2006",
-  note =         "",
-  URL =          "publications/crf-tutorial.pdf",
-  tags =         "recent",
-}
-
-@PhdThesis{Sutton84,
-  author =       "R. S. Sutton",
-  title =        "Temporal Credit Assignment in Reinforcement Learning",
-  school =       "University of Massachusetts",
-  address =      "Amherst",
-  year =         "1984",
-}
-
-@Article{Sutton88,
-  author =       "R. S. Sutton",
-  title =        "Learning to Predict by the Methods of Temporal
-                 Differences",
-  journal =      mlearn,
-  volume =       "3",
-  pages =        "9--44",
-  year =         "1988",
-}
-
-@InCollection{Sutton91,
-  author =       "R. S. Sutton and A. G. Barto",
-  editor =       "M. Gabriel and J. W. Moore",
-  booktitle =    "Learning and Computational Neuroscience",
-  title =        "Time Derivative Models of Pavlovian Reinforcement",
-  publisher =    "MIT Press",
-  address =      "Cambridge",
-  year =         "1991",
-}
-
-@InProceedings{Sutton95,
-  author =       "R. S. Sutton",
-  booktitle =    "Proceedings of the 12th International Conference on
-                 Machine Learning",
-  title =        "{TD} models: modeling the world at a mixture of time
-                 scales",
-  publisher =    "Morgan Kaufmann",
-  year =         "1995",
-}
-
-@InProceedings{Szu86,
-  author =       "H. Szu",
-  editor =       "J. S. Denker",
-  booktitle =    snowbird,
-  title =        "Fast Simulated Annealing",
-  publisher =    "American Institute of Physics, New York",
-  address =      "Snowbird 1986",
-  pages =        "420--425",
-  year =         "1986",
-}
-
-@InProceedings{Szummer+Jaakkola-2002,
-  author =       "M. Szummer and T. Jaakkola",
-  editor =       NIPS14ed,
-  booktitle =    NIPS14,
-  title =        "Partially labeled classification with Markov random
-                 walks",
-  publisher =    "{MIT} Press",
-  address =      "Cambridge, MA",
-  year =         "2002",
-}
-
-
-@article{Takabatake+al-2007,
-    author = {Hiroki Takabatake and Manabu Kotani and Seiichi Ozawa},
-    title = {Feature extraction by supervised independent component analysis based on category information},
-    journal = {Electrical Engineering in Japan},
-    volume = 161,
-    number = 2,
-    pages = {25--32},
-    year = 2007,
-}
-
-@InProceedings{TakahashiN2001,
-  author =       "Naoto Takahashi and Minoru Motoki and Yoshio Shimazu
-                 and Yoichi Tomiura and Tory Hitaka",
-  booktitle =    "Proceedings of the Second Workshop on Natural Language
-                 Processing and Neural Networks",
-  title =        "{PP}-attachment Ambiguity Resolution Using a Neural
-                 Network with Modified {FGREP} Method",
-  address =      "Tokyo",
-  year =         "2001",
-}
-
-@InProceedings{Takens81,
-  author =       "F. Takens",
-  editor =       "D. A. Rand and L.-S. Young",
-  booktitle =    "Dynamical Systems and Turbulenc",
-  title =        "Detecting Strange Attractors In Turbulence",
-  volume =       "898",
-  publisher =    "Springer-Verlag, Berlin",
-  address =      "Warwick 1980",
-  pages =        "366--381",
-  year =         "1981",
-  series =       "Lecture Notes in Mathematics",
-}
-
-@Article{Takeuchi79,
-  author =       "A. Takeuchi and S. Amari",
-  title =        "Formation of Topographic Maps and Columnar
-                 Microstructures in Nerve Fields",
-  journal =      biocyb,
-  volume =       "35",
-  pages =        "63--72",
-  year =         "1979",
-}
-
-@InCollection{Tam+Perkel89,
-  author =       "Tam D. C. and Perkel D. H.",
-  editor =       "Hawkins R. D. and Bower G. H.",
-  booktitle =    "Computational Models of Learning in Simple Neural
-                 Systems",
-  title =        "Quantitative modeling of synaptic plasticity",
-  publisher =    "Academic Press",
-  pages =        "1--30",
-  year =         "1989",
-}
-
-@Article{Tank86,
-  author =       "D. W. Tank and J. J. Hopfield",
-  title =        "Simple ``Neural'' Optimization Networks: An {A}/{D}
-                 Converter, Signal Decision Circuit, and a Linear
-                 Programming Circuit",
-  journal =      ieeetcas,
-  volume =       "33",
-  pages =        "533--541",
-  year =         "1986",
-}
-
-@Article{Tank87a,
-  author =       "D. W. Tank and J. J. Hopfield",
-  title =        "Neural Computation by Time Compression",
-  journal =      PNAS,
-  volume =       "84",
-  pages =        "1896--1900",
-  year =         "1987",
-}
-
-@InProceedings{Tank87b,
-  author =       "D. W. Tank and J. J. Hopfield",
-  editor =       "M. Caudill and C. Butler",
-  booktitle =    icnn,
-  title =        "Concentrating Information in Time: Analog Neural
-                 Networks with Applications to Speech Recognition
-                 Problems",
-  volume =       "2",
-  publisher =    "IEEE, New York",
-  address =      "San Diego 1987",
-  pages =        "455--468",
-  year =         "1987",
-}
-
-@Book{Tanner1993,
-  author =       "M. Tanner",
-  title =        "Tools for statistical inference: Methods for
-                 exploration of posterior distributions and likelihood
-                 functions",
-  publisher =    "Springer",
-  address =      "New York",
-  year =         "1993",
-}
-
-@Article{Tappert90,
-  author =       "C. Tappert and C. Suen and T. Wakahara",
-  title =        "The state of the art in on-line handwriting
-                 recognition",
-  journal =      ieeetpami,
-  volume =       "8",
-  number =       "12",
-  pages =        "787--808",
-  year =         "1990",
-}
-
-@InCollection{Taylor+2007,
-  author =       "Graham Taylor and Geoffrey E. Hinton and Sam Roweis",
-  editor =       NIPS19ed,
-  booktitle =    NIPS19,
-  title =        "Modeling Human Motion Using Binary Latent Variables",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  pages =        "1345--1352",
-  year =         "2007",
-}
-
-%%FRED: I deprecate this one as the years in the tag is not the one for the publication but the conference!
-@InProceedings{Taylor2006,
-  author =       "Graham Taylor and Geoffrey E. Hinton and Sam Roweis",
-  editor =       NIPS19ed,
-  booktitle =    NIPS19,
-  title =        "Modeling Human Motion Using Binary Latent Variables",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  pages =        "1345--1352",
-  year =         "2007",
-}
-
-@InProceedings{Taylor2006-small,
-  author =       "Graham Taylor and Geoffrey E. Hinton and Sam Roweis",
-  booktitle =    "NIPS 20",
-  title =        "Modeling Human Motion Using Binary Latent Variables",
-  year =         "2006",
-}
-
-@InProceedings{TaylorHintonICML2009,
-  author =    {Graham Taylor and Geoffrey Hinton},
-  title =     {Factored Conditional Restricted {Boltzmann} Machines for Modeling Motion Style},
-  booktitle = {Proceedings of the 26th International Conference on Machine Learning (ICML'09)},
-  pages =     {1025--1032},
-  year =      2009,
-  editor =    {L\'{e}on Bottou and Michael Littman},
-  address =   {Montreal},
-  month =     {June},
-  publisher = {Omnipress}
-}
-
-@InProceedings{Taylor56,
-  author =       "W. K. Taylor",
-  editor =       "C. Cherry",
-  booktitle =    "Information Theory",
-  title =        "Electrical Simulation of Some Nervous System
-                 Functional Activities",
-  publisher =    "Butterworths, London",
-  address =      "London 1985",
-  pages =        "314--328",
-  year =         "1956",
-}
-
-@InProceedings{Tebelskis91,
-  author =       "J. Tebelskis and A. Waibel and B. Petek and O.
-                 Schmidbauer",
-  editor =       NIPS3ed,
-  booktitle =    NIPS3,
-  title =        "Continuous Speech Recognition Using Linked Predictive
-                 Networks",
-  publisher =    "Morgan Kaufmann, San Mateo",
-  address =      "Denver, CO",
-  pages =        "199--205",
-  year =         "1991",
-}
-
-@Article{Teh-2003,
-  author =       "{Yee Wye} Teh and Max Welling and Simon Osindero and
-                 Geoffrey E. Hinton",
-  title =        "Energy-Based Models for Sparse Overcomplete
-                 Representations",
-  journal =      jmlr,
-  volume =       "4",
-  pages =        "1235--1260",
-  year =         "2003",
-}
-
-@InProceedings{Teh-Roweis-2003,
-  author =       "Y. Whye Teh and S. Roweis",
-  editor =       NIPS15ed,
-  booktitle =    NIPS15,
-  title =        "Automatic Alignment of Local Representations",
-  publisher =    "{MIT} Press",
-  year =         "2003",
-}
-
-@article{TehY2006,
-title=          "Hierarchical {D}irichlet Processes",
-author=         "Y. W. Teh and M. I. Jordan and M. J. Beal and D. M. Blei",
-journal=        "Journal of the American Statistical Association",
-volume=         "101",
-number=         "476",
-pages=          "1566-1581",
-year=           "2006"
-}
-
-@Article{tenenbaum00separating,
-  author =       "Joshua B. Tenenbaum and William T. Freeman",
-  title =        "Separating Style and Content with Bilinear Models",
-  journal =      "Neural Computation",
-  volume =       "12",
-  number =       "6",
-  pages =        "1247--1283",
-  year =         "2000",
-}
-
-@Article{Tenenbaum2000-isomap,
-  author =       "Joshua Tenenbaum and Vin {de Silva} and John C. Langford",
-  title =        "A Global Geometric Framework for Nonlinear
-                 Dimensionality Reduction",
-  journal =      "Science",
-  volume =       "290",
-  number =       "5500",
-  pages =        "2319--2323",
-  month =        dec,
-  year =         "2000",
-}
-
-@Article{Terrell+Scott-1992,
-  author =       "G. R. Terrell and D. W. Scott",
-  title =        "Variable Kernel Density Estimation",
-  journal =      "Annals of Statistics",
-  volume =       "20",
-  pages =        "1236--1265",
-  year =         "1992",
-}
-
-@Article{Tesauro86,
-  author =       "G. Tesauro",
-  title =        "Simple Neural Models of Classical Conditioning",
-  journal =      biocyb,
-  volume =       "55",
-  pages =        "187--200",
-  year =         "1986",
-}
-
-@Article{Tesauro88a,
-  author =       "G. Tesauro and B. Janssens",
-  title =        "Scaling Relationships in Back-Propagation Learning",
-  journal =      cs,
-  volume =       "2",
-  pages =        "39--44",
-  year =         "1988",
-}
-
-@InProceedings{Tesauro88b,
-  author =       "G. Tesauro and T. J. Sejnowski",
-  editor =       nips87ed,
-  booktitle =    nips87,
-  title =        "A ``Neural'' Network That Learns to Play Backgammon",
-  publisher =    "American Institute of Physics, New York",
-  address =      "Denver, CO",
-  pages =        "442--456",
-  year =         "1988",
-}
-
-@Article{Tesauro90,
-  author =       "G. Tesauro",
-  title =        "Neurogammon Wins Computer Olympiad",
-  journal =      nc,
-  volume =       "1",
-  pages =        "321--323",
-  year =         "1990",
-}
-
-@Article{Tesauro92,
-  author =       "G. Tesauro",
-  title =        "Practical issues in temporal difference learning",
-  journal =      "Machine Learning",
-  volume =       "8",
-  pages =        "257--277",
-  year =         "1992",
-}
-
-@Article{tesauro:1994:nc,
-  author =       "G. Tesauro",
-  title =        "{TD-Gammon}, a Self-Teaching Backgammon Program,
-                 Achieves Master-Level Play",
-  journal =      nc,
-  volume =       "6",
-  number =       "2",
-  pages =        "215--219",
-  year =         "1994",
-}
-
-@Article{Thakoor87,
-  author =       "A. P. Thakoor and A. Moopenn and J. Lambe and S. K.
-                 Khanna",
-  title =        "Electronic Hardware Implementations of Neural
-                 Networks",
-  journal =      applopt,
-  volume =       "26",
-  pages =        "5085--5092",
-  year =         "1987",
-}
-
-@InProceedings{THastie95,
-  author =       "Trevor Hastie and Patrice Simard and Eduard
-                 Sackinger",
-  editor =       NIPS7ed,
-  booktitle =    NIPS7,
-  title =        "Learning Prototype Models for Tangent Distance",
-  publisher =    "MIT Press",
-  pages =        "999--1006",
-  year =         "1995",
-}
-
-@Article{THastie98,
-  author =       "T. Hastie and P. Simard",
-  title =        "Metrics and Models for Handwritten Character
-                 Recognition",
-  journal =      "Statistical Science",
-  volume =       "13",
-  number =       "1",
-  pages =        "54--65",
-  month =        jan,
-  year =         "1998",
-  URL =          "citeseer.ist.psu.edu/hastie97metrics.html",
-}
-
-@Book{thrun+pratt-book-1998,
-  editor =       "Sebastian Thrun and Lorien Y. Pratt",
-  title =        "Learning to Learn",
-  publisher =    "Kluwer Academic",
-  year =         "1998",
-}
-
-@InProceedings{Thrun1995,
-  author =       "T. Thrun and T. Mitchell",
-  booktitle =    "Proceedings of the 14th International Joint Conference
-                 on Artificial Intelligence (IJCAI)",
-  title =        "Learning One More Thing",
-  publisher =    "Morgan Kaufmann",
-  address =      "San Mateo, CA",
-  month =        aug,
-  year =         "1995",
-}
-
-@Misc{thrun95,
-  author =       "S. Thrun and J. O'Sullivan",
-  title =        "Clustering learning tasks and the selective cross-task
-                 transfer of knowledge",
-  year =         "1995",
-  text =         "Technical Report CMU-CS-95-209, Carnegie Mellon
-                 University, School of Computer Science",
-}
-
-@TechReport{thrun95a,
-  author =       "Sebastian Thrun",
-  title =        "Lifelong Learning: {A} Case Study",
-  number =       "CMU-CS-95-208",
-  institution =  "School of Computer Science, Carnegie Mellon
-                 University",
-  address =      "Pittsburgh, PA 15213",
-  month =        nov,
-  year =         "1995",
-}
-
-@InProceedings{thrun95b,
-  author =       "Sebastian Thrun and Tom M. Mitchell",
-  booktitle =    "Proceedings of IJCAI-95",
-  title =        "Learning One More Thing",
-  organization = "IJCAI",
-  address =      "Montreal, Canada",
-  year =         "1995",
-}
-
-@InProceedings{Thrun96a,
-  author =       "S. Thrun",
-  editor =       NIPS8ed,
-  booktitle =    NIPS8,
-  title =        "Is Learning the $n$-th Thing Any Easier Than Learning
-                 the First?",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  pages =        "640--646",
-  year =         "1996",
-}
-
-@BOOK{Thrun96b,
-  AUTHOR         = {S. Thrun},
-  YEAR           = {1996},
-  TITLE          = {Explanation-Based Neural Network Learning: A Lifelong 
-                    Learning Approach},
-  PUBLISHER      = {Kluwer Academic Publishers},
-  ADDRESS        = {Boston, MA}
-}
-
-@Article{Tibshirani95,
-  author =       "Robert J. Tibshirani",
-  title =        "Regression shrinkage and selection via the lasso",
-  journal =      "Journal of the Royal Statistical Society B",
-  volume =       "58",
-  pages =        "267--288",
-  year =         "1995",
-}
-
-@Article{Ticknor87,
-  author =       "A. J. Ticknor and H. Barrett",
-  title =        "Optical Implementations of {Boltzmann} Machines",
-  journal =      opteng,
-  volume =       "26",
-  pages =        "16--21",
-  year =         "1987",
-}
-
-@Book{Tikhonov+Arsenin77,
-  author =       "A. N. Tikhonov and V. Y. Arsenin",
-  title =        "Solutions of Ill-posed Problems",
-  publisher =    "W. H. Winston",
-  address =      "Washington D.C.",
-  year =         "1977",
-}
-
-@InProceedings{tipping00relevance,
-  author =       "M. E. Tipping",
-  editor =       NIPS12ed,
-  booktitle =    NIPS12,
-  title =        "The Relevance Vector Machine",
-  publisher =    "MIT Press",
-  pages =        "652--658",
-  year =         "2000",
-  OPTaddress =   "Cambridge, MA",
-}
-
-@Article{tipping99mixtures,
-  author =       "M. E. Tipping and C. M. Bishop",
-  title =        "Mixtures of Probabilistic Principal Component
-                 Analysers",
-  journal =      "Neural Computation",
-  volume =       "11",
-  number =       "2",
-  pages =        "443--482",
-  year =         "1999",
-  URL =          "citeseer.nj.nec.com/tipping98mixtures.html",
-}
-
-@InProceedings{Tishby89,
-  author =       "N. Tishby and E. Levin and S. A. Solla",
-  booktitle =    ijcnn,
-  title =        "Consistent Inference of Probabilities in Layered
-                 Networks: Predictions and Generalization",
-  volume =       "2",
-  publisher =    "IEEE, New York",
-  address =      "Washington 1989",
-  pages =        "403--410",
-  year =         "1989",
-}
-
-@InProceedings{Titov+Henderson-2007,
-  author =       "Ivan Titov and James Henderson",
-  booktitle =    "Proc. 45th Meeting of Association for Computational
-                 Linguistics (ACL'07)",
-  title =        "Constituent Parsing with Incremental Sigmoid Belief
-                 Networks",
-  address =      "Prague, Czech Republic",
-  pages =        "632--639",
-  year =         "2007",
-  URL =          {http://aclweb.org/anthology-new/P/P07/P07-1080.pdf},
-}
-
-@InProceedings{ToMa00,
-  author =       "Kristina Toutanova and Christopher D. Manning",
-  booktitle =    "EMNLP/VLC 2000",
-  title =        "Enriching the Knowledge Sources Used in a Maximum
-                 Entropy Part-of-Speech Tagger",
-  pages =        "63--70",
-  year =         "2000",
-}
-
-@InProceedings{Tomita82,
-  author =       "M. Tomita",
-  booktitle =    "Proceedings of the Fourth Annual Cognitive Science
-                 Conference",
-  title =        "Dynamic Construction of Finite-state Automata from
-                 Examples Using Hill-Climbing",
-  address =      "Ann Arbor, MI",
-  pages =        "105--108",
-  year =         "1982",
-}
-
-@Book{Tong83,
-  author =       "H. Tong",
-  title =        "Threshold Models in Nonlinear Time Series Analysis",
-  publisher =    "Springer-Verlag",
-  address =      "Berlin",
-  year =         "1983",
-}
-
-@InProceedings{TongKoller2000,
-  author =       "S. Tong and D. Koller",
-  booktitle =    "Proceedings of the 17th National Conference on
-                 Artificial Intelligence (AAAI)",
-  title =        "Restricted Bayes Optimal Classifiers",
-  address =      "Austin, Texas",
-  pages =        "658--664",
-  year =         "2000",
-}
-
-@Article{Torgerson52,
-  author =       "W. Torgerson",
-  title =        "Multidimensional scaling, 1: Theory and method",
-  journal =      "Psychometrika",
-  volume =       "17",
-  pages =        "401--419",
-  year =         "1952",
-}
-
-@inproceedings{Torralba+Fergus+Weiss-2008,
- author = {Antonio Torralba and Robert Fergus and Yair Weiss},
- title = {Small codes and large databases for recognition},
- booktitle = cvpr08,
- pages = "1-8",
- year = 2008,
-}
-
-@incollection{Torresani+Lee-2007,
-    title = {Large Margin Component Analysis},
-    author = {Lorenzo Torresani and Kuang-Chih Lee},
-    booktitle = NIPS19,
-    editor = NIPS19ed,
-    publisher = {MIT Press},
-    address = {Cambridge, MA},
-    pages = {1385--1392},
-    year = {2007}
-}
-
-@InProceedings{Torresen+al-1995,
-  author =       "J. Torresen and S. Mori and H. Nakashima and S. Tomita
-                 and O. Landsverk",
-  booktitle =    "Proceedings of the Fourth International Conference on
-                 Artificial Neural Networks",
-  title =        "Exploiting multiple degrees of {BP} parallelism on the
-                 highly parallel computer {AP1000}",
-  address =      "Cambridge, UK",
-  pages =        "483--488",
-  year =         "1995",
-}
-
-@InProceedings{Torresen+al-1995b,
-  author =       "J. Torresen and S. Tomita and O. Landsverk",
-  booktitle =    "World Congress on Neural Networks",
-  title =        "The relation of Weight Update Frequency to Convergence
-                 of {BP}",
-  address =      "Washington D.C., USA",
-  year =         "1995",
-}
-
-@Article{Torresen-1997,
-  author =       "Jim Torresen",
-  title =        "The Convergence of Backpropagation Trained Neural
-                 Networks for Various Weight Update Frequencies",
-  journal =      "International Journal of Neural Systems",
-  volume =       "8",
-  number =       "3",
-  year =         "1997",
-}
-
-@Article{Toulouse86,
-  author =       "G. Toulouse and S. Dehaene and J.-P. Changeux",
-  title =        "Spin Glass Model of Learning by Selection",
-  journal =      PNAS,
-  volume =       "83",
-  pages =        "1695--1698",
-  year =         "1986",
-}
-
-@Article{Touretzky89,
-  author =       "D. S. Touretzky and D. A. Pomerleau",
-  title =        "What's Hidden in the Hidden Layers?",
-  journal =      BYTE,
-  pages =        "227--233",
-  month =        aug,
-  year =         "1989",
-}
-
-@InProceedings{ToutanovaKMS03,
-  author =       "Kristina Toutanova and Dan Klein and Christopher D.
-                 Manning and Yoram Singer",
-  booktitle =    "HLT-NAACL",
-  title =        "Feature-Rich Part-of-Speech Tagging with a Cyclic
-                 Dependency Network.",
-  year =         "2003",
-  bibsource =    "DBLP, http://dblp.uni-trier.de",
-  ee =           "http://acl.ldc.upenn.edu/N/N03/N03-1033.pdf",
-}
-
-@InProceedings{Towell-nips92,
-  author =       "G. G. Towell and J. W. Shawlik",
-  editor =       NIPS4ed,
-  booktitle =    NIPS4,
-  title =        "Interpretation of Artificial Neural Networks: Mapping
-                 Knowledge-Based Neural Networks into Rules",
-  publisher =    "Morgan Kaufmann",
-  address =      "San Mateo CA",
-  year =         "1992",
-}
-
-@InProceedings{towell93,
-  author =       "G. G. Towell and J. W. Shavlik",
-  editor =       NIPS4ed,
-  booktitle =    NIPS4,
-  title =        "Interpretation of Artificial Neural Networks: Mapping
-                 Knowledge-Based Neural Networks into rules",
-  publisher =    "Morgan Kaufmann",
-  address =      "San Meteo, CA",
-  pages =        "977--984",
-  year =         "1992",
-}
-
-@InProceedings{Towell-aaai90,
-  author =       "G. G. Towell and J. W. Shawlick and M. O. Noordewier",
-  booktitle =    "Proceedings of the Eighth National Conference on
-                 Artificial Intelligence (AAAI-90)",
-  title =        "Refinement of Approximate Domain Theories by
-                 Knowledge-Based Neural Networks",
-  pages =        "861--866",
-  year =         "1990",
-  OPTnote =      "",
-}
-
-@TechReport{TR:Breiman.arcing,
-  author =       "Leo Breiman",
-  title =        "Bias, variance, and Arcing classifiers",
-  number =       "460",
-  institution =  "Statistics Department, University of California at
-                 Berkeley",
-  year =         "1996",
-}
-
-@TechReport{TR:Breiman:edge,
-  author =       "Leo Breiman",
-  title =        "Arcing the edge",
-  number =       "486",
-  institution =  "Statistics Department, University of California at
-                 Berkeley",
-  year =         "1997",
-}
-
-@TechReport{TR:Breiman:gametheorie,
-  author =       "Leo Breiman",
-  title =        "Prediction games and arcing classifiers",
-  number =       "504",
-  institution =  "Statistics Department, University of California at
-                 Berkeley",
-  year =         "1997",
-}
-
-@TechReport{TR:Friedman+Hastie+Tibshirani:AdaBoost-theory,
-  author =       "J. Friedman and T. Hastie and R. Tibshirani",
-  title =        "Additive Logistic Regression: a Statistical View of
-                 Boosting",
-  institution =  "August 1998, Department of Statistics, Stanford
-                 University",
-  year =         "1998",
-}
-
-@TechReport{TR:Tibshirani:bias+var,
-  author =       "R. Tibshirani",
-  title =        "Bias, Variance and Prediction Error for Classification
-                 Rules",
-  institution =  "Departement od Statistics, University of Toronto",
-  year =         "1996",
-}
-
-@Article{Traven91,
-  author =       "H. G. C. Traven",
-  title =        "A neural network approach to statistical pattern
-                 classification by semiparametric estimation of
-                 probability density functions",
-  journal =      ieeetrnn,
-  volume =       "2",
-  number =       "3",
-  pages =        "366--377",
-  year =         "1991",
-}
-
-@InCollection{TreHolAhm93,
-  author =       "V. Tresp and J. Hollatz and S. Ahmad",
-  editor =       NIPS5ed,
-  booktitle =    NIPS5,
-  title =        "Network structuring and training using rule-based
-                 knowledge",
-  publisher =    "Morgan Kaufman Publishers",
-  address =      "San Mateo, CA",
-  year =         "1993",
-}
-
-@InProceedings{Tresp-nips93,
-  author =       "V. Tresp and J. Hollatz and S. Ahmad",
-  editor =       NIPS5ed,
-  booktitle =    NIPS5,
-  title =        "Network Structuring and Training Using Rule-based
-                 Knowledge",
-  publisher =    "Morgan Kaufmann",
-  address =      "San Mateo, CA",
-  year =         "1993",
-}
-
-@Article{tresp2001,
-  author =       "V. Tresp",
-  title =        "Scaling Kernel-Based Systems to Large Data Sets",
-  journal =      "Data Mining and Knowledge Discovery",
-  volume =       "5",
-  number =       "3",
-  pages =        "197--211",
-  year =         "2001",
-}
-
-@InCollection{Tresp94,
-  author =       "V. Tresp and S. Ahmad and R. Neuneier",
-  editor =       NIPS6ed,
-  booktitle =    NIPS6,
-  title =        "Training neural networks with deficient data",
-  publisher =    "Morgan Kaufman Publishers",
-  address =      "San Mateo, CA",
-  pages =        "128--135",
-  year =         "1994",
-}
-
-@Article{TRNN:Tsoi94,
-  author =       "A. C. Tsoi and A. Back",
-  title =        "Locally Recurrent Globally Feedforward Networks, {A}
-                 Critical Review of Architectures",
-  journal =      "IEEE Transactions on Neural Networks",
-  volume =       "5",
-  number =       "2",
-  pages =        "229--239",
-  year =         "1994",
-}
-
-@InProceedings{Tseng-1998,
-  author =       "Yuen-Hsien Tseng",
-  booktitle =    "SIGIR '98: Proceedings of the 21st Annual
-                 International ACM SIGIR Conference on Research and
-                 Development in Information Retrieval, August 24-28
-                 1998, Melbourne, Australia",
-  title =        "Multilingual Keyword Extraction for Term Suggestion",
-  publisher =    "ACM",
-  pages =        "377--378",
-  year =         "1998",
-}
-
-@Article{TsochantaridisI2005,
-  author =       "Ioannis Tsochantaridis and Thorsten Joachims and
-                 Thomas Hofmann and Yasemin Altun",
-  title =        "Large Margin Methods for Structured and Interdependent
-                 Output Variables",
-  journal =      "J. Mach. Learn. Res.",
-  volume =       "6",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA, USA",
-  pages =        "1453--1484",
-  year =         "2005",
-  ISSN =         "1533-7928",
-}
-
-@Article{Tsodyks88,
-  author =       "M. V. Tsodyks and M. V. Feigel'man",
-  title =        "The Enhanced Storage Capacity in Neural Networks with
-                 Low Activity Level",
-  journal =      eul,
-  volume =       "6",
-  pages =        "101--105",
-  year =         "1988",
-}
-
-@InProceedings{Tsoi+Pearson91,
-  author =       "A. C. Tsoi and R. A. Pearson",
-  editor =       NIPS3ed,
-  booktitle =    NIPS3,
-  title =        "Comparison of three classification techniques: {CART},
-                 {C4}.5, and multi-layer perceptron",
-  publisher =    "Morgan Kaufmann",
-  address =      "Denver, CO",
-  pages =        "",
-  year =         "1991",
-}
-
-@Book{TSP93,
-  editor =       "A. Weigend and N. Gershenfeld",
-  title =        "Time Series Prediction: Forecasting the future and
-                 understanding the past",
-  publisher =    "Addison-Wesley",
-  year =         "1993",
-}
-
-@InProceedings{Tsuda99,
-  author =       "K. Tsuda",
-  booktitle =    "ICANN'99",
-  title =        "Optimal Hyperplane Classifier based on Entropy Number
-                 Bound",
-  pages =        "419--424",
-  year =         "1999",
-}
-
-@PhdThesis{Turian07thesis,
-  author =       "Joseph Turian",
-  title =        "Constituent Parsing by Classification",
-  school =       "New York University",
-  year =         "2007",
-}
-
-@Article{tzanetakis+cook:2002,
-  author =       "George Tzanetakis and Perry Cook",
-  title =        "Musical Genre Classification of Audio Signals",
-  journal =      "IEEE Transactions on Speech and Audio Processing",
-  volume =       "10",
-  number =       "5",
-  pages =        "293--302",
-  month =        jul,
-  year =         "2002",
-}
-
-@Article{Uberbacher91,
-  author =       "E. C. Uberbacher and R. J. Mural",
-  title =        "Locating protein-coding regions in human {DNA}
-                 sequences by a multiple sensor-neural network
-                 approach",
-  journal =      "Proc. Natl. Acad. Sci. USA",
-  volume =       "88",
-  pages =        "11261--11265",
-  year =         "1991",
-}
-
-@Article{Uhrig91,
-  author =       "R. E. Uhrig",
-  title =        "Potential Applications of Neural Networks to the
-                 Operation of a Nuclear Power Plant",
-  journal =      "Nuclear Safety",
-  volume =       "32",
-  number =       "1",
-  year =         "1991",
-}
-
-@Article{Uhrig94,
-  author =       "R. E. Uhrig",
-  title =        "Artificial Neural Networks in Nuclear Power Plants",
-  journal =      "Nuclear News",
-  volume =       "37",
-  number =       "9",
-  pages =        "38",
-  year =         "1994",
-}
-
-@Article{Utgoff-2002,
-  author =       "Paul E. Utgoff and David J. Stracuzzi",
-  title =        "Many-Layered Learning",
-  journal =      "Neural Computation",
-  volume =       "14",
-  pages =        "2497--2539",
-  year =         "2002",
-}
-
-@Article{Valiant84,
-  author =       "L. G. Valiant",
-  title =        "A Theory of the Learnable",
-  journal =      "Communications of the ACM",
-  volume =       "27",
-  number =       "11",
-  pages =        "1134--1142",
-  year =         "1984",
-}
-
-@InProceedings{VandenBout88,
-  author =       "D. E. Van den Bout and T. K. Miller",
-  booktitle =    icnn,
-  title =        "A Travelling Salesman Objective Function That Works",
-  volume =       "2",
-  publisher =    "IEEE, New York",
-  address =      "San Diego 1988",
-  pages =        "299--303",
-  year =         "1988",
-}
-
-@Article{VandenBout89,
-  author =       "D. E. Van den Bout and T. K. Miller",
-  title =        "Improving the Performance of the Hopfield-Tank Neural
-                 Network Through Normalization and Annealing",
-  journal =      biocyb,
-  volume =       "62",
-  pages =        "129--139",
-  year =         "1989",
-}
-
-@Article{VanDerMaaten08,
-  author =       "Laurens {van der Maaten} and Geoffrey E. Hinton",
-  title =        {Visualizing Data using t-SNE},
-  journal =      jmlr,
-  year =         "2008",
-  keywords =     {dimension-reduction, locality, nearest-neighbors, spectral, visualization},
-  month =        {November},
-  pages =        {2579--2605},
-  url =          {http://www.jmlr.org/papers/volume9/vandermaaten08a/vandermaaten08a.pdf},
-  volume =       {9},
-}
-
-@Book{VanDerVaart+Wellner-1996,
-  author =       "A. W. {van der Vaart} and J. Wellner",
-  title =        "Weak Convergence and Empirical Processes with
-                 applications to Statistics",
-  publisher =    "Springer",
-  address =      "New York",
-  year =         "1996",
-}
-
-@Article{vanHemmen79,
-  author =       "J. L. van Hemmen and R. G. Palmer",
-  title =        "The Replica Method and a Solvable Spin Glass Model",
-  journal =      jpa,
-  volume =       "12",
-  pages =        "563--580",
-  year =         "1979",
-}
-
-@Article{vanHemmen86,
-  author =       "J. L. van Hemmen and R. K{\"u}hn",
-  title =        "Nonlinear Neural Networks",
-  journal =      prl,
-  volume =       "57",
-  pages =        "913--916",
-  year =         "1986",
-}
-
-@Article{vanHemmen90,
-  author =       "J. L. van Hemmen and L. B. Ioffe and R. K{\"u}hn and
-                 M. Vaas",
-  title =        "Increasing the Efficiency of a Neural Network through
-                 Unlearning",
-  journal =      physicaA,
-  volume =       "163",
-  pages =        "386--392",
-  year =         "1990",
-}
-
-% HUGO: Haven't found what A. stands for...
-@Article{VapnikV63,
-  author =       "Vladimir Vapnik and A. Lerner", 
-  title =        "Pattern Recognition using Generalized Portrait Method",
-  journal =      "Automation and Remote Control",
-  volume =       "24",
-  year =         "1963",
-}
-
-@Article{Vapnik71,
-  author =       "V. N. Vapnik and A. Y. Chervonenkis",
-  title =        "On the Uniform Convergence of Relative Frequencies of
-                 Events to Their Probabilities",
-  journal =      tprobapp,
-  volume =       "16",
-  pages =        "264--280",
-  year =         "1971",
-}
-
-@Book{Vapnik82,
-  author =       "V. N. Vapnik",
-  title =        "Estimation of Dependences Based on Empirical Data",
-  publisher =    "Springer-Verlag",
-  address =      "Berlin",
-  year =         "1982",
-}
-
-@Article{Vapnik93,
-  author =       "V. Vapnik and L. Bottou",
-  title =        "Local algorithms for pattern recognition and
-                 dependencies estimation",
-  journal =      nc,
-  volume =       "5",
-  number =       "6",
-  pages =        "893--909",
-  year =         "1993",
-}
-
-@Book{Vapnik95,
-  author =       "V. N. Vapnik",
-  title =        "The Nature of Statistical Learning Theory",
-  publisher =    "Springer",
-  address =      "New York",
-  year =         "1995",
-}
-
-@Book{Vapnik98,
-  author =       "Vladimir Vapnik",
-  title =        "Statistical Learning Theory",
-  publisher =    "Wiley, Lecture Notes in Economics and Mathematical
-                 Systems, volume 454",
-  year =         "1998",
-}
-
-@InCollection{variational99,
-  author =       "M. I. Jordan and Z. Ghahramani and T. Jaakkola and L.
-                 Saul",
-  editor =       "M. I. Jordan",
-  booktitle =    "Learning in Graphical Models",
-  title =        "An introduction to variational methods in graphical
-                 models",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  year =         "1999",
-}
-
-@InProceedings{Venka+PC-2004,
-  author =       "Shailaja Venkatsubramanyan and Jose Perez-Carballo",
-  booktitle =    "Second ACL Workshop on Multiword Expressions",
-  title =        "Multiword Expression Filtering for Building Knowledge
-                 Maps",
-  pages =        "40--47",
-  year =         "2004",
-}
-
-@InProceedings{Verbeek-2004,
-  author =       "Jakob J. Verbeek and Sam T. Roweis and Nikos Vlassis",
-  editor =       NIPS16ed,
-  booktitle =    NIPS16,
-  title =        "Non-linear {CCA} and {PCA} by Alignment of Local
-                 Models",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  year =         "2004",
-  keywords =     "dimensionality reduction, spectral methods, mixture
-                 density, CCA, PCA",
-}
-
-@InProceedings{Veronis1990,
-  author =       "Jean Veronis and Nancy Ide",
-  booktitle =    "COLING'90",
-  title =        "Word Sense Disambiguation with Very Large Neural
-                 Networks Extracted from Machine Readable Dictionaries",
-  year =         "1990",
-}
-
-@Misc{Veronis98,
-  author =       "Jean Veronis",
-  title =        "A study of polysemy judgements and inter-annotator
-                 agreement",
-  year =         "1998",
-  URL =          "citeseer.nj.nec.com/veronis98study.html",
-  text =         "Veronis, J., 1998. A study of polysemy judgements and
-                 inter-annotator agreement. In Programme and advanced
-                 papers of the Senseval workshop. Herstmonceux Castle,
-                 England.",
-}
-
-@InProceedings{Vilalta+al-1997,
-  author =       "Ricardo Vilalta and Gunnar Blix and Larry Rendell",
-  booktitle =    ECML97,
-  title =        "Global Data Analysis and the Fragmentation Problem in
-                 Decision Tree Induction",
-  publisher =    "Springer-Verlag",
-  pages =        "312--327",
-  year =         "1997",
-}
-
-@InProceedings{Vincent-Bengio-2003-short,
-  author =       "Pascal Vincent and Yoshua Bengio",
-  booktitle =    NIPS15,
-  title =        "Manifold Parzen Windows",
-  publisher =    "MIT Press",
-  year =         "2003",
-}
-
-@TechReport{Vincent-TR1316-small,
-  author =       "P. Vincent and H. Larochelle and Y. Bengio and P.-A.
-                 Manzagol",
-  title =        "Extracting and Composing Robust Features with
-                 Denoising Autoencoders",
-  number =       "1316",
-  institution =  "Universit\'e de Montr\'eal, dept. IRO",
-  year =         "2008",
-}
-
-@Article{Vincent2001,
-  author =       "P. Vincent and Y. Bengio",
-  title =        "Kernel Matching Pursuit",
-  journal =      "Machine Learning",
-  volume =       "48",
-  number =       "",
-  pages =        "165--187",
-  year =         "2002",
-}
-
-@InProceedings{Vincent2002,
-  author =       "P. Vincent and Y. Bengio",
-  editor =       NIPS14ed,
-  booktitle =    NIPS14,
-  title =        "{K}-Local Hyperplane and Convex Distance Nearest
-                 Neighbor Algorithms",
-  publisher =    "{MIT} Press",
-  address =      "Cambridge, MA",
-  pages =        "985--992",
-  year =         "2002",
-}
-
-@InProceedings{VincentPLarochelleH2008-small,
-  author =       "Pascal Vincent and Hugo Larochelle and Yoshua Bengio
-                 and Pierre-Antoine Manzagol",
-  booktitle =    "ICML 2008",
-  title =        "Extracting and Composing Robust Features with
-                 Denoising Autoencoders",
-  year =         "2008",
-}
-
-@InProceedings{VincentPLarochelleH2008-short,
-  author =       "Pascal Vincent and Hugo Larochelle and Yoshua Bengio
-                 and Pierre-Antoine Manzagol",
-  booktitle =    "Int. Conf. Mach. Learn.",
-  title =        "Extracting and Composing Robust Features with
-                 Denoising Autoencoders",
-  year =         "2008",
-  pages =        "1096--1103"
-}
-
-
-@InProceedings{vincent:icml08,
-   author =     "Pascal Vincent and Hugo Larochelle and Yoshua Bengio and {Pierre-Antoine Manzagol}",
-   title =      "Extracting and composing robust features with denoising autoencoders",
-   booktitle =  "Proceedings of the 25th Annual International Conference on Machine Learning (ICML 2008)",
-   location =   "Helsinki, Finland",
-   editor =     "Andrew McCallum and Sam Roweis",
-   publisher =  "Omnipress",
-   year =       "2008",
-   pages =      "1096--1103",
-}
-   %url =        "http://icml2008.cs.helsinki.fi/papers/592.pdf",
-
-@InProceedings{VincentPLarochelleH2008-very-small,
-  author =       "P. Vincent and H. Larochelle and Y. Bengio and P.-A.
-                 Manzagol",
-  booktitle =    "ICML 2008",
-  title =        "Extracting and Composing Robust Features with
-                 Denoising Autoencoders",
-  year =         "2008",
-}
-
-@Article{Viterbi67,
-  author =       "A. Viterbi",
-  title =        "Error bounds for convolutional codes and an
-                 asymptotically optimum decoding algorithm",
-  journal =      ieeeit,
-  pages =        "260--269",
-  year =         "1967",
-}
-
-@InProceedings{Vlachos-2002,
-  author =       "Michail Vlachos and Carlotta Domeniconi and Dimitrios
-                 Gunopulos and George Kollios and Nick Koudas",
-  booktitle =    "Proc. of 8th SIGKDD",
-  title =        "Non-Linear Dimensionality Reduction Techniques for
-                 Classification and Visualization",
-  address =      "Edmonton, Canada",
-  year =         "2002",
-  URL =          "citeseer.ist.psu.edu/573153.html",
-}
-
-@Article{vogl-88,
-  author =       "T. Vogl and J. Mangis and J. Rigler and W. Zink and D.
-                 Alkon",
-  title =        "accelerating convergence of the back-propagation
-                 method",
-  journal =      "Biological Cybernetics",
-  volume =       "59",
-  pages =        "257--263",
-  year =         "1988",
-}
-
-@Article{Vogl88,
-  author =       "T. P. Vogl and J. K. Mangis and A. K. Rigler and W. T.
-                 Zink and D. L. Alkon",
-  title =        "Accelerating the Convergence of the Back-Propagation
-                 Method",
-  journal =      biocyb,
-  volume =       "59",
-  pages =        "257--263",
-  year =         "1988",
-}
-
-@Book{Volterra,
-  author =       "V. Volterra",
-  title =        "Theory of Functionals and of Integrals and
-                 Integro-Differential Equations",
-  publisher =    "Dover",
-  address =      "New York",
-  year =         "1959",
-}
-
-@Article{vonderMalsburg73,
-  author =       "Ch. von der Malsburg",
-  title =        "Self-Organization of Orientation Sensitive Cells in
-                 the Striate Cortex",
-  journal =      kyb,
-  volume =       "14",
-  year =         "1973",
-}
-
-@Article{vonderMalsburg82,
-  author =       "Ch. von der Malsburg and J. D. Cowan",
-  title =        "Outline of a Theory for the Ontogenesis of
-                 Iso-Orientation Domains in Visual Cortex",
-  journal =      biocyb,
-  volume =       "45",
-  pages =        "49--56",
-  year =         "1982",
-}
-
-@InProceedings{vonLehman88,
-  author =       "A. von Lehman and E. G. Paek and P. F. Liao and A.
-                 Marrakchi and J. S. Patel",
-  booktitle =    icnn,
-  title =        "Factors Influencing Learning by Back-Propagation",
-  volume =       "1",
-  publisher =    "IEEE, New York",
-  address =      "San Diego 1988",
-  pages =        "335--341",
-  year =         "1988",
-}
-
-@Article{vonLuxurg07,
-  author =       "U. von Luxburg",
-  title =        "A Tutorial on Spectral Clustering.",
-  journal =      "Statistics and Computing",
-  volume = 	 "17(4)",
-  pages =        "395-416",
-  year =         "2007",
-}
-
-@InCollection{vonNeumann56,
-  author =       "J. von Neumann",
-  editor =       "C. E. Shannon and J. McCarthy",
-  booktitle =    "Automata Studies",
-  title =        "Probabilistic Logics and the Synthesis of Reliable
-                 Organisms from Unreliable Components",
-  publisher =    "Princeton University Press",
-  address =      "Princeton",
-  pages =        "43--98",
-  year =         "1956",
-}
-
-@Article{Wagner87,
-  author =       "K. Wagner and D. Psaltis",
-  title =        "Multilayer Optical Learning Networks",
-  journal =      applopt,
-  volume =       "26",
-  pages =        "5061--5076",
-  year =         "1987",
-}
-
-@InCollection{Wahba82,
-  author =       "G. Wahba",
-  editor =       "Gupta and Berger",
-  booktitle =    "Statistical Decision Theory and Related Topics III",
-  title =        "Constrained regularization for ill-posed linear
-                 operator equations, with applications in meteorology
-                 and medecine",
-  publisher =    "Academic Press",
-  year =         "1982",
-}
-
-@InProceedings{Wahba90,
-  author =       "G. Wahba",
-  booktitle =    "CBMS-NSF Regional Conference Series in Applied
-                 Mathematics",
-  title =        "Spline models for observational data",
-  volume =       "59",
-  publisher =    "Society for Industrial and Applied Mathematics
-                 (SIAM)",
-  address =      "Philadelphia, PA",
-  year =         "1990",
-}
-
-@Article{Waibel89a,
-  author =       "A. Waibel",
-  title =        "Modular Construction of Time-Delay Neural Networks for
-                 Speech Recognition",
-  journal =      nc,
-  volume =       "1",
-  pages =        "39--46",
-  year =         "1989",
-}
-
-@Article{Waibel89b,
-  author =       "A. Waibel and T. Hanazawa and G. E. Hinton and K.
-                 Shikano and K. Lang",
-  title =        "Phoneme Recognition Using Time-Delay Neural Networks",
-  journal =      ieeetassp,
-  volume =       "37",
-  pages =        "328--339",
-  year =         "1989",
-}
-
-@Article{Waibel89c,
-  author =       "A. Waibel and H Sawai and K. Shikano",
-  title =        "Modularity and Scaling in Large Phonemic Neural
-                 Networks",
-  journal =      ieeetassp,
-  volume =       "37",
-  pages =        "1888--1898",
-  year =         "1989",
-}
-
-@Article{Wallace+Boulton-1968,
-  author =       "C. S. Wallace and D. M. Boulton",
-  title =        "An information measure for classification",
-  journal =      "Computer Journal",
-  volume =       "11",
-  number =       "2",
-  pages =        "185--194",
-  year =         "1968",
-}
-
-@InCollection{Wan93,
-  author =       "Wan E. A.",
-  editor =       "A. S. Weigend and N. A. Gershenfeld",
-  booktitle =    "Time Series Prediction: Forecasting the Future and
-                 Understanding the Past",
-  title =        "Time series prediction by using a connectionist
-                 network with internal delay lines",
-  publisher =    "Addison-Wesley",
-  pages =        "195--217",
-  year =         "1993",
-}
-
-@InCollection{Wan93a,
-  author =       "E. A. Wan",
-  editor =       "A. Weigend and N. Gershenfeld",
-  booktitle =    "Predicting the future and understanding the past",
-  title =        "Time Series Prediction by Using a Connectionist
-                 Network with Internal Delay Lines",
-  publisher =    "Addison-Wesley",
-  address =      "Redwood City, CA",
-  pages =        "175--193",
-  year =         "1993",
-}
-
-@InProceedings{Wang-ijcnn91,
-  author =       "S. D. Wang and C. H. Hsu",
-  booktitle =    ijcnn,
-  title =        "Terminal Attractor Learning Algorithms for
-                 Backpropagation Neural Networks",
-  publisher =    "IEEE Press",
-  address =      "Singapore",
-  pages =        "183--189",
-  month =        nov,
-  year =         "1991",
-}
-
-@INPROCEEDINGS{WangC1994,
-    author = {Changfeng Wang and Santosh S. Venkatesh and J. Stephen Judd},
-    title = {Optimal stopping and effective machine complexity in learning},
-    editor = NIPS6ed,
-    booktitle = NIPS6,
-    year = {1994},
-    pages = {303--310},
-    publisher = {Morgan Kaufmann}
-}
-
-@inproceedings{wangetal08,
-author = "Wang, Q. and Lin, D. and Schuurmans, D.",
-title = "Semi-supervised convex training for dependency parsing",
-booktitle = "Proceedings of the Forty-sixth Annual Conference of the 
-Association for Computational Linguistics: Human Language Technologies (ACL)",
-year = 2008,
-note = "Acceptance rate 25\%; Wang a trainee"
-}
-
-@inproceedings{wangetal07,
-author = "Wang, T. and Lizotte, D. and Bowling, M. and Schuurmans, D.",
-title = "Stable dual dynamic programming",
-editor =       NIPS20ed,
-booktitle =    NIPS20,
-year = 2007,
-note = "Acceptance rate 22\%; Wang and Lizotte trainees"
-}
-
-
-@Misc{Wang02,
-  author =       "L. Wang and K. Luk Chan",
-  howpublished =    "6th kernel machines workshop, in conjunction with Neural Information Processing Systems (NIPS)",
-  title =        "Learning Kernel Parameters by using Class Separability
-                 Measure",
-  year =         "2002",
-  url =          "http://users.rsise.anu.edu.au/~wanglei/#Publication",
-}
-
-@Article{Wang89,
-  author =       "H. Wang and J. Wu and P. Tang",
-  title =        "Superfamily expands",
-  journal =      "Nature",
-  volume =       "337",
-  pages =        "514",
-  year =         "1989",
-}
-
-@InProceedings{WangHarper2002,
-  author =       "Wen Wang and Mary P. Harper",
-  booktitle =    "EMNLP '02: Proceedings of the ACL-02 conference on
-                 Empirical methods in natural language processing",
-  title =        "The Super{ARV} language model: investigating the
-                 effectiveness of tightly integrating multiple knowledge
-                 sources",
-  publisher =    "Association for Computational Linguistics",
-  address =      "Morristown, NJ, USA",
-  pages =        "238--247",
-  year =         "2002",
-}
-
-@Article{Warmuth95,
-  author =       "Sally Floyd and Manfred Warmuth",
-  title =        "Sample Compression, Learnability, and the
-                 Vapnik-Chervonenkis Dimension",
-  journal =      "Machine Learning",
-  volume =       "21",
-  number =       "3",
-  pages =        "269--304",
-  year =         "1995",
-}
-
-@Book{Wasserman-2004,
-  author =       "Larry Wasserman",
-  title =        "All of Statistics - A Concise Course in Statistical Inference",
-  publisher =    "Springer",
-  year =         "2004",
-}
-
-@PhdThesis{Watkins-PhD,
-  author =       "C. J. C. H. Watkins",
-  title =        "Learning from Delayed Rewards",
-  school =       "Cambridge University",
-  address =      "Cambridge, England",
-  year =         "1989",
-}
-
-@InProceedings{Watrous87,
-  author =       "R. L. Watrous",
-  editor =       "M. Caudill and C. Butler",
-  booktitle =    icnn,
-  title =        "Learning Algorithms for Connectionist Networks:
-                 Applied Gradient Methods of Nonlinear Optimization",
-  volume =       "2",
-  publisher =    "IEEE, New York",
-  address =      "San Diego 1987",
-  pages =        "619--627",
-  year =         "1987",
-}
-
-@TechReport{Watrous89,
-  author =       "R. L. Watrous",
-  title =        "Context-modulated discrimination of similar vowels
-                 using second-order connectionist networks",
-  number =       "{CRG-TR}-89-5",
-  institution =  "University of Toronto",
-  year =         "1989",
-}
-
-@Article{Watrous-nc92,
-  author =       "R. L. Watrous and G. M. Kuhn",
-  title =        "Induction of Finite-State Languages Using Second-Order
-                 Recurrent Networks",
-  journal =      nc,
-  volume =       "4",
-  number =       "3",
-  pages =        "406--414",
-  year =         "1992",
-}
-
-@Article{Watson64,
-  author =       "G. S. Watson",
-  title =        "Smooth regression analysis",
-  journal =      "Sankhya - The Indian Journal of Statistics",
-  volume =       "26",
-  pages =        "359--372",
-  year =         "1964",
-}
-
-@inproceedings{Weber-2000,
- author = {Markus Weber and Max Welling and Pietro Perona},
- title = {Unsupervised Learning of Models for Recognition},
- booktitle = {Proc. 6th Europ. Conf. Comp. Vis., ECCV2000}, 
- address = {Dublin},
- year = 2000,
- pages     = {18-32},
- url       = {http://link.springer.de/link/service/series/0558/bibs/1842/18420018.htm},
-}
-
-@Book{Webster88,
-  editor =       "Webster",
-  title =        "Webster's Ninth New Collegiate Dictionary",
-  publisher =    "Merriam-Webster",
-  address =      "Springfield",
-  year =         "1988",
-}
-
-@Book{Wegener87,
-  author =       "Ingo Wegener",
-  title =        "The Complexity of Boolean Functions",
-  publisher =    "John Wiley \& Sons",
-  year =         "1987",
-}
-
-@InCollection{Weigend93,
-  author =       "N. A. Gershenfeld and A. S. Weigend",
-  editor =       "A. Weigend and N. Gershenfeld",
-  booktitle =    "Predicting the future and understanding the past",
-  title =        "The Future of Time Series: Learning and
-                 Understanding",
-  publisher =    "Addison-Wesley",
-  address =      "Redwood City, CA",
-  pages =        "1--70",
-  year =         "1993",
-}
-
-@Article{Weigend95,
-  author =       "A. S. Weigend and A. N. Srivastava",
-  title =        "Predicting Conditional Probability Distributions: {A}
-                 Connectionist Approach",
-  journal =      "International Journal of Neural Systems",
-  volume =       "6",
-  year =         "1995",
-}
-
-@InProceedings{Weinberger+Saul-06,
-  author =       "K. Q. Weinberger and L. K. Saul",
-  booktitle =    "Proceedings of the National Conference on Artificial
-                 Intelligence (AAAI)",
-  title =        "An Introduction to Nonlinear Dimensionality Reduction
-                 by Maximum Variance Unfolding",
-  address =      "Boston, MA",
-  year =         "2006",
-}
-
-@InProceedings{weinberger-learningkernel-04,
-  author =       "Kilian Q. Weinberger and Fei Sha and Lawrence K. Saul",
-  booktitle =    ICML04,
-  editor =       ICML04ed,
-  publisher =    ICML04publ,
-  title =        "Learning a kernel matrix for nonlinear dimensionality
-                 reduction",
-  address =      "Banff, Canada",
-  pages =        "839--846",
-  year =         "2004",
-}
-
-@InProceedings{Weinberger04a,
-  author =       "K. Q. Weinberger and L. K. Saul",
-  booktitle =    cvpr04,
-  title =        "Unsupervised Learning of Image Manifolds by
-                 Semidefinite Programming",
-  volume =       "2",
-  address =      "Washington D.C.",
-  pages =        "988--995",
-  year =         "2004",
-}
-
-% Volume, pages and year refer to the 1995 IEEE Trans. IT article by these
-% authors; the former values (656--664, 1983) matched a different paper.
-@Article{weinberger95,
-  author =       "M. J. Weinberger and J. Rissanen and M. Feder",
-  title =        "A universal finite memory source",
-  journal =      "IEEE Transactions on Information Theory",
-  volume =       "41",
-  number =       "3",
-  pages =        "643--652",
-  year =         "1995",
-}
-
-@InCollection{WeinbergerK2006,
-  author =       "Kilian Q. Weinberger and John Blitzer and Lawrence K. Saul",
-  editor =       NIPS18ed,
-  booktitle =    NIPS18,
-  title =        "Distance Metric Learning for Large Margin Nearest
-                 Neighbor Classification",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  pages =        "1473--1480",
-  year =         "2006",
-}
-
-@InProceedings{WeinbergerK2007,
-  author = {Kilian Q. Weinberger and Gerald Tesauro},
-  title = {Metric Learning for Kernel Regression},
-  booktitle = {Proc. of the 11th International Conference on Artificial Intelligence and Statistics},
-  year = {2007},
-}
-  %url = {http://www.stat.umn.edu/~aistat/proceedings/data/papers/077.pdf}
-
-@Article{Weingartner,
-  author =       "H. M. Weingartner and D. N. Ness",
-  title =        "Methods for the Solution of the Multi-Dimensional 0/1
-                 Knapsack Problem",
-  journal =      "Operations Research",
-  volume =       "15",
-  pages =        "83--103",
-  year =         "1967",
-}
-
-@Article{Weisbuch85,
-  author =       "G. Weisbuch and F. Fogelman-Souli{\'e}",
-  title =        "Scaling Laws for the Attractors of Hopfield Networks",
-  journal =      jppl,
-  volume =       "46",
-  pages =        "623--630",
-  year =         "1985",
-}
-
-@InProceedings{Weiss-99,
-  author =       "Yair Weiss",
-  booktitle =    ICCV99,
-  title =        "Segmentation using eigenvectors: a unifying view",
-  pages =        "975--982",
-  year =         "1999",
-}
-
-@Article{Weiss2000,
-  author =       "Yair Weiss",
-  title =        "Correctness of local probability propagation in
-                 graphical models with loops",
-  journal =      "Neural Computation",
-  volume =       "12",
-  pages =        "1--41",
-  year =         "2000",
-}
-
-@Book{Weiss90,
-  author =       "S. M. Weiss and C. A. Kulikowski",
-  title =        "Computer Systems That Learn",
-  publisher =    "Morgan Kaufmann",
-  address =      "San Mateo, CA",
-  year =         "1990",
-}
-
-@InProceedings{Welling05,
-  author =       "Max Welling and Michal Rosen-Zvi and Geoffrey E. Hinton",
-  editor =       NIPS17ed,
-  booktitle =    NIPS17,
-  title =        "Exponential Family Harmoniums with an Application to
-                 Information Retrieval",
-  volume =       "17",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  pages =        "1481--1488",
-  year =         "2005",
-}
-
-@InProceedings{Welling05-small,
-  author =       "M. Welling and M. Rosen-Zvi and G. E. Hinton",
-  booktitle =    "NIPS 17",
-  title =        "Exponential Family Harmoniums with an Application to
-                 Information Retrieval",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  year =         "2005",
-}
-
-@InProceedings{Welling2003,
-  author =       "Max Welling and Richard Zemel and Geoffrey E. Hinton",
-  editor =       NIPS15ed,
-  booktitle =    NIPS15,
-  title =        "Self-Supervised Boosting",
-  publisher =    "{MIT} Press",
-  pages =        "665--672",
-  year =         "2003",
-}
-
-@InProceedings{WellingM2002,
-  author =       "Max Welling and Geoffrey E. Hinton",
-  booktitle =    "ICANN '02: Proceedings of the International Conference
-                 on Artificial Neural Networks",
-  title =        "A New Learning Algorithm for Mean Field {Boltzmann}
-                 Machines",
-  publisher =    "Springer-Verlag",
-  address =      "London, UK",
-  pages =        "351--357",
-  year =         "2002",
-  ISBN =         "3-540-44074-7",
-}
-
-@InProceedings{WellingNIPS17,
-  author =       "Max Welling and Michal Rosen-Zvi and Geoffrey E. Hinton",
-  editor =       NIPS17ed,
-  booktitle =    NIPS17,
-  title =        "Exponential Family Harmoniums with an Application to
-                 Information Retrieval",
-  publisher =    "{MIT} Press",
-  address =      {Cambridge, MA},
-  pages =        "1481--1488",
-  year =         "2005",
-}
-
-@InProceedings{WellingNIPS17-small,
-  author =       "M. Welling and M. Rosen-Zvi and G. E. Hinton",
-  booktitle =    "NIPS 17",
-  title =        "Exponential Family Harmoniums with an Application to
-                 Information Retrieval",
-  publisher =    "{MIT} Press",
-  year =         "2005",
-}
-
-
-@InProceedings{WellingUAI2009,
-  author =       "Max Welling",
-  booktitle =    UAI09,
-  title =        "Herding Dynamic Weights for Partially Observed Random Field Models",
-  publisher =    "Morgan Kaufmann",
-  year =         "2009",
-}
-
-@InProceedings{WellingICML2009,
-  author =       "Max Welling",
-  booktitle =    ICML09,
-  editor =       ICML09ed,
-  publisher =    ICML09publ,
-  title =        {Herding Dynamic Weights to Learn},
-  year =         "2009",
-}
-
-@InProceedings{Werbos-icnn88,
-  author =       "P. J. Werbos",
-  booktitle =    icnn,
-  title =        "Back-Propagation: Past and Future",
-  publisher =    "IEEE Press",
-  address =      "New York, NY",
-  year =         "1988",
-  OPTpages =     "343--353",
-}
-
-@PhdThesis{Werbos74,
-  author =       "P. Werbos",
-  title =        "Beyond Regression: New Tools for Prediction and
-                 Analysis in the Behavioral Sciences",
-  school =       "Harvard University",
-  year =         "1974",
-}
-
-@Article{Werbos87,
-  author =       "P. J. Werbos",
-  title =        "Building and Understanding Adaptive Systems: {A}
-                 Statistical/Numerical Approach to Factory Automation
-                 and Brain Research",
-  journal =      ieeesmc,
-  volume =       "17",
-  pages =        "7--20",
-  year =         "1987",
-}
-
-@Article{Werbos88,
-  author =       "P. J. Werbos",
-  title =        "Generalization of Backpropagation with Application to
-                 a Recurrent Gas Market Model",
-  journal =      nn,
-  volume =       "1",
-  pages =        "339--356",
-  year =         "1988",
-}
-
-@InProceedings{wermuth+cox92,
-  author =       "N. Wermuth and D. R. Cox",
-  booktitle =    "Proceedings of the 10th Symposium on Computational
-                 Statistics",
-  title =        "Graphical models for dependencies and associations",
-  volume =       "1",
-  publisher =    "Physica-Verlag",
-  address =      "Heidelberg",
-  pages =        "235--249",
-  year =         "1992",
-}
-
-@Article{wermuth+lauritzen90,
-  author =       "N. Wermuth and S. L. Lauritzen",
-  title =        "On substantive research hypotheses, conditional
-                 independence graphs and graphical chain models",
-  journal =      "J. Roy. Statist. Soc. Ser. B",
-  volume =       "52",
-  pages =        "21--72",
-  year =         "1990",
-}
-
-@Article{Wessels-trnn92,
-  author =       "L. F. A. Wessels and E. Barnard",
-  title =        "Avoiding False Local Minima by Proper Initialization
-                 of Connections",
-  journal =      ieeetrnn,
-  volume =       "3",
-  number =       "6",
-  pages =        "899--905",
-  year =         "1992",
-}
-
-@Article{weston03zeronorm,
-  author =       "Jason Weston and Andr{\'e} Elisseeff and Bernhard
-                 Sch{\"o}lkopf and Mike Tipping",
-  title =        "Use of the zero norm with linear models and kernel
-                 methods",
-  journal =      jmlr,
-  volume =       "3",
-  publisher =    "MIT Press",
-  pages =        "1439--1461",
-  year =         "2003",
-  ISSN =         "1533-7928",
-}
-
-@InProceedings{weston99density,
-  author =       "J. Weston and A. Gammerman and M. Stitson and V.
-                 Vapnik and V. Vovk and C. Watkins",
-  editor =       "B. Sch{\"o}lkopf and C. J. C. Burges and A. J. Smola",
-  booktitle =    "Advances in Kernel Methods --- Support Vector
-                 Learning",
-  title =        "Density estimation using support vector machines",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  pages =        "293--306",
-  year =         "1999",
-}
-
-@InProceedings{WestonJ2008,
-  author =       "Jason Weston and Fr{\'e}d{\'e}ric Ratle and Ronan
-                 Collobert",
-  booktitle =    ICML08,
-  editor =       ICML08ed,
-  publisher =    ICML08publ,
-  title =        "Deep Learning via Semi-Supervised Embedding",
-  year =         "2008",
-  isbn =         {978-1-60558-205-4},
-  pages =        {1168--1175},
-  location =     {Helsinki, Finland},
-  doi =          {10.1145/1390156.1390303},
-  address =      {New York, NY, USA},
-}
-  %url =          "http://www.kyb.tuebingen.mpg.de/bs/people/weston/papers/deep-embed.pdf",
-
-@InProceedings{WestonJ2008-small,
-  author =       "J. Weston and F. Ratle and R. Collobert",
-  booktitle =    "ICML 2008",
-  title =        "Deep Learning via Semi-Supervised Embedding",
-  year =         "2008",
-}
-
-@InProceedings{WestonJ2008-short,
-  author =       "J. Weston and F. Ratle and R. Collobert",
-  booktitle =    "Int. Conf. Mach. Learn. 2008",
-  title =        "Deep Learning via Semi-Supervised Embedding",
-  year =         "2008",
-  pages = {1168--1175},
-}
-
-@InProceedings{MobahiCollobertWestonICML2009,
-  author =    {Hossein Mobahi and Ronan Collobert and Jason Weston},
-  title =     {Deep Learning from Temporal Coherence in Video},
-  booktitle = {Proceedings of the 26th International Conference on Machine Learning},
-  pages =     {737--744},
-  year =      2009,
-  editor =    {L{\'e}on Bottou and Michael Littman},
-  address =   {Montreal},
-  month =     jun,
-  publisher = {Omnipress}
-}
-
-@Article{White89,
-  author =       "H. White",
-  title =        "Learning in Artificial Neural Networks: {A}
-                 Statistical Perspective",
-  journal =      "Neural Computation",
-  volume =       "1",
-  type =         "Review",
-  number =       "4",
-  pages =        "425--464",
-  year =         "1989",
-}
-
-@Article{White90,
-  author =       "H. White",
-  title =        "Connectionist nonparametric regression: {Multilayer}
-                 feedforward networks can learn arbitrary mappings",
-  journal =      "Neural Networks",
-  volume =       "3",
-  number =       "5",
-  publisher =    "Pergamon Press Ltd., Inc.",
-  pages =        "535--549",
-  year =         "1990",
-}
-
-@InProceedings{White91,
-  author =       "H. White",
-  booktitle =    "?",
-  title =        "An overview of representation and convergence results
-                 for multilayer feedforward networks",
-  pages =        "",
-  year =         "1991",
-}
-
-@InProceedings{Whitley89,
-  author =       "D. Whitley and T. Hanson",
-  editor =       "J. D. Schaffer",
-  booktitle =    "Proceedings of the Third International Conference on
-                 Genetic Algorithms",
-  title =        "Optimizing Neural Networks Using Faster, More Accurate
-                 Genetic Search",
-  publisher =    "Morgan Kaufmann, San Mateo",
-  address =      "Arlington 1989",
-  pages =        "391--396",
-  year =         "1989",
-}
-
-@Book{whittaker90,
-  author =       "J. Whittaker",
-  title =        "Graphical Models in Applied Multivariate Statistics",
-  publisher =    "Wiley",
-  address =      "Chichester",
-  year =         "1990",
-}
-
-@InCollection{Widrow60,
-  author =       "B. Widrow and M. E. Hoff",
-  booktitle =    "1960 IRE WESCON Convention Record",
-  title =        "Adaptive Switching Circuits",
-  volume =       "4",
-  publisher =    "IRE",
-  address =      "New York",
-  pages =        "96--104",
-  year =         "1960",
-}
-
-@InProceedings{Widrow62,
-  author =       "B. Widrow",
-  editor =       "M. C. Yovits and G. T. Jacobi and G. D. Goldstein",
-  booktitle =    "Self-Organizing Systems 1962",
-  title =        "Generalization and Information Storage in Networks of
-                 Adaline ``Neurons''",
-  publisher =    "Spartan, Washington",
-  address =      "Chicago 1962",
-  pages =        "435--461",
-  year =         "1962",
-}
-
-@Article{Widrow73,
-  author =       "B. Widrow and N. K. Gupta and S. Maitra",
-  title =        "Punish/Reward: Learning with a Critic in Adaptive
-                 Threshold Systems",
-  journal =      ieeesmc,
-  volume =       "3",
-  pages =        "455--465",
-  year =         "1973",
-}
-
-@Book{Wiener48,
-  author =       "N. Wiener",
-  title =        "Cybernetics, or Control and Communication in the
-                 Animal and the Machine",
-  publisher =    "Wiley",
-  address =      "New York",
-  year =         "1948",
-}
-
-@Book{Wiener49,
-  author =       "N. Wiener",
-  title =        "The Extrapolation, Interpolation and Smoothing of
-                 Stationary Time Series with Engineering Applications",
-  publisher =    "Wiley",
-  address =      "New York",
-  year =         "1949",
-}
-
-@Article{Wilbur+Lipman83,
-  author =       "W. J. Wilbur and D. J. Lipman",
-  title =        "Rapid similarity searches of nucleic acids and protein
-                 data banks",
-  journal =      "Proc. Natl. Acad. Sci. USA",
-  volume =       "80",
-  pages =        "726--730",
-  year =         "1983",
-}
-
-@TechReport{Wilks1996,
-  author =       "Yorick Wilks and Mark Stevenson",
-  title =        "The grammar of sense: Is word sense tagging much more
-                 than part-of-speech tagging?",
-  institution =  "University of Sheffield",
-  year =         "1996",
-}
-
-@Article{Williams+Barclay88,
-  author =       "A. F. Williams and A. N. Barclay",
-  title =        "The immunoglobulin superfamily domains for cell
-                 surface recognition",
-  journal =      "Annual Review of Immunology",
-  volume =       "6",
-  pages =        "381--405",
-  year =         "1988",
-}
-
-@InProceedings{Williams+Rasmussen-nips8,
-  author =       "C. K. I. Williams and C. E. Rasmussen",
-  editor =       NIPS8ed,
-  booktitle =    NIPS8,
-  title =        "Gaussian Processes for Regression",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  pages =        "514--520",
-  year =         "1996",
-}
-
-@InProceedings{Williams+Seeger-2000,
-  author =       "C. K. I. Williams and M. Seeger",
-  booktitle =    "Proceedings of the Seventeenth International
-                 Conference on Machine Learning",
-  title =        "The Effect of the Input Density Distribution on
-                 Kernel-based Classifiers",
-  publisher =    "Morgan Kaufmann",
-  year =         "2000",
-}
-
-@InProceedings{Williams+Seeger-2001,
-  author =       "Christopher K. I. Williams and Matthias Seeger",
-  editor =       NIPS13ed,
-  booktitle =    NIPS13,
-  title =        "Using the {Nystr{\"o}m} Method to Speed Up Kernel
-                 Machines",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  pages =        "682--688",
-  year =         "2001",
-}
-
-@InProceedings{Williams2001,
-  author =       "C. K. I. Williams",
-  editor =       NIPS13ed,
-  booktitle =    NIPS13,
-  title =        "On a Connection between Kernel {PCA} and Metric
-                 Multidimensional Scaling",
-  publisher =    "{MIT} Press",
-  pages =        "675--681",
-  year =         "2001",
-}
-
-@InProceedings{Williams87,
-  author =       "R. J. Williams",
-  editor =       "M. Caudill and C. Butler",
-  booktitle =    icnn,
-  title =        "A Class of Gradient-Estimating Algorithms for
-                 Reinforcement Learning in Neural Networks",
-  volume =       "2",
-  publisher =    "IEEE, New York",
-  address =      "San Diego 1987",
-  pages =        "601--608",
-  year =         "1987",
-}
-
-@InProceedings{Williams88a,
-  author =       "R. J. Williams",
-  booktitle =    icnn,
-  title =        "On the Use of Back-Propagation in Associative
-                 Reinforcement Learning",
-  volume =       "1",
-  publisher =    "IEEE, New York",
-  address =      "San Diego 1988",
-  pages =        "263--270",
-  year =         "1988",
-}
-
-@TechReport{Williams88b,
-  author =       "R. J. Williams",
-  title =        "Towards a Theory of Reinforcement-Learning
-                 Connectionist Systems",
-  number =       "NU--CCS--88--3",
-  institution =  "College of Computer Science, Northeastern University",
-  address =      "Boston, MA",
-  year =         "1988",
-}
-
-@InProceedings{Williams89a,
-  author =       "R. J. Williams and J. Peng",
-  booktitle =    ijcnn,
-  title =        "Reinforcement Learning Algorithms As Function
-                 Optimizers",
-  volume =       "2",
-  publisher =    "IEEE, New York",
-  address =      "Washington 1989",
-  pages =        "89--95",
-  year =         "1989",
-}
-
-@Article{Williams89b,
-  author =       "R. J. Williams and D. Zipser",
-  title =        "A Learning Algorithm for Continually Running Fully
-                 Recurrent Neural Networks",
-  journal =      nc,
-  volume =       "1",
-  pages =        "270--280",
-  year =         "1989",
-}
-
-@Article{Williams89c,
-  author =       "R. J. Williams and D. Zipser",
-  title =        "Experimental Analysis of the Real-Time Recurrent
-                 Learning Algorithm",
-  journal =      connsci,
-  volume =       "1",
-  pages =        "87--111",
-  year =         "1989",
-}
-
-@InProceedings{Williams93,
-  author =       "William Evans and Sridhar Rajagopalan and Umesh
-                 Vazirani",
-  booktitle =    "Proceedings of the 6th Annual Conference on
-                 Computational Learning Theory",
-  title =        "Choosing a Reliable Hypothesis",
-  publisher =    "ACM Press",
-  address =      "Santa Cruz, CA, USA",
-  pages =        "269--276",
-  month =        jul,
-  year =         "1993",
-  ISBN =         "0-89791-611-5",
-}
-
-% Publication year and pages match the NIPS 8 proceedings volume (the
-% conference itself was held in 1995, hence the citation key).
-@InProceedings{williams95gaussian,
-  author =       "Christopher K. I. Williams and Carl Edward Rasmussen",
-  editor =       NIPS8ed,
-  booktitle =    NIPS8,
-  title =        "{Gaussian} Processes for Regression",
-  volume =       "8",
-  publisher =    "{MIT} Press",
-  address =      "Cambridge, MA",
-  pages =        "514--520",
-  year =         "1996",
-  ISBN =         "0-262-20107-0",
-}
-
-@InProceedings{Williams96-nips,
-  author =       "C. K. I. Williams",
-  editor =       NIPS9ed,
-  booktitle =    NIPS9,
-  title =        "Computing with infinite networks",
-  publisher =    "MIT Press",
-  year =         "1997",
-}
-
-@InProceedings{WilliamsC1990,
-  author =       {Christopher K. I. Williams and Geoffrey E. Hinton},
-  title =        {Mean field networks that learn to discriminate temporally distorted strings},
-  booktitle =    {Connectionist Models: Proceedings of the 1990 Connectionist Summer School},
-  publisher =    {Morgan Kaufmann},
-  address =      {San Mateo, CA},
-  year =         {1990},
-}
-
-@Article{Willshaw69,
-  author =       "D. J. Willshaw and O. P. Buneman and H. C.
-                 Longuet-Higgins",
-  title =        "Non-Holographic Associative Memory",
-  journal =      nature,
-  volume =       "222",
-  pages =        "960--962",
-  year =         "1969",
-}
-
-@Article{Willshaw76,
-  author =       "D. J. Willshaw and C. von der Malsburg",
-  title =        "How Patterned Neural Connections Can Be Set Up by
-                 Self-Organization",
-  journal =      PRSLB,
-  volume =       "194",
-  pages =        "431--445",
-  year =         "1976",
-}
-
-@Article{Wilson-2003,
-  author =       "D. Randall Wilson and Tony R. Martinez",
-  title =        "The general inefficiency of batch training for
-                 gradient descent learning",
-  journal =      "Neural Networks",
-  volume =       "16",
-  number =       "10",
-  publisher =    "Elsevier Science Ltd.",
-  address =      "Oxford, UK",
-  pages =        "1429--1451",
-  year =         "2003",
-  ISSN =         "0893-6080",
-}
-
-@InProceedings{Wilson2007,
-  author =       "D. Keith Wilson",
-  booktitle =    "Proceedings of NOISE-CON 2007",
-  title =        "Weather effects and outdoor noise exposure: Where,
-                 when, and how often to measure?",
-  address =      "Reno, Nevada",
-  year =         "2007",
-}
-
-@Article{Wilson73,
-  author =       "H. R. Wilson and J. D. Cowan",
-  title =        "A Mathematical Theory of the Functional Dynamics of
-                 Cortical and Thalamic Nervous Tissue",
-  journal =      kyb,
-  volume =       "13",
-  pages =        "55--80",
-  year =         "1973",
-}
-
-@Article{Wilson88,
-  author =       "G. V. Wilson and G. S. Pawley",
-  title =        "On the Stability of the Travelling Salesman Problem
-                 Algorithm of Hopfield and Tank",
-  journal =      biocyb,
-  volume =       "58",
-  pages =        "63--70",
-  year =         "1988",
-}
-
-@InProceedings{wilson97instance,
-  author =       "D. Randall Wilson and Tony R. Martinez",
-  booktitle =    "Proc. 14th International Conference on Machine
-                 Learning",
-  title =        "Instance pruning techniques",
-  publisher =    "Morgan Kaufmann",
-  pages =        "403--411",
-  year =         "1997",
-  URL =          "citeseer.nj.nec.com/wilson97instance.html",
-}
-
-@Book{Winograd63,
-  author =       "S. Winograd and J. D. Cowan",
-  title =        "Reliable Computation in the Presence of Noise",
-  publisher =    "MIT Press",
-  address =      "Cambridge",
-  year =         "1963",
-}
-
-@Article{Winters89,
-  author =       "J. H. Winters and C. Rose",
-  title =        "Minimum Distance Automata in Parallel Networks for
-                 Optimum Classification",
-  journal =      nn,
-  volume =       "2",
-  pages =        "127--132",
-  year =         "1989",
-}
-
-@Article{WisSej2002,
-  author =       "L. Wiskott and T. J. Sejnowski",
-  title =        "Slow Feature Analysis: Unsupervised Learning of
-                 Invariances",
-  journal =      "Neural Computation",
-  volume =       "14",
-  number =       "4",
-  pages =        "715--770",
-  year =         "2002",
-  urlabstract =  "{http://itb.biologie.hu-berlin.de/~wiskott/Abstracts/WisSej2002.html}",
-  urlpaper =     "{http://itb.biologie.hu-berlin.de/~wiskott/Publications/WisSej2002-LearningInvariances-NC.ps.gz}",
-}
-
-@TechReport{Witbrock+Zagha-1989,
-  author =       "Michael Witbrock and Marco Zagha",
-  title =        "An Implementation of Back-Propagation Learning on
-                 {GF11}, a Large {SIMD} Parallel Computer",
-  number =       "CMU-CS-89-208",
-  institution =  "Carnegie Mellon University",
-  year =         "1989",
-}
-
-@Book{Wittgenstein58,
-  author =       "L. Wittgenstein",
-  title =        "Philosophical Investigations",
-  publisher =    "Blackwell",
-  address =      "Oxford",
-  year =         "1958",
-}
-
-@InProceedings{Wittner88,
-  author =       "B. S. Wittner and J. S. Denker",
-  editor =       nips87ed,
-  booktitle =    nips87,
-  title =        "Strategies for Teaching Layered Networks
-                 Classification Tasks",
-  publisher =    "American Institute of Physics, New York",
-  address =      "Denver, CO",
-  pages =        "850--859",
-  year =         "1988",
-}
-
-@Book{WL90,
-  author =       "A. Waibel and K. F. Lee",
-  title =        "Readings in Speech Recognition",
-  publisher =    "Morgan Kaufmann",
-  year =         "1990",
-}
-
-@Article{Wolpert-1996,
-  author =       "D. H. Wolpert",
-  title =        "The lack of a priori distinction between learning
-                 algorithms",
-  journal =      "Neural Computation",
-  volume =       "8",
-  number =       "7",
-  pages =        "1341--1390",
-  year =         "1996",
-}
-
-@Article{Wolpert92,
-  author =       "D. H. Wolpert",
-  title =        "Stacked Generalization",
-  journal =      "Neural Networks",
-  volume =       "5",
-  pages =        "241--249",
-  year =         "1992",
-}
-
-@TechReport{wolpert95,
-  author =       "D. Wolpert and W. Macready",
-  title =        "No free lunch theorems for search",
-  number =       "SFI-TR-95-02-010",
-  institution =  "The Santa Fe Institute",
-  year =         "1995",
-}
-
-@article{wolpert96no,
-  author =       "D. Wolpert and W. Macready",
-  title =        "No free lunch theorems for optimization",
-  year =         "1997",
-  journal =      "IEEE Transactions on Evolutionary Computation",
-  volume =       1,
-  number =       1,
-  pages =        {67--82},
-}
-
-% Fellbaum edited this collection, so she is recorded as editor.
-@Book{wordnet-book98,
-  editor =       "Christiane Fellbaum",
-  title =        "{WordNet}: An Electronic Lexical Database",
-  publisher =    "MIT Press",
-  year =         "1998",
-}
-
-@TechReport{wrong-delve-citation,
-  author =       "G. Hinton and R. Neal and R. Tibshirani",
-  title =        "Assessing learning procedures using {DELVE}",
-  institution =  "University of Toronto, Department of Computer Science,
-                 http://www.cs.utoronto.ca/neuron/delve/delve.html.",
-  year =         "1995",
-}
-
-% The paper has two authors (Mor\'e and Wu); key kept for existing citations.
-@Article{Wu-97,
-  author =       "Jorge J. Mor{\'e} and Zhijun Wu",
-  title =        "Global continuation for distance geometry problems",
-  journal =      "{SIAM} Journal on Optimization",
-  volume =       "7",
-  pages =        "814--836",
-  year =         "1997",
-}
-
-% The paper has two authors (Mor\'e and Wu); key kept for existing citations.
-@Article{Wu-97-short,
-  author =       "J. J. Mor{\'e} and Z. Wu",
-  title =        "Global continuation for distance geometry problems",
-  journal =      "{SIAM} J. Optimization",
-  volume =       "7",
-  pages =        "814--836",
-  year =         "1997",
-}
-
-@Article{Wu97,
-  author =       "C. H. Wu",
-  title =        "Artificial neural networks for molecular sequence
-                 analysis",
-  journal =      "Comp. Chem.",
-  volume =       "21",
-  pages =        "237--256",
-  year =         "1997",
-}
-
-@InProceedings{XingE2005,
-  author =       "Eric P. Xing and Rong Yan and Alexander G. Hauptmann",
-  booktitle =    UAI05,
-  title =        "Mining Associated Text and Images with Dual-Wing
-                 Harmoniums",
-  publisher =    "AUAI Press",
-  pages =        "633--641",
-  year =         "2005",
-  ISBN =         "0-9749039-1-4",
-  OPTdate =      "2007-07-26",
-  OPTcrossref =  "conf/uai/2005",
-  OPTdescription = "dblp",
-  OPTee =        "http://uai.sis.pitt.edu/displayArticleDetails.jsp?mmnu=1&smnu=2&article-id=1184&proceeding-id=21",
-  OPTkeywords =  "dblp",
-}
-  %url =       "http://dblp.uni-trier.de/db/conf/uai/uai2005.html#XingYH05",
-
-@InProceedings{Xu+Rudnicky-2000,
-  author =       "Wei Xu and Alex Rudnicky",
-  booktitle =    "International Conference on Spoken Language
-                 Processing",
-  title =        "Can Artificial Neural Networks Learn Language Models?",
-  address =      "Beijing, China",
-  pages =        "M1--13",
-  year =         "2000",
-}
-
-@InProceedings{Xu-Emami-Jelinek-2003,
-  author =       "P. Xu and A. Emami and F. Jelinek",
-  booktitle =    "Proceedings of the 2003 Conference on Empirical
-                 Methods in Natural Language Processing (EMNLP'2003)",
-  title =        "Training Connectionist Models for the Structured
-                 Language Model",
-  volume =       "10",
-  pages =        "160--167",
-  year =         "2003",
-}
-
-@Misc{xu-jordan-94,
-  author =       "L. Xu and M. I. Jordan",
-  title =        "Theoretical and experimental studies of convergence
-                 properties of the {EM} algorithm for unsupervised
-                 learning based on finite mixtures",
-  address =      "Snowbird, UT",
-  year =         "1994",
-  note =         "Presented at the Neural Networks for Computing
-                 Conference",
-}
-
-@inproceedings{xuetal04,
-author = "Xu, L. and Neufeld, J. and Larson, B. and Schuurmans, D.",
-title = "Maximum margin clustering",
-editor =       NIPS17ed,
-booktitle =    NIPS17,
-year = 2004,
-}
-
-@inproceedings{Xu-ICML-2006,
-author = "Xu, L. and Wilkinson, D. and Southey, F. and Schuurmans, D.",
-title = "Discriminative unsupervised learning of structured predictors",
-booktitle =    ICML06,
-editor =       ICML06ed,
-publisher =    ICML06publ,
-year = 2006,
-}
-
-@InProceedings{Xu-AAAI-2006,
-  author =       "L. Xu and K. Crammer and D. Schuurmans",
-  booktitle =    "Twenty-first National Conference on Artificial
-                 Intelligence (AAAI-06)",
-  title =        "Robust support vector machine training via convex
-                 outlier ablation",
-  year =         "2006",
-}
-
-
-
-@Misc{YA97a,
-  author =       "Howard Hua Yang and {Shun-ichi} Amari",
-  title =        "Natural Gradient Descent for Training Multi-Layer
-                 Perceptrons",
-  year =         "1997",
-  URL =          "citeseer.ist.psu.edu/hua96natural.html",
-}
-
-@Article{yang98complexity,
-  author =       "Howard Hua Yang and {Shun-ichi} Amari",
-  title =        "Complexity Issues in Natural Gradient Descent Method
-                 for Training Multi-Layer Perceptrons",
-  journal =      "Neural Computation",
-  volume =       "10",
-  number =       "8",
-  pages =        "2137--2157",
-  year =         "1998",
-  URL =          "citeseer.ist.psu.edu/91462.html",
-}
-
-@inproceedings{Yang+al-2006,
-    author = {Xin Yang and Haoying Fu and Hongyuan Zha and Jesse Barlow},
-    title = {Semi-supervised nonlinear dimensionality reduction},
-    booktitle = {Proceedings of the 23rd International Conference on Machine Learning},
-    year = {2006},
-    isbn = {1-59593-383-2},
-    pages = {1065--1072},
-    location = {Pittsburgh, Pennsylvania},
-    doi = {10.1145/1143844.1143978},
-    publisher = {ACM},
-    address = {New York, NY, USA},
-}
-
-% The note uses \url, so the citing document must load the url or hyperref package.
-@misc{Yang+Jin-2006,
-    author = {Liu Yang and Rong Jin},
-    title = {Distance Metric Learning: A Comprehensive Survey},
-    year = 2006,
-    note = {\url{http://www.cse.msu.edu/~yangliu1/frame_survey_v2.pdf}},
-}
-
-% The note uses \url, so the citing document must load the url or hyperref package.
-@misc{Yang-2007,
-    author = {Liu Yang},
-    title = {An Overview of Distance Metric Learning},
-    year = 2007,
-    note = {\url{http://www.cse.msu.edu/~yangliu1/dist_overview.pdf}},
-}
-
-@InProceedings{YangL2007,
-  author =       "Liu Yang and Rong Jin and Caroline Pantofaru and Rahul
-                 Sukthankar",
-  booktitle =    cvpr07,
-  title =        "Discriminative Cluster Refinement: Improving Object
-                 Category Recognition Given Limited Training Data",
-  month =        jun,
-  year =         "2007",
-}
-
-@InProceedings{Yao85,
-  author =       "Andrew Yao",
-  booktitle =    "Proceedings of the 26th Annual {IEEE} Symposium on
-                 Foundations of Computer Science",
-  title =        "Separating the polynomial-time hierarchy by oracles",
-  pages =        "1--10",
-  year =         "1985",
-}
-
-@InProceedings{Yarowsky-92,
-  author =       "David Yarowsky",
-  booktitle =    "Proceedings of the 14th International Conference on
-                 Computational Linguistics (COLING-92)",
-  title =        "Word-sense disambiguation using statistical models of
-                 {Roget}'s categories trained on large corpora",
-  address =      "Nantes, France",
-  pages =        "454--460",
-  year =         "1992",
-}
-
-@InProceedings{Yarowsky-93,
-  author =       "David Yarowsky",
-  booktitle =    "{ARPA} Workshop on Human Language Technology",
-  title =        "One sense per collocation",
-  address =      "Princeton, {NJ}",
-  year =         "1993",
-}
-
-@InProceedings{Yarowsky-95,
-  author =       "David Yarowsky",
-  booktitle =    "33rd Annual Meeting of the {ACL}",
-  title =        "Unsupervised word sense disambiguation rivaling
-                 supervised methods",
-  address =      "Cambridge, {MA}",
-  pages =        "189--196",
-  year =         "1995",
-}
-
-@InProceedings{Yarowsky1994,
-  author =       "David Yarowsky",
-  booktitle =    "Meeting of the Association for Computational
-                 Linguistics",
-  title =        "Decision Lists for Lexical Ambiguity Resolution:
-                 Application to Accent Restoration in Spanish and
-                 French",
-  pages =        "88--95",
-  year =         "1994",
-  URL =          "citeseer.nj.nec.com/yarowsky94decision.html",
-}
-
-@InProceedings{Yarowsky1995,
-  author =       "David Yarowsky",
-  booktitle =    "Meeting of the Association for Computational
-                 Linguistics",
-  title =        "Unsupervised Word Sense Disambiguation Rivaling
-                 Supervised Methods",
-  pages =        "189--196",
-  year =         "1995",
-  URL =          "citeseer.nj.nec.com/yarowsky95unsupervised.html",
-}
-
-@TechReport{Yianilos95,
-  author =       "Peter N. Yianilos",
-  title =        "Metric Learning via Normal Mixtures",
-  institution =  "NEC Research Institute",
-  address =      "Princeton, NJ",
-  month =        oct,
-  year =         "1995",
-}
-
-@InProceedings{Younes98onthe,
-    author = {Laurent Younes},
-    title = {On The Convergence Of Markovian Stochastic Algorithms With Rapidly Decreasing Ergodicity Rates},
-    booktitle = {Stochastics and Stochastics Models},
-    year = {1998},
-    pages = {177--228}
-}
-
-@Article{Young+Sachs79,
-  author =       "E. D. Young and M. B. Sachs",
-  title =        "Representation of steady-state vowels in the temporal
-                 aspects of the discharge pattern of population of
-                 auditory nerve fibers",
-  journal =      jasa,
-  volume =       "66",
-  number =       "5",
-  pages =        "1381--1403",
-  year =         "1979",
-}
-
-@InProceedings{Yu+Simmons90,
-  author =       "Y. H. Yu and R. F. Simmons",
-  booktitle =    ijcnn,
-  title =        "Extra output biased learning",
-  publisher =    "Lawrence Erlbaum, Hillsdale",
-  address =      "Washington 1990",
-  year =         "1990",
-}
-
-@Article{Yu-trnn92,
-  author =       "X. H. Yu",
-  title =        "Can Backpropagation Error Surface Not Have Local
-                 Minima?",
-  journal =      ieeetrnn,
-  volume =       "3",
-  number =       "6",
-  pages =        "1019--1020",
-  year =         "1992",
-}
-
-@Article{Yu92,
-  author =       "X. H. Yu",
-  title =        "Can Backpropagation Error Surface Not Have Local
-                 Minima?",
-  journal =      ieeetrnn,
-  volume =       "3",
-  number =       "6",
-  pages =        "1019--1020",
-  year =         "1992",
-}
-
-@InProceedings{Yuille2005,
-  author =       "Alan L. Yuille",
-  editor =       NIPS17ed,
-  booktitle =    NIPS17,
-  title =        "The Convergence of Contrastive Divergences",
-  publisher =    "{MIT} Press",
-  pages =        "1593--1600",
-  year =         "2005",
-}
-
-@Article{Yuille89,
-  author =       "Alan L. Yuille and D. M. Kammen and D. S. Cohen",
-  title =        "Quadrature and the Development of Orientation
-                 Selective Cortical Cells by Hebb Rules",
-  journal =      biocyb,
-  volume =       "61",
-  pages =        "183--194",
-  year =         "1989",
-}
-
-@Article{Yuille90,
-  author =       "Alan L. Yuille",
-  title =        "Generalized Deformable Models, Statistical Physics,
-                 and Matching Problems",
-  journal =      "Neural Computation",
-  volume =       "2",
-  number =       "1",
-  pages =        "1--24",
-  year =         "1990",
-}
-
-@Article{Zak-nn92,
-  author =       "M. Zak",
-  title =        "Terminal Attractors in Neural Networks",
-  journal =      nn,
-  volume =       "2",
-  pages =        "259--274",
-  year =         "1989",
-}
-
-@Article{Zak88,
-  author =       "M. Zak",
-  title =        "Terminal Attractors for Addressable Memory in Neural
-                 Networks",
-  journal =      plettA,
-  volume =       "133",
-  pages =        "18--22",
-  year =         "1988",
-}
-
-@Article{Zak89,
-  author =       "M. Zak",
-  title =        "Terminal Attractors in Neural Networks",
-  journal =      nn,
-  volume =       "2",
-  pages =        "259--274",
-  year =         "1989",
-}
-
-@Article{Zavaliagkos93,
-  author =       "G. Zavaliagkos and S. Austin and J. Makhoul and R.
-                 Schwartz",
-  title =        "A Hybrid Continuous Speech Recognition System Using
-                 Segmental Neural Nets with Hidden {Markov} Models",
-  journal =      "Int. Journal of Pattern Recognition and Artificial
-                 Intelligence",
-  pages =        "305--319",
-  year =         "1993",
-  note =         "Special Issue on Applications of Neural Networks to
-                 Pattern Recognition (I. Guyon Ed.)",
-}
-
-@InProceedings{Zell+al-1993,
-  author =       "Andreas Zell and Niels Mache and Michael Vogt and
-                 Markus H{\"u}ttel",
-  booktitle =    "Proceedings of the IEEE International Conference on
-                 Neural Networks",
-  title =        "Problems of Massive Parallelism in Neural Network
-                 Simulation",
-  volume =       "3",
-  address =      "San Francisco, CA",
-  pages =        "1890--1895",
-  year =         "1993",
-}
-
-@InProceedings{Zemel90,
-  author =       "R. S. Zemel and M. C. Mozer and G. E. Hinton",
-  editor =       NIPS2ed,
-  booktitle =    NIPS2,
-  title =        "Recognizing objects using hierarchical reference frame
-                 transformations",
-  address =      "San Mateo, CA",
-  year =         "1990",
-}
-
-@PhdThesis{Zemel93-thesis,
-  author =       "Richard S. Zemel",
-  title =        "A Minimum Description Length Framework for
-                 Unsupervised Learning",
-  school =       "University of Toronto",
-  year =         "1993",
-}
-
-@InProceedings{Zha2002,
-  author =       "H. Zha and C. Ding and M. Gu and X. He and H. Simon",
-  editor =       NIPS14ed,
-  booktitle =    NIPS14,
-  title =        "Spectral relaxation for {K}-means clustering",
-  publisher =    "{MIT} Press",
-  year =         "2002",
-}
-
-@InProceedings{Zhang-nips90,
-  author =       "X. Zhang and Others",
-  editor =       NIPS2ed,
-  booktitle =    NIPS2,
-  title =        "An Efficient Implementation of the Backpropagation
-                 Algorithm on the Connection Machine {CM}-2",
-  publisher =    "Morgan Kaufmann",
-  address =      "San Mateo, CA",
-  pages =        "801--809",
-  year =         "1990",
-}
-
-@Misc{zhang-workshop-2005,
-  author =       "Jian Zhang",
-  title =        "Sparsity Models for Multi-task Learning",
-  howpublished = "'Inductive Transfer: 10 Years Later' NIPS Workshop",
-  year =         "2005",
-  OPTkey =       "",
-}
-
-@TechReport{Zhang2001,
-  author =       "Bin Zhang",
-  title =        "Is the Maximal Margin Hyperplane Special in a Feature
-                 Space?",
-  number =       "HPL-2001-89",
-  institution =  "Hewlett-Packard Labs",
-  year =         "2001",
-}
-
-@article{Zhang+Zha-2005,
-    address = {Philadelphia, PA},
-    author = {Zhang, Zhenyue   and Zha, Hongyuan  },
-    doi = {10.1137/S1064827502419154},
-    issn = {1064-8275},
-    journal = {SIAM Journal on Scientific Computing},
-    number = {1},
-    pages = {313--338},
-    publisher = {Society for Industrial and Applied Mathematics},
-    title = {Principal Manifolds and Nonlinear Dimensionality Reduction via Tangent Space Alignment},
-    url = {http://portal.acm.org/citation.cfm?id=1024004.1039898},
-    volume = {26},
-    year = {2005}
-}
-
-@InProceedings{Zhang+al-2007,
-    author = {D. Zhang and Z. H. Zhou and S. Chen},
-    title = {Semi-supervised dimensionality reduction},
-    booktitle = {Proceedings of the 7th SIAM International Conference on Data Mining},
-    address = {Minneapolis, MN},
-    year = 2007,
-}
-
-@article{Zhao+al-2006,
-    author = {Haitao Zhao and Shaoyuan Sun and Zhongliang Jing and Jingyu Yang},
-    title = {Local structure based supervised feature extraction},
-    journal = {Pattern Recognition},
-    volume = {39},
-    number = {8},
-    year = {2006},
-    issn = {0031-3203},
-    pages = {1546--1550},
-    doi = {http://dx.doi.org/10.1016/j.patcog.2006.02.023},
-    publisher = {Elsevier Science Inc.},
-    address = {New York, NY, USA},
-}
-
-@InProceedings{Zhou+al-2004,
-  author =       "D. Zhou and O. Bousquet and T. {Navin Lal} and J.
-                 Weston and B. Sch{\"o}lkopf",
-  editor =       NIPS16ed,
-  booktitle =    NIPS16,
-  title =        "Learning with local and global consistency",
-  publisher =    "MIT Press",
-  address =      "Cambridge, MA",
-  pages =        "321--328",
-  year =         "2004",
-  keywords =     "semi-supervised learning, manifold, kernel methods",
-}
-
-@InProceedings{Zhou+Dapkus-1995,
-  author =       "J. Zhou and P. Dapkus",
-  booktitle =    "Proceedings of the Third Workshop on Very Large
-                 Corpora",
-  title =        "Automatic Suggestion of Significant Terms for a
-                 Predefined Topic",
-  address =      "Cambridge",
-  pages =        "131--147",
-  year =         "1995",
-}
-
-@InProceedings{Zhou+Tanner-1997,
-  author =       "Joe Zhou and Troy Tanner",
-  booktitle =    "Proceedings of the fifth conference on Applied natural
-                 language processing",
-  title =        "Construction and visualization of key term
-                 hierarchies",
-  publisher =    "Morgan Kaufmann Publishers Inc.",
-  address =      "San Francisco, CA, USA",
-  pages =        "307--311",
-  year =         "1997",
-  location =     "Washington, DC",
-}
-
-@InProceedings{zhou2002,
-  author =       "Z.-H. Zhou and M.-L. Zhang",
-  booktitle =    "Proceedings of the International Conference on
-                 Intelligent Information Technology, 2002, pp.455-459",
-  title =        "Neural Networks for Multi-Instance Learning",
-  address =      "Beijing, China",
-  year =         "2002",
-  pages =        "455--459",
-}
-
-@InProceedings{ZhouX2007,
-  author =       "Xiaojin Zhu and Timothy J. Rogers and Ruichen Qian and
-                 Chuck Kalish",
-  booktitle =    "AAAI",
-  title =        "Humans Perform Semi-Supervised Classification Too.",
-  publisher =    "AAAI Press",
-  pages =        "864",
-  year =         "2007",
-  ISBN =         "978-1-57735-323-2",
-  URL =          "http://dblp.uni-trier.de/db/conf/aaai/aaai2007.html#ZhuRQK07",
-  date =         "2007-09-05",
-  description =  "dblp",
-  keywords =     "dblp",
-}
-
-@article{Zhu2009,
- author = {Long Zhu and Yuanhao Chen and Alan Yuille},
- title = {Unsupervised Learning of Probabilistic Grammar-Markov Models for Object Categories},
- journal = {{IEEE} Transactions on Pattern Analysis and Machine Intelligence},
- volume = 31,
- number = 1,
- pages = {114--128},
- year = 2009,
-}
-
-@InProceedings{Zhu+al-2003,
-  author =       "Xiaojin Zhu and Zoubin Ghahramani and John Lafferty",
-  booktitle =    ICML03,
-  editor =       ICML03ed,
-  publisher =    ICML03publ,
-  title =        "Semi-supervised learning using {Gaussian} fields and
-                 harmonic functions",
-  pages =        "912--919",
-  year =         "2003",
-}
-
-@TechReport{Zhu+al-TR2003,
-  author =       "Xiaojin Zhu and John Lafferty and Zoubin Ghahramani",
-  title =        "Semi-Supervised Learning: From {G}aussian Fields to
-                 {G}aussian Processes",
-  number =       "CMU-CS-03-175",
-  institution =  "CMU",
-  year =         "2003",
-}
-
-@Article{Zhu-2006,
-  author =       "M. Zhu and W. Su and H. A. Chipman",
-  title =        "{LAGO}: {A} computationally efficient approach for
-                 statistical detection",
-  journal =      "Technometrics",
-  volume =       "48",
-  number =       "2",
-  pages =        "193--205",
-  year =         "2006",
-}
-
-@InProceedings{Zhu-ijcai-2005,
-  author =       "Tingshao Zhu and Russ Greiner and Gerald Haeubl and
-                 Kevin Jewell and Bob Price",
-  booktitle =    "Nineteenth International Joint Conference on
-                 Artificial Intelligence (IJCAI-05)",
-  title =        "Using Learned Browsing Behavior Models to Recommend
-                 Relevant Web Pages",
-  address =      "Edinburgh, U.K.",
-  pages =        "1589--1591",
-  year =         "2005",
-}
-
-@TechReport{Zhu-Lafferty-Ghahramani-2003,
-  author =       "Xiaojin Zhu and John Lafferty and Zoubin Ghahramani",
-  title =        "Semi-supervised learning: from {G}aussian fields to
-                 {G}aussian processes",
-  number =       "CMU-CS-03-175",
-  institution =  "School of Computer Science, Carnegie Mellon
-                 University",
-  year =         "2003",
-}
-
-@Article{zhu-rohwer96,
-  author =       "H. Zhu and R. Rohwer",
-  title =        "No free lunch for cross validation",
-  journal =      "Neural Computation",
-  volume =       "8",
-  number =       "7",
-  pages =        "1421--1426",
-  year =         "1996",
-}
-
-@TechReport{zhu05survey,
-  author =       "Xiaojin Zhu",
-  title =        "Semi-Supervised Learning Literature Survey",
-  number =       "1530",
-  institution =  "Computer Science, University of Wisconsin-Madison",
-  year =         "2005",
-  note =         "http://www.cs.wisc.edu/$\sim$jerryzhu/pub/ssl\-survey.pdf",
-}
-
-@TechReport{ZhuX2002,
-  author =       "Xiaojin Zhu and Zoubin Ghahramani",
-  title =        "Towards semisupervised classification with Markov
-                 random fields",
-  institution =  "Carnegie Mellon University",
-  year =         "2002",
-}
-
-@inproceedings{Zinkevich-2003,
-  author = {Martin Zinkevich},
-  title ={Online convex programming and generalized infinitesimal gradient ascent},
-  booktitle =    ICML03,
-  editor =       ICML03ed,
-  publisher =    ICML03publ,
-  pages =        "928--936",
-  year =         "2003",
-}
-
-@InProceedings{Zoubin-nips8,
-  author =       "Z. Ghahramani and M. I. Jordan",
-  editor =       NIPS8ed,
-  booktitle =    NIPS8,
-  title =        "Factorial Hidden Markov Models",
-  publisher =    "MIT Press, Cambridge, MA",
-  year =         "1996",
-}
-
-@InProceedings{Zoubin-nips94,
-  author =       "Z. Ghahramani and M. I. Jordan",
-  editor =       NIPS6ed,
-  booktitle =    NIPS6,
-  title =        "Supervised learning from incomplete data via an {EM}
-                 approach",
-  publisher =    "Morgan Kaufmann",
-  address =      "San Mateo, CA",
-  year =         "1994",
-}
-
-@TechReport{Zoubin-tr93,
-  author =       "Z. Ghahramani and M. I. Jordan",
-  title =        "Function approximation via density estimation using
-                 the {E}{M} approach",
-  type =         "Computational Cognitive Science",
-  number =       "TR 9304",
-  institution =  "MIT",
-  year =         "1993",
-}
-
-@TechReport{Zoubin96,
-  author =       "Z. Ghahramani and G. E. Hinton",
-  title =        "Parameter estimation for linear dynamical systems",
-  number =       "Technical Report CRG-TR-91-1",
-  institution =  "University of Toronto",
-  year =         "1996",
-}
-
-@TechReport{Zoubin96b,
-  author =       "Z. Ghahramani and G. E. Hinton",
-  title =        "Switching state-space models",
-  number =       "Technical Report CRG-TR-91-3",
-  institution =  "University of Toronto",
-  year =         "1996",
-}
-
-@Article{Zue90a,
-  author =       "V. Zue and S. Seneff and J. Glass",
-  title =        "Speech database development: {TIMIT} and beyond",
-  journal =      spcomm,
-  volume =       "9",
-  number =       "4",
-  pages =        "351--356",
-  month =        aug,
-  year =         "1990",
-}
-
-@InProceedings{Zue90b,
-  author =       "V. Zue and J. Glass and D. Goddeau and D. Goodine and
-                 H. Leung and M. McCandless and M. Phillips and J.
-                 Polifroni and S. Seneff and D. Whitney",
-  booktitle =    "Proc. Int. Conf. Spoken Language Processing",
-  title =        "Recent progress on the {MIT} {VOYAGER} spoken language
-                 system",
-  address =      "Kobe, Japan",
-  pages =        "29.6.1",
-  year =         "1990",
-}
-
-@InProceedings{Zwald+al-2004,
-  author =       "Laurent Zwald and Olivier Bousquet and Gilles
-                 Blanchard",
-  editor =       "John Shawe-Taylor and Yoram Singer",
-  booktitle =    colt04,
-  title =        "Statistical Properties of Kernel Principal Component
-                 Analysis",
-  volume =       "3120",
-  publisher =    "Springer-Verlag",
-  pages =        "594--608",
-  year =         "2004",
-  series =       "Lecture Notes in Computer Science",
-}
-
-@InProceedings{Zweig+Russel-AAAI98,
-  author =       "G. Zweig and S. Russell",
-  booktitle =    "Proceedings of the AAAI Conference",
-  title =        "Speech Recognition with Dynamic {Bayesian} Networks",
-  publisher =    "AAAI Press",
-  address =      "Madison, Wisconsin",
-  year =         "1998",
-}
-
-@InProceedings{Zweig+Russel-ICSLP98,
-  author =       "G. Zweig and S. Russell",
-  booktitle =    "Proceedings of the International Conference on
-                 Spoken Language Processing",
-  title =        "Probabilistic Modeling with {Bayesian} Networks for
-                 {ASR}",
-  address =      "Sydney, Australia",
-  year =         "1998",
-}
-
-@Article{Zwicker+Terhardt80,
-  author =       "E. Zwicker and E. Terhardt",
-  title =        "Analytical expressions for critical band rate and
-                 critical bandwidths as a function of frequency",
-  journal =      jasa,
-  volume =       "68",
-  number =       "5",
-  pages =        "1523--1525",
-  year =         "1980",
-}
-
-@Proceedings{colt03,
-  editor =       "Bernhard Sch{\"o}lkopf and Manfred K. Warmuth",
-  booktitle =    colt03,
-  title =        "Computational Learning Theory and Kernel Machines,
-                 16th Annual Conference on Computational Learning Theory
-                 and 7th Kernel Workshop, {COLT}/Kernel 2003,
-                 Washington, {DC}, {USA}, August 24-27, 2003,
-                 Proceedings",
-  volume =       "2777",
-  publisher =    "Springer",
-  year =         "2003",
-  series =       "Lecture Notes in Computer Science",
-}
-
-@Proceedings{FOCS3,
-  booktitle =    "Proceedings of the Third Annual Symposium on Switching
-                 Circuit Theory and Logical Design",
-  title =        "Proceedings of the Third Annual Symposium on Switching
-                 Circuit Theory and Logical Design",
-  organization = "American Institute of Electrical Engineers",
-  address =      "Chicago, Illinois",
-  month =        "7--12" # oct,
-  year =         "1962",
-  crossrefonly = "1",
-  url =       "http://theory.lcs.mit.edu/~dmjones/FOCS/focs.bib",
-}
-
-@Book{TricksOfTheTrade,
-  editor =       "Genevieve Orr and Klaus-Robert M{\"u}ller",
-  booktitle =    "Neural networks: tricks of the trade",
-  title =        "Neural networks: tricks of the trade",
-  volume =       "1524",
-  publisher =    "Springer-Verlag Inc.",
-  address =      "New York, NY, USA",
-  pages =        "vi + 432",
-  year =         "1998",
-  ISBN =         "3-540-65311-2 (paperback)",
-  ISSN =         "0302-9743",
-  LCCN =         "QA76.87.N4913 1998",
-  bibdate =      "Sat Jan 9 14:35:31 1999",
-  series =       "Lecture Notes in Computer Science",
-  acknowledgement = ack-nhfb,
-  keywords =     "Neural networks (Computer science)",
-}
-
-@Article{Besag75pseudolikelihood,
-  author =       "Julian Besag",
-  title =        "Statistical analysis of non-lattice data",
-  journal =      "The Statistician",
-  volume =       "24",
-  number =       "3",
-  pages =        "179--195",
-  year =         "1975",
-}
-
-@INPROCEEDINGS{Marlin05unsupervisedlearning,
-    author = {Benjamin Marlin and Richard S. Zemel and Sam T. Roweis},
-    title = {Unsupervised learning with non-ignorable missing data},
-    booktitle = {Proceedings of the Tenth International Workshop on Artificial Intelligence and Statistics (AISTATS 2005)},
-    year = {2005},
-    pages = {222--229}
-}
-
-@PhdThesis{MarlinThesis08,
-  author = "Benjamin M. Marlin",
-  title =  "Missing Data Problems in Machine Learning",
-  school = "Dept. of Computer Science, University of Toronto",
-  year =   "2008"
-}
-
-@inproceedings{odonnellservedio08,
-author = "{O'Donnell}, R. and Servedio, R.",
-title = "The {Chow} parameters problem",
-booktitle = "Proceedings of the Fortieth Annual Symposium on Theory of 
-Computing (STOC)",
-year = 2008,
-pages = "517-526",
-}
-
-@article{bendaviddichterman98,
-author = "{Ben-David}, S. and Dichterman, E.",
-title = "Learning with restricted focus of attention",
-journal = "Journal of Computer and System Sciences",
-volume = 56,
-number = 3,
-year = 1998,
-pages = "277-298",
-}
-
-@techreport{cma07,
-author = "Canadian Medical Association",
-title = "Information technology and health care in Canada: 2007 status report",
-year = 2007,
-}
-
-@article{hanetal05,
-author = "Y. Han and J. Carcillo and S. Venkataraman and R. Clark and 
-R. Watson and T. Nguyen and H. Bayir and R. Orr",
-title = "Unexpected increased mortality after implementation 
-of a commercially sold computerized physician order entry system",
-journal = "Pediatrics",
-volume = "116",
-number = 6,
-pages = "1506-1512",
-year = 2005,
-}
-
-@InProceedings{conf/uai/McCallum03,
-  title =   "Efficiently Inducing Features of Conditional Random
-         Fields",
-  author =  "Andrew McCallum",
-  booktitle =  UAI03,
-  publisher =   "Morgan Kaufmann",
-  date = "August 7-10",
-  location = "Acapulco, Mexico",
-  year =    "2003",
-  editor =  "Christopher Meek and Uffe Kj{\ae}rulff",
-  ISBN =    "0-127-05664-5",
-  pages =   "403--410",
-}
-
-
-@InProceedings{conf/uai/McCallum03-small,
-  title =   "Efficiently Inducing Features of Conditional Random
-         Fields",
-  author =  "A. McCallum",
-  booktitle =   "UAI",
-  year =    "2003",
-}
-
-
-@InProceedings{conf/icml/RanzatoS08,
-  title =   "Semi-supervised learning of compact document
-         representations with deep networks",
-  author =  "Marc'Aurelio Ranzato and Martin Szummer",
-  booktitle = ICML08,
-  editor =  ICML08ed,
-  publisher = ICML08publ,
-  year =    "2008",
-  volume =  "307",
-  ISBN =    "978-1-60558-205-4",
-  pages =   "792--799",
-  series =  "ACM International Conference Proceeding Series",
-  date =    "June 5-9, 2008",
-  location = "Helsinki, Finland",
-  URL =     "http://doi.acm.org/10.1145/1390156.1390256",
-}
-
-@InProceedings{conf/icml/RanzatoS08-small,
-  title =   "Semi-supervised learning of compact document
-         representations with deep networks",
-  author =  "M. Ranzato and M. Szummer",
-  booktitle =   "ICML",
-  year =    "2008",
-}
-
-@PhdThesis{Cosatto02sample-basedtalking-head,
-    author = {Eric Cosatto and Prof Murat Kunt},
-    title = {Sample-Based Talking-Head Synthesis},
-    institution = {Signal Processing Lab, Swiss Federal Institute of Technology},
-    year = {2002}
-}
-
-@incollection{SutskeverHintonTaylor2009,
- title = {The Recurrent Temporal Restricted Boltzmann Machine},
- author = {Ilya Sutskever and Geoffrey E Hinton and Graham Taylor},
- editor = NIPS21ed,
- booktitle = NIPS21,
- pages = {1601--1608},
- year = {2009}
-}
-
-@TechReport{Bergstra+2009-small,
-  author =       "J. Bergstra and G. Desjardins and P. Lamblin and Y. Bengio",
-  title =        "Quadratic Polynomials Learn Better Image Features",
-  number =       "1337",
-  institution =  "DIRO, Universit\'e de Montr\'eal",
-  year =         "2009",
-}
-
-@inproceedings{Haffner+al-1998,
- author = {Haffner, P. and Bottou, L. and Howard, P. G. and Simard, P. and Bengio, Y. and {LeCun}, Y.},
- title = {Browsing through High Quality Document Images with {DjVu}},
- booktitle = {Proceedings of the Advances in Digital Libraries Conference (ADL'98)},
- year = {1998},
- isbn = {0-8186-8464-X},
- pages = {309},
- publisher = {IEEE Computer Society},
- address = {Washington, DC, USA},
- }
-
-@inproceedings{Bottou+Howard+Bengio-1998,
- author = {Bottou, L. and Howard, P. G. and Bengio, Y.},
- title = {The {Z}-Coder Adaptive Binary Coder},
- booktitle = {Proceedings of the Conference on Data Compression (DCC'98)},
- year = {1998},
- pages = {13},
- publisher = {IEEE Computer Society},
- address = {Washington, DC, USA},
- }
-
-@inproceedings{Pigeon+Bengio-1998,
-  author    = {Steven Pigeon and
-               Yoshua Bengio},
-  title     = {A Memory-Efficient Adaptive Huffman Coding Algorithm for
-               Very Large Sets of Symbols},
-  booktitle = {Proceedings of the Conference on Data Compression (DCC'98)},
-  year      = {1998},
-  pages     = {568},
-  ee        = {http://dlib.computer.org/conferen/dcc/8406/pdf/84060568.pdf},
-  bibsource = {DBLP, http://dblp.uni-trier.de}
-}
-
-@INPROCEEDINGS{LeCun+Bottou+Bengio-1997,
-title={Reading checks with multilayer graph transformer networks},
-author={Yann LeCun and Bottou, L. and Bengio, Y.},
-booktitle={IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP'97)},
-year={1997},
-month={Apr},
-volume={1},
-pages={151--154},
-keywords={backpropagation, banking, cheque processing, document image processing, image segmentation, optical character recognitionbusiness checks, business cheques, check reading system, cheque reading system, convolutional neural network character recognizers, gradient-based learning algorithms, graph-based stochastic models, machine learning paradigm, multilayer graph transformer networks, personal checks, personal cheques},
-doi={10.1109/ICASSP.1997.599580},
- }
-
-@INPROCEEDINGS{Rahim97discriminativefeature,
-    author = {Mazin Rahim and Yoshua Bengio and Yann {LeCun}},
-    title = {Discriminative Feature And Model Design For Automatic Speech Recognition},
-    booktitle = {Proc. of Eurospeech},
-    year = {1997},
-    pages = {75--78}
-}
-
-@InProceedings{Bengio-nncm-1996,
-author = {Yoshua Bengio},
-title = {Training A Neural Network with a Financial Criterion Rather than a Prediction Criterion},
-booktitle = {Proceedings of the Fourth International Conference on Neural Networks in the Capital Markets (NNCM-96)},
-editor = {A. S. Weigend and Y. S. Abu-Mostafa and A.-P. N. Refenes},
-publisher = {World Scientific},
-pages = {433--443},
-year = "1997",
-}
-
-@INPROCEEDINGS{Bengio+Bengio+Cloutier-1994,
-title={Use of genetic programming for the search of a new learning rule for neural networks},
-author={Bengio, S. and Bengio, Y. and Cloutier, J.},
-booktitle={Proceedings of the First IEEE Conference on Evolutionary Computation},
-year={1994},
-month={Jun},
-pages={324-327 vol.1},
-keywords={ backpropagation, genetic algorithms, learning (artificial intelligence), neural nets, optimisation, search problems backpropagation algorithm, classification tasks, genetic algorithms, genetic programming, gradient descent, learning rule, neural networks, optimization, parametric function, rule parameters, search, simulated annealing, standard optimization methods},
-doi={10.1109/ICEC.1994.349932},
-}
-
-@article{Chakraborty+al-2002,
- author = {Chakraborty, Basabi and Chakraborty, Goutam},
- title = {A new feature extraction technique for on-line recognition of handwritten alphanumeric characters},
- journal = {Inf. Sci. Appl.},
- volume = {148},
- number = {1-4},
- year = {2002},
- issn = {0020-0255},
- pages = {55--70},
- doi = {http://dx.doi.org/10.1016/S0020-0255(02)00276-1},
- publisher = {Elsevier Science Inc.},
- address = {New York, NY, USA},
- }
-
-
-@INPROCEEDINGS{LeCun+al-1993,
-title={On-Line handwriting recognition with neural networks: spatial representation versus temporal representation},
-author={{LeCun}, Y and Bengio, Y. and Henderson, D. and Weisbuch, A.},
-booktitle={Proceedings of the International Conference on Handwriting and Drawing},
-year={1993},
-location= {Ecole Nationale Superieure des Telecommunications},
-}
-
-@INPROCEEDINGS{Bengio+al-92,
-    author = {Yoshua Bengio and Samy Bengio and Jocelyn Cloutier and Jan Gecsei},
-    title = {On the Optimization of a Synaptic Learning Rule},
-    booktitle = {Conference on Optimality in Biological and Artificial Networks},
-    year = {1992}
-}
-
-@INPROCEEDINGS{Bengio+al-91,
-    author = {Yoshua Bengio and Samy Bengio and Jocelyn Cloutier and Jan Gecsei},
-    title = {Learning a Synaptic Learning Rule},
-    booktitle = ijcnn,
-    location = "Seattle, WA",
-    pages = "II-A969",
-    year = {1991}
-}
-
-@INPROCEEDINGS{Bengio91acomparative,
-    author = {Yoshua Bengio and Renato De Mori and Giovanni Flammia and Ralf Kompe},
-    title = {A Comparative Study On Hybrid Acoustic Phonetic Decoders Based On Artificial Neural Networks},
-    booktitle = {Proceeding of EuroSpeech},
-    location = {Genova, Italy},
-    year = {1991}
-}
-
-@inproceedings { lecun-01a,
-original =      "orig/lecun-01a.ps.gz",
-author = 	"{LeCun}, Y. and Bottou, L. and Bengio, Y. and Haffner, P.",
-title = 	"Gradient-Based Learning Applied to Document Recognition",
-booktitle =     "Intelligent Signal Processing",
-editor =        "Haykin, S. and Kosko, B.",
-pages =         "306-351",
-publisher =     "IEEE Press",
-note =          "chap. 9",
-year =		2001,
-}
-
-@InCollection{Hochreiter+al-2000,
-    abstract = {Introduction Recurrent networks (crossreference Chapter 12) can, in principle, use their feedback connections to store representations of recent input events in the form of activations. The most widely used algorithms for learning what to put in short-term memory, however, take too much time to be feasible or do not work well at all, especially when minimal time lags between inputs and corresponding teacher signals are long. Although theoretically fascinating, they do not provide clear practical advantages over, say, backprop in feedforward networks with limited time windows (see crossreference Chapters 11 and 12). With conventional ``algorithms based on the computation of the complete gradient'', such as ``Back-Propagation Through Time'' (BPTT, e.g., [22, 27, 26]) or ``Real-Time Recurrent Learning'' (RTRL, e.g., [21]) error signals ``flowing backwards in time'' tend to either (1) blow up or (2) vanish: the temporal evolution of the backpropagated error ex},
-    author = {Hochreiter, Sepp and Bengio, Yoshua and Frasconi, Paolo and Schmidhuber, Jurgen},
-    citeulike-article-id = {4450697},
-    citeulike-linkout-0 = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.24.7321},
-    keywords = {gradient-descent, long-term-dependencies, rnn},
-    posted-at = {2009-05-02 00:58:01},
-    priority = {2},
-    title = {Gradient Flow in Recurrent Nets: the Difficulty of Learning Long-Term Dependencies},
-    url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.24.7321},
-    booktitle =    "Field Guide to Dynamical Recurrent Networks",
-    editor = "J. Kolen and S. Kremer",
-    publisher = "IEEE Press",
-    year = "2000",
-}
-
-@INPROCEEDINGS{Lecun99objectrecognition,
-    author = {Yann {LeCun} and Patrick Haffner and Léon Bottou and Yoshua Bengio},
-    title = {Object Recognition with Gradient-Based Learning},
-    booktitle = {Shape, Contour and Grouping in Computer Vision},
-    year = {1999},
-    publisher = {Springer},
-    pages = {319--345},
-}
-
-
-% non-ref conference
-@MISC{snowbird_learn_conf,
-title = "Snowbirds papers",
-author = "many authors",
-howpublished = "Learning Conference, Snowbird",
-location = "Utah",
-year = "many",
-}
-
-@MISC{Collobert+Bengio-2001,
-title = "Magic Mix",
-author = "Collobert, R. and Bengio, Y.",
-year = "2002",
-howpublished = "Learning Conference, Snowbird",
-location = "Utah",
-}
-
-@MISC{Bengio+al-2001,
-title = "Learning a Distributed Representation for Statistical Language Modeling and Information Retrieval",
-author = "Yoshua Bengio and Pascal Vincent and Florence d'Alché-Buc",
-year = "2001",
-howpublished = "Learning Conference, Snowbird",
-location = "Utah",
-}
-
-@MISC{Bengio+Nadeau-2000,
-title = "About Realistic Comparisons Between Learning Algorithms",
-author = "Yoshua Bengio and C. Nadeau",
-year = "2000",
-howpublished = "Learning Conference, Snowbird",
-location = "Utah",
-}
-@MISC{Bengio-1999,
-title = "Learning from Structured High-Dimensional Data",
-author = "Yoshua Bengio",
-howpublished = "Meeting of the Mathematical Society of Canada",
-location = "Montreal, Canada",
-year = "1999",
-}
-
-@MISC{Bengio+al-1999,
-title = "Gradient-Based Learning of Hyper-Parameters",
-author = "Yoshua Bengio and S. Latendresse and Charles Dugas",
-year = "1999",
-howpublished = "Learning Conference, Snowbird",
-location = "Utah",
-}
-
-@MISC{Bengio+al-1999b,
-title = "Learning Algorithms for Sorting Compounds from Titration Curves",
-author = "Yoshua Bengio and J-J. Brault and F. Major and R. Neal and S. Pigeon",
-howpublished = "Symposium on New Perspectives for Computer-Aided Drug Design",
-location = "Montreal, Canada",
-year = "1999",
-}
-
-@MISC{Bengio+al-1998,
-title = "Stochastic learning of strategic equilibria for auctions",
-author = "Yoshua Bengio and S. Latendresse and Charles Dugas",
-howpublished = "Machines That Learn Conference, Snowbird",
-location = "Utah",
-year = "1998",
-}
-
-@MISC{Bengio+al-1997,
-title = "On the Clusterization of Probabilistic Transducers",
-author = "Bengio, Y. and Bengio, S. and Singer, Y. and Isabelle, J-F.",
-howpublished = "1997 Neural Networks for Computing Conference, Snowbird",
-location = "Utah",
-year = "1997",
-}
-
-% NOTE(review): author list is identical to Bengio+al-1997 and may be a
-% copy-paste leftover -- confirm against the original abstract.
-@MISC{Bengio-1995,
-title = "Fast High Capacity Classifiers",
-author = "Bengio, Y. and Bengio, S. and Singer, Y. and Isabelle, J-F.",
-howpublished = "1995 Neural Networks for Computing Conference, Snowbird",
-location = "Utah",
-year = "1995",
-}
-
-@MISC{Bengio+Frasconi-1994,
-title = "Réseaux de neurones Markoviens pour l'inférence grammaticale",
-author = "Bengio, Y. and Frasconi, P.",
-howpublished = "1994 ACFAS Conference, neural networks colloquium",
-location = "Montréal, Québec",
-year = "1994",
-}
-
-@MISC{Bengio+LeCun-1994,
-title = "Reconnaissance de mots manuscrits avec réseaux de neurones et modèles de Markov",
-author = "Bengio, Y. and {LeCun}, Y.",
-howpublished = "1994 ACFAS Conference, neural networks colloquium",
-location = "Montréal, Québec",
-year = "1994",
-}
-
-@MISC{Bengio+al-1994,
-title = "Optimisation d'une règle d'apprentissage pour réseaux de neurones artificiels",
-author = "Bengio, S. and Bengio, Y. and Cloutier, J. and Gecsei, J.",
-howpublished = "1994 ACFAS Conference, neural networks colloquium",
-location = "Montréal, Québec",
-year = "1994",
-}
-
-@MISC{Bengio+Frasconi-1994b,
-title = "An {EM} Algorithm for Target Propagation",
-author = "Bengio, Y. and Frasconi, P.",
-howpublished = "1994 Neural Networks for Computing Conference, Snowbird",
-location = "Utah",
-year = "1994",
-}
-
-% NOTE(review): howpublished previously said 1994; aligned with key and year
-% (1993) -- confirm the conference edition.
-@MISC{Bengio+al-1993,
-title = "The Problem of Learning Long-Term Dependencies in Recurrent Networks",
-author = "Bengio, Y. and Simard, P. and Frasconi, P.",
-howpublished = "1993 Neural Networks for Computing Conference, Snowbird",
-location = "Utah",
-year = "1993",
-}
-@MISC{Bengio-1992,
-title = "Representations Based on Articulatory Dynamics for Speech Recognition",
-author = "Bengio, Y.",
-howpublished = "1992 Neural Networks for Computing Conference, Snowbird",
-location = "Utah",
-year = "1992",
-}
-
-@MISC{Bengio+al-1991,
-title = "Learning a Synaptic Learning Rule",
-author = "Bengio, Y. and Bengio, S. and Cloutier, J.",
-howpublished = "1991 Neural Networks for Computing Conference, Snowbird",
-location = "Utah",
-year = "1991",
-}
-
-@MISC{Bengio+DeMori-1990,
-title = "Recurrent networks with Radial Basis Functions for speech recognition",
-author = "Bengio, Y. and De Mori, R.",
-howpublished = "1990 Neural Networks for Computing Conference, Snowbird",
-location = "Utah",
-year = "1990",
-}
-
-
-%%tech report
-@TechReport{Bardou+Bengio-TR2002,
-  author =       "O. Bardou and Yoshua Bengio",
-  title =        "Régularisation du prix des options : Stacking",
-  institution =  "Cahier Scientifique Cirano 2002s-44",
-  year =         "2002",
-}
-
-% NOTE(review): author was a copy of the previous entry (O. Bardou); changed to
-% match the citation key -- confirm against the CIRANO report.
-@TechReport{Dugas+Bengio-TR2002,
-  author =       "C. Dugas and Yoshua Bengio",
-  title =        "Étude du biais dans le prix des options",
-  institution =  "Cahier Scientifique Cirano 2002s-45",
-  year =         "2002",
-}
-
-@TechReport{Dugas+al-TR2002,
-  author =       "C. Dugas and Y. Bengio and F. Bélisle and C. Nadeau and R. Garcia",
-  title =        "Incorporating Second-Order Functional Knowledge for Better Option Pricing",
-  institution =  "Cahier Scientifique Cirano 2002s-46",
-  year =         "2002",
-}
-
-@TechReport{Bengio+al-TR2002,
-  author =       "Y. Bengio and V.-P. Lauzon and R. Ducharme",
-  title =        "Experiments on the Application of IOHMMs to Model Financial Returns Series",
-  institution =  "Cahier Scientifique Cirano 2002s-47",
-  year =         "2002",
-}
-
-@TechReport{Bengio+al-TR2002b,
-  author =       "Y. Bengio and R. Ducharme and O. Bardou and N. Chapados",
-  title =        "Valorisation d'options par optimisation du Sharpe Ratio",
-  institution =  "Cahier Scientifique Cirano 2002s-48",
-  year =         "2002",
-}
-
-@TechReport{Chapados+Bengio-TR2002,
-  author =       "N. Chapados and Y. Bengio",
-  title =        "Cost Functions and Model Combination for VaR-based Asset Allocation using
- Neural Networks",
-  institution =  "Cahier Scientifique Cirano 2002s-49",
-  year =         "2002",
-}
-
-@TechReport{Bengio+Dugas-TR2002,
-  author =       "Y. Bengio and C. Dugas",
-  title =        "Forecasting Non-Stationary Volatility with Hyper-Parameters",
-  institution =  "Cahier Scientifique Cirano 2002s-50",
-  year =         "2002",
-}
-
-@TechReport{Gingras+al-TR2002,
-  author =       "F. Gingras and Y. Bengio and C. Nadeau",
-  title =        "On Out-of-Sample Statistics for Time-Series",
-  institution =  "Cahier Scientifique Cirano 2002s-51",
-  year =         "2002",
-}
-
-@TechReport{Chapados+Bengio-TR2002b,
-  author =       "N. Chapados and Y. Bengio",
-  title =        "Input Decay : Simple and Effective Soft Variable Selection",
-  institution =  "Cahier Scientifique Cirano 2002s-52",
-  year =         "2002",
-}
-
-@TechReport{Ghosn+Bengio-TR2002,
-  author =       "J. Ghosn and Y. Bengio",
-  title =        "Multi-Task Learning For Option Pricing",
-  institution =  "Cahier Scientifique Cirano 2002s-53",
-  year =         "2002",
-}
-
-% NOTE(review): author was a copy of the Ghosn+Bengio entry; replaced with the
-% authors of IDIAP-RR-01-12 -- confirm against the report.
-@TechReport{Collobert+al-TR2001,
-  author =       "R. Collobert and S. Bengio and Y. Bengio",
-  title =        "A Parallel Mixture of {SVM}s for Very Large Scale Problems",
-  institution =  "IDIAP",
-  location =     "Switzerland",
-  number =       "IDIAP-RR-01-12",
-  year =         "2001",
-}
-
-% NOTE(review): location was "Switzerland", apparently copied from the IDIAP
-% entry; presumably the DIRO macro expands to the Montreal department -- confirm.
-@TechReport{Vincent+Bengio-TR2001,
-  author =       "Vincent, P. and Bengio, Y.",
-  title =        "K-Local Hyperplane and Convex Distance Nearest Neighbor Algorithms",
-  institution =  DIRO,
-  location =     "Montreal, Canada",
-  number =       "1197",
-  year =         "2001",
-}
-
-@TechReport{Chapados+al-TR2001,
-  author =       "Chapados, N. and Bengio, Y. and Vincent, P. and Ghosn, J. and Dugas, C. and Takeuchi, I. and Meng, L.",
-  title =        "Estimating Car Insurance Premia : a Case Study in High-Dimensional Data Inference",
-  institution =  DIRO,
-  number =       "1199",
-  year =         "2001",
-}
-
-% NOTE(review): author list was a copy of Chapados+al-TR2001; changed to match
-% the citation key -- confirm against the report.
-@TechReport{Bengio+Chapados-TR2001,
-  author =       "Bengio, Y. and Chapados, N.",
-  title =        "Extending Metric-Based Model Selection and Regularization in the Absence of Unlabeled Data",
-  institution =  DIRO,
-  number =       "1200",
-  year =         "2001",
-}
-
-@TechReport{Nadeau+Bengio-TR1999,
-  author =       "Nadeau, C. and Bengio, Y.",
-  title =        "Inference and the Generalization Error",
-  institution =  "Cahier Scientifique Cirano 99s-25",
-  year =         "1999",
-}
-
-@TechReport{Gingras+al-TR1999,
-  author =       "Gingras, F. and Bengio, Y. and Nadeau, C.",
-  title =        "On Out-of-Sample Statistics for Financial Time-Series",
-  institution =  "Centre de Recherches Mathématiques, Université de Montreal",
-  number =       "2585",
-  year =         "1999",
-}
-
-@TechReport{Bengio-1998-TR,
-  author =       "Bengio, Y.",
-  title =        "Using a financial training criterion rather than a prediction criterion",
-  institution =  "Cahier Scientifique Cirano 98s-21",
-  year =         "1998",
-}
-
-@TechReport{Bengio+DeMori-1990-TR,
-  author =       "Bengio, Y. and De Mori, R.",
-  title =        "Some connectionist models and their application to speech recognition",
-  institution =  "School of Computer Science, McGill University",
-  number =       "TR-SOCS-90-12",
-  year =         "1990",
-}
-
-@article{becker+hinton:1993,
-    author = {Becker, S. and Hinton, G. E.},
-    title=  {Learning Mixture Models of Spatial Coherence},
-    journal={Neural Computation},
-    volume={5},
-    pages={267--277},
-    year={1993}
-}
-@article{berkes:2005,
-    author = {Berkes, Pietro and Wiskott, Laurenz},
-    title = {Slow Feature Analysis Yields a Rich Repertoire of Complex Cell Properties},
-    journal = {Journal of Vision},
-    ISSN = {1534-7362},
-    volume = {5},
-    number = {6},
-    pages = {579--602},
-    year = {2005},
-    month = jul,
-    URL = {http://journalofvision.org/5/6/9/},
-    eprint = {http://journalofvision.org/5/6/9/Berkes-2005-jov-5-6-9.pdf},
-}
-@inproceedings{hurri+hyvarinen:2003,
-    author={Hurri, J. and Hyv{\"a}rinen, A.},
-    title={Temporal Coherence, Natural Image Sequences, and the Visual Cortex.},
-    booktitle={Advances in Neural Information Processing Systems 15
-        ({NIPS*02})},
-    year={2003},
-    pages={141--148},
-}
-@article{wiskott:2002,
-    author =       "Laurenz Wiskott and Terrence Sejnowski",
-    year =         "2002",
-    title = {Slow Feature Analysis: Unsupervised Learning of Invariances},
-    journal =      "Neural Computation",
-    volume =       "14",
-    number =       "4",
-    pages =        "715--770",
-    url= {http://itb.biologie.hu-berlin.de/~wiskott/Publications/WisSej2002-LearningInvariances-NC.ps.gz},
-}
-
-@article{KouhPoggio2008,
-    author={Minjoon M. Kouh and Tomaso T. Poggio},
-    title={A Canonical Neural Circuit for Cortical Nonlinear Operations},
-    journal={Neural Computation},
-    volume={20},
-    number={6},
-    year={2008},
-    pages={1427--1451},
-}
-@article{NykampRingach2002,
-    author={D. Q. Nykamp and D. L. Ringach},
-    title ={Full Identification of a Linear-Nonlinear System via Cross-Correlation Analysis},
-    journal = {Journal of Vision},
-    volume={2},
-    pages={1--11},
-    year={2002},
-}
-@incollection{cadieu+olshausen:2009,
-     title = {Learning Transformational Invariants from Natural Movies},
-      author = {Charles Cadieu and Bruno Olshausen},
-       booktitle = {Advances in Neural Information Processing Systems 21},
-        editor = {D. Koller and D. Schuurmans and Y. Bengio and L. Bottou},
-         pages = {209--216},
-          year = {2009},
-     publisher = {MIT Press}
-}
-@book{DayanAbbott2001,
-    author={Peter Dayan and L. F. Abbott},
-    title = {Theoretical Neuroscience},
-    publisher = {The {MIT} Press},
-    year = 2001,
-}
-
-@inproceedings{Chechik-MIR2008,
- author = {G. Chechik and E. Ie and M. Rehn and S. Bengio and D. Lyon},
- title = {Large-scale content-based audio retrieval from text queries},
- booktitle = {ACM International Conference on Multimedia Information Retrieval (MIR'08)},
- year = 2008,
-}
-
-@inproceedings{Bai-ECIR2009,
- author = {B. Bai and J. Weston and R. Collobert and D. Grangier},
- title = {Supervised Semantic Indexing},
- booktitle = { European Conference on Information Retrieval (ECIR'09)},
- year = 2009,
-}
-
-@article{Attwell+Laughlin-2001,
- author = {David Attwell and Simon B. Laughlin},
- title = {An energy budget for signaling in the grey matter of the brain},
- journal = {Journal of Cerebral Blood Flow And Metabolism},
- year =2001,
- volume = 21,
- pages = {1133--1145},
-}
-
-@article{Lennie-2003,
- author = {Peter Lennie},
- title = {The cost of cortical computation},
- journal = {Current Biology},
- year = 2003,
- month = {Mar 18},
- volume = {13},
- number = 6,
- pages = {493--497},
-}
-
-@inproceedings{LowdD2005,
- author = {Lowd, Daniel and Domingos, Pedro},
- title = {Naive Bayes models for probability estimation},
- booktitle = ICML05,
- editor = ICML05ed,
- year = {2005},
- pages = {529--536},
- location = {Bonn, Germany},
- publisher = ICML05publ,
- address = {New York, NY, USA},
- }
-
-@incollection{NairV2009,
- title = {Implicit Mixtures of Restricted Boltzmann Machines},
- author = {Vinod Nair and Geoffrey E Hinton},
- booktitle = NIPS21,
- editor = NIPS21ed,
- publisher = NIPS21publ,
- pages = {1145--1152},
- year = {2009}
-}
-
-@incollection{Goodfellow2009,
- title = {Measuring Invariances in Deep Networks},
- author = {Ian Goodfellow and Quoc Le and Andrew Saxe and Andrew Ng},
- booktitle = NIPS22,
- editor = NIPS22ed,
- pages = {646--654},
- year = {2009}
-}
-
-@incollection{Xiao2009,
- title = {Dual Averaging Method for Regularized Stochastic Learning and Online Optimization},
- author = {Lin Xiao},
- booktitle = {Advances in Neural Information Processing Systems 22},
- editor = {Y. Bengio and D. Schuurmans and J. Lafferty and C. K. I. Williams and A. Culotta},
- pages = {2116--2124},
- year = {2009}
-}
-
-@incollection{Kwok2009,
- title = {Accelerated Gradient Methods for Stochastic Optimization and Online Learning},
- author = {Chonghai Hu and James Kwok and Weike Pan},
- booktitle = {Advances in Neural Information Processing Systems 22},
- editor = {Y. Bengio and D. Schuurmans and J. Lafferty and C. K. I. Williams and A. Culotta},
- pages = {781--789},
- year = {2009}
-}
-
-@article{Nesterov83,
- author = {Nesterov, Yu.},
- title = {A method for unconstrained convex minimization problem with the rate of convergence {$O(1/k^2)$}},
- journal = {Doklady AN SSSR (translated as Soviet Math. Dokl.)},
- volume = 269,
- pages = {543--547},
- year = 1983,
-}
-
-@incollection{Bai2009,
- title = {Polynomial Semantic Indexing},
- author = {Bing Bai and Jason Weston and David Grangier and Ronan Collobert and Kunihiko Sadamasa and Yanjun Qi and Corinna Cortes and Mehryar Mohri},
- booktitle = {Advances in Neural Information Processing Systems 22},
- editor = {Y. Bengio and D. Schuurmans and J. Lafferty and C.K.I. Williams and A. Culotta},
- pages = {64--72},
- year = {2009}
-}
-
-@incollection{Chechik2009,
- title = {An Online Algorithm for Large Scale Image Similarity Learning},
- author = {Gal Chechik and Uri Shalit and Varun Sharma and Samy Bengio},
- booktitle = {Advances in Neural Information Processing Systems 22},
- editor = {Y. Bengio and D. Schuurmans and J. Lafferty and C. K. I. Williams and A. Culotta},
- pages = {306--314},
- year = {2009}
-}
-
-@incollection{Klampfl+Maass-2009,
- title = {Replacing supervised classification learning by Slow Feature Analysis in spiking neural networks},
- author = {Stefan Klampfl and Wolfgang Maass},
- booktitle = NIPS22,
- editor = NIPS22ed,
- pages = {988--996},
- year = {2009}
-}
-
-
-
-@Article{GrandvaletCanuBoucheron97,
-  author =       "Yves Grandvalet and Stéphane Canu and Stéphane Boucheron",
-  title =        "Noise Injection: Theoretical Prospects",
-  journal =      "Neural Computation",
-  volume =       "9",
-  number =       "5",
-  pages =        "1093--1108",
-  year =         "1997",
-}
-
-@Article{SietsmaDow91,
-  author =       "J. Sietsma and R. Dow",
-  title =        "Creating artificial neural networks that generalize",
-  journal =      "Neural Networks",
-  volume =       "4",
-  number =       "1",
-  pages =        "67--79",
-  year =         "1991",
-}
-
-@Article{HolmstromKoistinen92,
-  author =       "Lasse Holmström and Petri Koistinen",
-  title =        "Using additive noise in back-propagation training",
-  journal =      "{IEEE} Transactions on Neural Networks",
-  volume =       "3",
-  number =       "1",
-  pages =        "24--38",
-  year =         "1992",
-}
-
-@inproceedings{Baird90,
-    author = "H. Baird",
-    title = {Document image defect models},
-    year = 1990,
-    booktitle = "IAPR Workshop on Syntactic and Structural Pattern Recognition",
-    pages = "38--46",
-    address = "Murray Hill, NJ."
-}
-
-@TechReport{Poggio+Vetter92,
-  author =       "T. Poggio and T. Vetter",
-  title =        "Recognition and structure from one 2D model view: Observations on prototypes, object classes and symmetries",
-  number =       "A.I. Memo No. 1347",
-  institution =  "Artificial Intelligence Laboratory, Massachusetts Institute of Technology",
-  year =         "1992",
-}
-
-% NOTE(review): series and volume moved out of booktitle; the original said
-% "Vol 112", ICANN'96 is presumably LNCS 1112 -- confirm.
-@INPROCEEDINGS{Scholkopf96invariances,
-    author = {Bernhard Sch{\"o}lkopf and Chris Burges and Vladimir Vapnik},
-    title = {Incorporating Invariances in Support Vector Learning Machines},
-    booktitle = {Artificial Neural Networks -- {ICANN}'96},
-    series = {Lecture Notes in Computer Science},
-    volume = {1112},
-    year = {1996},
-    editor = {C. von der Malsburg and W. von Seelen and J. C. Vorbrüggen and B. Sendhoff},
-    pages = {47--52},
-    publisher = {Springer}
-}
-
-% NOTE(review): year was 2010; changed to 2009 to agree with the other NIPS22
-% entries in this file and with the citation key -- confirm.
-@inproceedings{Cho+Saul09,
- title = {Kernel Methods for Deep Learning},
- author = {Youngmin Cho and Lawrence Saul},
- booktitle = NIPS22,
- editor = NIPS22ed,
- pages = {342--350},
- year = {2009},
- publisher = {NIPS Foundation},
-}
-
-
-@InProceedings{Linsker89,
-  author =       "R. Linsker",
-  editor =       NIPS1ed,
-  booktitle =    NIPS1,
-  title =        "An application of the principle of maximum information 
-preservation to linear systems",
-  publisher =    NIPS1publ,
-  year =         "1989",
-}
-
-@Article{An96AddingNoise,
-  author =       "Guozhong An",
-  title =        "The effects of adding noise during backpropagation training on a generalization performance",
-  journal =      "Neural Computation",
-  volume =       "8",
-  number =       "3",
-  pages =        "643--674",
-  year =         "1996",
-}
-
-@article{DruckerLeCun92,
-	author = {Harris Drucker and Yann LeCun},
-	title = {Improving generalisation performance using double back-propagation.},
-	journal = {IEEE Transactions on Neural Networks},
-	number = {6},
-	pages = {991--997},
-	volume = {3},
-	year = {1992}
-}
-
-@Article{BellSejnowski-97,
-  author =       "A. Bell and T. J. Sejnowski",
-  title =        "The independent components of natural scenes are edge filters",
-  journal =      "Vision Research",
-  volume =       "37",
-  pages =        "3327--3338",
-  year =         "1997",
-}
-
-
-@Article{Dokur1997,
-  author =       {Z{\"u}mray Dokur and Tamer {\"O}lmez and Ertugrul Yazgan and Okan K. Ersoy},
-  title =        {Detection of {ECG} waveforms by neural networks},
-  journal =      {Medical Engineering \& Physics},
-  year =         {1997},
-  volume =    {19},
-  number =    {8},
-  pages =     {738--741},
-  month =     oct,
-}
-
-@Article{Hu1993,
-  author =       {Y. H. Hu and W. J. Tompkins and J. L. Urrusti and V. X. Afonso},
-  title =        {Applications of artificial neural networks for {ECG} signal detection and classification},
-  journal =      JEC,
-  year =         {1993},
-  volume =    {26s},
-  pages =     {66--73},
-}
-
-@Article{Unser1996,
-author = {M. Unser and A. Aldroubi},
-title = {A Review of Wavelets in Biomedical Applications},
-journal = {Proceedings of the {IEEE}},
-year = {1996},
-volume= {84},
-number= {4},
-pages = {626--638},
-month = {April},
-}
-
-@inproceedings{Povey+Woodland-2002,
- author = {D. Povey and P. C. Woodland},
- title = {Minimum error and {I}-smoothing for improved discriminative training},
- booktitle = {Proceedings of the International Conference on Acoustics,
-Speech, and Signal Processing (ICASSP'2002)},
- publisher = {IEEE},
- volume = 1,
- pages = {I-105--I-108},
- address = {Orlando, Florida, USA},
- year = 2002,
-}
-
-@incollection{Susskind2008,
- author = {Joshua M. Susskind and Geoffrey E. Hinton and Javier R. Movellan and Adam K. Anderson},
- title = {Generating Facial Expressions with Deep Belief Nets},
- editor = {V. Kordic},
- booktitle = {Affective Computing, Emotion Modelling, Synthesis and Recognition},
- publisher = {ARS Publishers},
- year = 2008,
- pages = {421--440},
-}
-
-@InCollection{Li2005,
-  author =       {Peng Li and Kap Luk Chan and Sheng Fu and S. M. Krishnan},
-  title =        {An Abnormal {ECG} Beat Detection Approach for Long-Term Monitoring of Heart Patients Based on Hybrid Kernel Machine Ensemble},
-  booktitle =    {Multiple Classifier Systems},
-  pages =     {346--355},
-  publisher = {Springer},
-  year =      {2005},
-  volume =    {3541},
-  series =    {Lecture Notes in Computer Science},
-  address =   {Berlin / Heidelberg},
-}
-
-@incollection {Hughes_NIPS2003,
-  author = " Nicholas P. Hughes and  Lionel Tarassenko and  Stephen J. Roberts",
-  title = " Markov Models for Automated {ECG} Interval Analysis",
-  booktitle = NIPS16,
-  editor = NIPS16ed,
-  publisher = NIPS16publ,
-  address = NIPS16addr,
-  year = "2004",
-  keywords = "hidden Markov models, Markov models, wavelets, segmentation, probabilistic models, biomedical signal processing, time series",
-  }
-
-@inproceedings{Salem2009,
- author = {Abdel-Badeeh M. Salem and Kenneth Revett and El-Sayed A. El-Dahshan},
- title = {Machine Learning in Electrocardiogram Diagnosis},
- booktitle = {Proceedings of the International Multiconference on Computer Science and Information Technology},
- volume = 4,
- pages = {429--433},
- year = 2009,
- publisher = {IEEE},
-}
-
-@book{Clifford2006,
- author = {G.D. Clifford and F. Azuaje and P.E. McSharry}, 
- title = {Advanced Methods and Tools for {ECG} Analysis},
- publisher = {Artech House Publishing},
- year = 2006,
-}
-
-% NOTE(review): the entry keyed Lin2010 further down describes the same paper;
-% consider keeping a single key.
-@inproceedings{Lin2009,
-  author = {Lin, Jessica and Li, Yuan},
-  title = {Finding Structural Similarity in Time Series Data Using Bag-of-Patterns Representation},
-  booktitle = {SSDBM 2009: Proceedings of the 21st International Conference on Scientific and Statistical Database Management},
-  year = {2009},
-  isbn = {978-3-642-02278-4},
-  pages = {461--477},
-  location = {New Orleans, LA, USA},
-  doi = {10.1007/978-3-642-02279-1_33},
-  publisher = {Springer-Verlag},
-  address = {Berlin, Heidelberg},
- }
-
-@article{Froese2006,
- author = {Froese, Tom and Hadjiloucas, Sillas and Galv{\~a}o, Roberto K. H. and Becerra, Victor M. and Coelho, Clarimar Jos{\'e}},
- title = {Comparison of extrasystolic {ECG} signal classifiers using discrete wavelet transforms},
- journal = {Pattern Recogn. Lett.},
- volume = {27},
- number = {5},
- year = {2006},
- issn = {0167-8655},
- pages = {393--407},
- doi = {10.1016/j.patrec.2005.09.002},
- publisher = {Elsevier Science Inc.},
- address = {New York, NY, USA},
- }
-
-@Article{Crowe1992,
-  author =   {J. A. Crowe and N. M. Gibson and M. S. Woolfson and M. G. Somekh},
-  title =    {Wavelet transform as a potential tool for {ECG} analysis and compression},
-  journal =  {Journal of Biomedical Engineering},
-  year =     {1992},
-  volume =   {14},
-  number =   {3},
-  pages =    {268--272},
-  month =    {May},
-}
-
-@ARTICLE{Hilton1997,
-    author = {Michael Hilton},
-    title = {Wavelet and Wavelet Packet Compression of Electrocardiograms},
-    journal = IEEE_trans_biomed,
-    year = {1997},
-    volume = {44},
-    pages = {394--402}
-}
-
-@Article{Li1995,
-  author =       {C. Li and C. Zheng and C. Tai},
-  title =        {Detection of {ECG} characteristic points using wavelet transforms},
-  journal =     IEEE_trans_biomed,
-  year =        {1995},
-  volume =    {42},
-  number =    {1},
-  pages =     {21--28},
-  month =     {January},
-}
-
-@article{Polat2007,
-title = {Detection of {ECG} Arrhythmia using a differential expert system approach based on principal component analysis and least square support vector machine},
-journal = {Applied Mathematics and Computation},
-volume = {186},
-number = {1},
-pages = {898--906},
-year = {2007},
-issn = {0096-3003},
-doi = {10.1016/j.amc.2006.08.020},
-url = {http://www.sciencedirect.com/science/article/B6TY8-4KXDWBF-5/2/a9e1d7e2dfc4c88935386ea04ca9cb94},
-author = {Kemal Polat and Salih G{\"u}nes},
-keywords = {ECG Arrhythmia, Principal component analysis (PCA), Least square support vector machine (LSSVM), ROC curves},
-}
-
-@article{Song2005,
-  author =       {Mi Hye Song and Jeon Lee and Sung Pil Cho and Kyoung Joung Lee and Sun Kook Yoo},
-  title =        {Support Vector Machine Based Arrhythmia Classification  
-Using Reduced Features},
-  journal =      IJCAS,
-  year =         {2005},
-  volume =    {3},
-  number =    {4},
-  pages =     {571--579},
-  month =     {December},
-}
-
-@article{Ubeyli2009,
- author = {Elif Derya {\"U}beyli},
- title = {Combining recurrent neural networks with eigenvector methods for classification of {ECG} beats},
- journal = DSP,
- volume = {19},
- number = {2},
- year = {2009},
- issn = {1051-2004},
- pages = {320--329},
- doi = {10.1016/j.dsp.2008.09.002},
- publisher = {Academic Press, Inc.},
- address = {Orlando, FL, USA},
- }
-
-@article{Ubeyli2007,
-  author =       {Elif Derya \"{U}beyli},
-  title =        {{ECG} beats classification using multiclass support vector machines with error correcting output codes},
-  journal =      DSP,
-  year =         {2007},
-  volume =    {17},
-  pages =     {675--684},
-}
-
-@Article{Soman2005,
-  author =    {T. Soman and P. O. Bobbie},
-  title =     {Classification of Arrhythmia Using Machine Learning Techniques},
-  journal =   {WSEAS Transactions on Computers},
-  year =      {2005},
-  volume =    {4},
-  number =    {6},
-  pages =     {548--552},
-  month =     {June},
-}
-
-@InProceedings{Chengwei2006,
-  author =       {Li Chengwei and Wang Shoubin and Xu Aijun and Peng Hui},
-  title =        {Clinical Diagnosis of Cardiac Disease Based on Support Vector Machine},
-  booktitle = {World Congress on Medical Physics and Biomedical Engineering},
-  pages =     {1273--1276},
-  year =      {2006},
-  editor =    {R. Magjarevic and J. H. Nagel},
-  volume =    {14},
-  series =    {IFMBE Proceedings},
-  publisher = {Springer Berlin Heidelberg},
-}
-
-@Article{Chiu2005,
-  author =       {Chuang-Chien Chiu and Tong-Hong Lin and Ben-Yi Liau},
-  title =        {Using correlation coefficient in {ECG} waveform for arrhythmia detection},
-  journal =      BME,
-  year =         {2005},
-  volume =    {17},
-  number =    {3},
-  pages =     {147--152},
-  month =     {June},
-}
-
-@Article{Silipo1998,
-  author =       {Rosaria Silipo and Carlo Marchesi},
-  title =        {Artificial Neural Networks for Automatic {ECG} Analysis},
-  journal =      IEEE_trans_SP,
-  year =         {1998},
-  volume =    {46},
-  number =    {5},
-  pages =     {1417--1425},
-  month =     {May},
-}
-
-@Article{Osowski2004,
-  author =       {Stanislaw Osowski and Linh Tran Hoai and Tomasz Markiewicz},
-  title =        {Support Vector Machine-Based Expert System for 
-Reliable Heartbeat Recognition},
-  journal =      IEEE_trans_biomed,
-  year =         {2004},
-  volume =    {51},
-  number =    {4},
-  pages =     {582--589},
-  month =     {April},
-}
-
-@article{PhysioNet,
- author = PhysioNetAuthors,
- title = "{PhysioBank, PhysioToolkit, and PhysioNet}: Components of a New
-	  Research Resource for Complex Physiologic Signals",
- journal = "Circulation",
- year = PhysioNetYear,
- volume = "101",
- number = "23",
- pages = "e215--e220",
- note = PhysioNetNote,
-}
-
-@article{Lin2007,
-    author = {Lin, Jessica and Keogh, Eamonn and Wei, Li and Lonardi, Stefano},
-    citeulike-article-id = {2821475},
-    citeulike-linkout-0 = {http://dblp.uni-trier.de/rec/bibtex/journals/datamine/LinKWL07},
-    citeulike-linkout-1 = {http://dx.doi.org/10.1007/s10618-007-0064-z},
-    citeulike-linkout-2 = {http://www.springerlink.com/content/g69808822l82t325},
-    day = {18},
-    doi = {10.1007/s10618-007-0064-z},
-    journal = DMKD,
-    keywords = {simulation},
-    month = oct,
-    number = {2},
-    pages = {107--144},
-    posted-at = {2008-05-21 23:56:04},
-    priority = {2},
-    title = {Experiencing {SAX}: a novel symbolic representation of time series},
-    url = {http://dx.doi.org/10.1007/s10618-007-0064-z},
-    volume = {15},
-    year = {2007}
-}
-
-% NOTE(review): same paper as the entry keyed Lin2009 above (published 2009);
-% key kept so existing citations still resolve.
-@inproceedings{Lin2010,
-  author = {Lin, Jessica and Li, Yuan},
-  title = {Finding Structural Similarity in Time Series Data Using Bag-of-Patterns Representation},
-  booktitle = SSDBM2009,
-  year = {2009},
-  isbn = {978-3-642-02278-4},
-  pages = {461--477},
-  location = {New Orleans, LA, USA},
-  doi = {10.1007/978-3-642-02279-1_33},
-  publisher = {Springer-Verlag},
-  address = {Berlin, Heidelberg},
- }
-
-@Article{Ham1996,
-  author =       {F. M. Ham and Soowhan Han},
-  title =        {Classification of cardiac arrhythmias using fuzzy ARTMAP},
-  journal =      IEEE_trans_biomed,
-  year =         {1996},
-  volume =    {43},
-  number =    {4},
-  pages =     {425--429},
-  month =     {April},
-}
-@article{Engin2004,
-  title = "{ECG} beat classification using neuro-fuzzy network",
-  journal = PRL,
-  volume = "25",
-  number = "15",
-  pages = "1715--1722",
-  year = "2004",
-  issn = "0167-8655",
-  doi = "10.1016/j.patrec.2004.06.014",
-  url = "http://www.sciencedirect.com/science/article/B6V15-4D0Y5TH-2/2/b83f364f61d79f96abeb1bc1b1898ab9",
-  author = "Mehmet Engin",
-  keywords = "ECG beat classification, MIT/BIH database, Neuro-fuzzy networks, Higher-order statistics, Wavelet transform, AR modelling, Pattern recognition"
-}
-
-@article{Turaga2010,
- author = {S. C. Turaga and J. F. Murray and V. Jain and F. Roth and M. Helmstaedter and K. Briggman and W. Denk and H. S. Seung}, 
- title = {Convolutional networks can learn to generate affinity graphs for image segmentation}, 
- journal = {Neural Computation}, 
- volume = 22, 
- pages = {511--538},
- year = 2010,
-}
-
-@article{Hahnloser-2003,
- author = {Richard H.R. Hahnloser and H. Sebastian Seung and J.J. Slotine},
- title = {Permitted and forbidden sets in symmetric threshold-linear networks},
- journal = {Neural Computation},
- volume = 15,
- pages = {621--638},
- year = 2003,
-}
-
-@techreport{Jenatton-2009,
- title={Structured Variable Selection with Sparsity-Inducing Norms},
- author={Jenatton, R. and Audibert, J.-Y. and Bach, F.},
- institution={arXiv:0904.3523},
- year={2009}
-}
-
-@ARTICLE{Erhan2010,
-    author = {Erhan, Dumitru and Bengio, Yoshua and Courville, Aaron and Manzagol, Pierre-Antoine and Vincent, Pascal and Bengio, Samy},
-     month = feb,
-     title = {Why Does Unsupervised Pre-training Help Deep Learning?},
-   journal = jmlr,
-    volume = {11},
-      year = {2010},
-     pages = {625--660},
-  abstract = {Much recent research has been devoted to learning algorithms for deep architectures such as Deep Belief Networks and stacks of auto-encoder variants, with impressive results obtained in several areas, mostly on vision and language datasets. The best results obtained on supervised learning tasks involve an unsupervised learning component, usually in an unsupervised pre-training phase. Even though these new algorithms have enabled training deep models, many questions remain as to the nature of this difficult learning problem. The main question investigated here is the following: why does unsupervised pre-training work and why does it work so well? Answering these questions is important if learning in deep architectures is to be further improved. We propose several explanatory hypotheses and test them through extensive simulations. We empirically show the influence of pre-training with respect to architecture depth, model capacity, and number of training examples. The experiments confirm and clarify the advantage of unsupervised pre-training. The results suggest that unsupervised pre-training guides the learning towards basins of attraction of minima that are better in terms of the underlying data distribution; the evidence from these results supports a regularization explanation for the effect of pre-training.}
-}
-
-@ARTICLE{Bengio2009FTML,
-    author = {Bengio, Yoshua},
-     title = {Learning deep architectures for {AI}},
-   journal = FTML,
-    volume = {2},
-    number = {1},
-      year = {2009},
-     pages = {1--127},
-      note = Bengio2009FTML_note,
-  abstract = {Theoretical results suggest that in order to learn the kind of
-complicated functions that can represent high-level abstractions (e.g. in
-vision, language, and other AI-level tasks), one may need {\insist deep
-architectures}. Deep architectures are composed of multiple levels of non-linear
-operations, such as in neural nets with many hidden layers or in complicated
-propositional formulae re-using many sub-formulae. Searching the
-parameter space of deep architectures is a difficult task, but
-learning algorithms such as those for Deep Belief Networks have recently been proposed
-to tackle this problem with notable success, beating the state-of-the-art
-in certain areas. This paper discusses the motivations and principles regarding 
-learning algorithms for deep architectures,  in particular those exploiting as
-building blocks unsupervised learning of single-layer models such as Restricted {Boltzmann} Machines,
-used to construct deeper models such as Deep Belief Networks.}
-}
-
-@ARTICLE{Bengio1994ITNN,
-    author = {Bengio, Yoshua and Simard, Patrice and Frasconi, Paolo},
-     title = {Learning Long-Term Dependencies with Gradient Descent is Difficult},
-   journal = IEEE_trans_NN,
-    volume = {5},
-    number = {2},
-      year = {1994},
-     pages = {157--166},
-  abstract = {Recurrent neural networks can be used to map input sequences to output sequences, such as for recognition, production or prediction problems. However, practical difficulties have been reported in training recurrent neural networks to perform tasks in which the temporal contingencies present in the input/output sequences span long intervals. We show why gradient based learning algorithms face an increasingly difficult problem as the duration of the dependencies to be captures increases. These results expose a trade-off between efficient learning by gradient descent and latching on information for long periods. Based on an understanding of this problem, alternatives to standard gradient descent are considered.},
-optnote={(Special Issue on Recurrent Neural Networks)},topics={LongTerm},cat={J},
-}
-
-@article{Kohler1992,
-    abstract = {The QRS complex is the most striking waveform within the electrocardiogram (ECG). Since it reflects the electrical activity within the heart during the ventricular contraction, the time of its occurrence as well as its shape provide much information about the current state of the heart. Due to its characteristic shape it serves as the basis for the automated determination of the heart rate, as an entry point for classification schemes of the cardiac cycle, and often it is also used in ECG data compression algorithms. In that sense, QRS detection provides the fundamentals for almost all automated ECG analysis algorithms. Software QRS detection has been a research topic for more than 30 years. The evolution of these algorithms clearly reflects the great advances in computer technology. Within the last decade many new approaches to QRS detection have been proposed; for example, algorithms from the field of artificial neural networks genetic algorithms wavelet transforms, filter banks as well as heuristic methods mostly based on nonlinear transforms. The authors provide an overview of these recent developments as well as of formerly proposed algorithms},
-    author = {Kohler, B. U. and Hennig, C. and Orglmeister, R.},
-    citeulike-article-id = {546409},
-    citeulike-linkout-0 = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=993193},
-    journal = eng_med_bio,
-    keywords = {detector, ecg\_processing, qrs, qt\_interval, review\_article, rr\_interval},
-    number = {1},
-    pages = {42--57},
-    posted-at = {2007-11-25 20:38:19},
-    priority = {2},
-    title = {The principles of software QRS detection},
-    url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=993193},
-    volume = {21},
-    year = {2002}
-}
-
-@article{Thomas2006,
-author = {Julien Thomas and Cedric Rose and Francois Charpillet},
-title = {A Multi-HMM Approach to ECG Segmentation},
-journal = ICTAI06, 
-volume = {0},
-issn = {1082-3409},
-year = {2006},
-pages = {609-616},
-doi = {http://doi.ieeecomputersociety.org/10.1109/ICTAI.2006.17},
-publisher = {IEEE Computer Society},
-address = {Los Alamitos, CA, USA},
-}
-
-@inproceedings{Cortes+al-2000,
- author = {Juan Carlos P\'{e}rez-Cortes and Rafael Llobet and Joaquim Arlandis},
- title = {Fast and Accurate Handwritten Character Recognition Using Approximate Nearest Neighbours Search on Large Databases},
- booktitle = {Proceedings of the Joint IAPR International Workshops on Advances in Pattern Recognition},
- year = {2000},
- isbn = {3-540-67946-4},
- pages = {767--776},
- publisher = {Springer-Verlag},
- address = {London, UK},
- }
-
-
-@Article{Oliveira+al-2002,
-  author =       "Oliveira, L.S.  and  Sabourin, R.  and  Bortolozzi, F.  and  Suen, C.Y.",
-  title =        "Automatic recognition of handwritten numerical strings: a recognition and verification strategy",
-  journal =      "IEEE Transactions on Pattern Analysis and Machine
-                 Intelligence",
-  volume =       "24",
-  number =       "11",
-  pages =        "1438-1454",
-  month =        nov,
-  year =         "2002",
-  doi  =         "10.1109/TPAMI.2002.1046154",
-  issn =         "0162-8828",
-}
-
-@inproceedings{SimardSP03,
-  author    = {Patrice Simard and
-               David Steinkraus and
-               John C. Platt},
-  title     = {Best Practices for Convolutional Neural Networks Applied
-               to Visual Document Analysis},
-  booktitle = {ICDAR},
-  year      = {2003},
-  pages     = {958-962},
-  ee        = {http://csdl.computer.org/comp/proceedings/icdar/2003/1960/02/196020958abs.htm},
-  crossref  = {DBLP:conf/icdar/2003},
-  bibsource = {DBLP, http://dblp.uni-trier.de}
-}
-
-@inproceedings{Milgram+al-2005,
-  author = {Milgram, J. and Cheriet, M. and Sabourin, R.},
-  title = {Estimating accurate multi-class probabilities with support vector machines},
-  booktitle = {Int. Joint Conf. on Neural Networks},
-  year = {2005},
-  pages = {906--1911},
-  location = {Montreal, Canada},
- }
-
-@proceedings{DBLP:conf/icdar/2003,
-  title     = {7th International Conference on Document Analysis and Recognition
-               (ICDAR 2003), 2-Volume Set, 3-6 August 2003, Edinburgh,
-               Scotland, UK},
-  booktitle = {ICDAR},
-  publisher = {IEEE Computer Society},
-  year      = {2003},
-  isbn      = {0-7695-1960-1},
-  bibsource = {DBLP, http://dblp.uni-trier.de}
-}
-
-
-@article{Granger+al-2007,
-    author = {Eric Granger and Robert Sabourin and Luiz S. Oliveira and Catolica Parana},
-    title = {Supervised Learning of Fuzzy ARTMAP Neural Networks Through Particle Swarm Optimization},
-    journal = {Journal of Pattern Recognition Research},
-    year = {2007},
-    volume = "2",
-    number = "1",
-    pages = "27-60",
-}
-
-@inproceedings{SnowEtAl2008,
-    author = {Snow, R. and O'Connor, B. and Jurafsky, D. and Ng, A.},
-    booktitle = {Proc. Empirical Methods in NLP},
-    pages = {254--263},
-    title = {Cheap and Fast -- But is it Good? Evaluating Non-Expert Annotations for Natural Language Tasks},
-    year = {2008}
-}
-
-@TECHREPORT{Garris94+al-1994,
-    author = {Michael D. Garris and James L. Blue and Gerald T. Candela and Gerald T. C and Darrin L. Dimmick and Jon Geist and Patrick J. Grother and Stanley A. Janet and Charles L. Wilson},
-    title = {NIST Form-Based Handprint Recognition System},
-    institution = {Technical Report NISTIR 5469 and CD-ROM, National Institute of Standards and Technology},
-    year = {1994},
-    doi = {10.1.1.45.1560},
-}
-
-@inproceedings{SorokinAndForsyth2008,
-    author = {Sorokin, A. and Forsyth, D.},
-    booktitle = {CVPR Workshops},
-    pages = {1--8},
-    title = {Utility data annotation with Amazon Mechanical Turk},
-    year = {2008}
-}
-
-@inproceedings{Grother-1995,
-        AUTHOR = "Grother, P.J.",
-        TITLE = "Handprinted Forms and Character Database, NIST Special Database 19",
-        BOOKTITLE = "National Institute of Standards and Technology (NIST) Intelligent Systems Division (NISTIR)",
-        YEAR = "1995",
-        BIBSOURCE = "http://www.visionbib.com/bibliography/char1015.html#TT105853"}
-}
-
-@inproceedings{ whitehill09,
- title = {Whose Vote Should Count More: Optimal Integration of Labels from Labelers of Unknown Expertise},
- author = {J. Whitehill and P. Ruvolo and T. Wu and J. Bergsma and J. Movellan},
- booktitle = {NIPS 22},
- pages = {2035--2043},
- year = 2009
-}
--- a/writeup/nips2010_submission.tex	Tue Jun 01 12:12:52 2010 -0400
+++ b/writeup/nips2010_submission.tex	Tue Jun 01 12:13:10 2010 -0400
@@ -85,10 +85,10 @@
 that are unlabeled and/or come from a distribution different from the target
 distribution, e.g., from other classes that those of interest. Whereas
 it has already been shown that deep learners can clearly take advantage of
-unsupervised learning and unlabeled examples~\citep{Bengio-2009,WestonJ2008}
+unsupervised learning and unlabeled examples~\citep{Bengio-2009,WestonJ2008-small}
 and multi-task learning, not much has been done yet to explore the impact
 of {\em out-of-distribution} examples and of the multi-task setting
-(but see~\citep{CollobertR2008-short}). In particular the {\em relative
+(but see~\citep{CollobertR2008}). In particular the {\em relative
 advantage} of deep learning for this settings has not been evaluated.
 
 In this paper we ask the following questions:
@@ -172,7 +172,7 @@
 \times complexity]$ and $c$ and $f$ $\sim U[-4 \times complexity, 4 \times
 complexity]$.\\
 {\bf Local Elastic Deformations.}
-This filter induces a "wiggly" effect in the image, following~\citet{SimardSP03},
+This filter induces a "wiggly" effect in the image, following~\citet{SimardSP03-short},
 which provides more details. 
 Two "displacements" fields are generated and applied, for horizontal
 and vertical displacements of pixels. 
@@ -612,9 +612,9 @@
 A Flash demo of the recognizer (where both the MLP and the SDA can be compared) 
 can be executed on-line at {\tt http://deep.host22.com}.
 
-
-{\small
-\bibliography{strings,ml,aigaion,specials}
+\newpage
+{
+\bibliography{strings,strings-short,strings-shorter,ift6266_ml,aigaion-shorter,specials}
 %\bibliographystyle{plainnat}
 \bibliographystyle{unsrtnat}
 %\bibliographystyle{apalike}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/writeup/strings-short.bib	Tue Jun 01 12:13:10 2010 -0400
@@ -0,0 +1,233 @@
+@String{cogsci = "Cognitive Science"}
+
+@String{AI06 = "AI 2006"}
+
+@String{JSM02="Proc. of JSM 2002"}
+
+@STRING{NIPS = "NIPS"}
+
+@STRING{NIPS1ed = "D.S.~Touretzky"}
+
+@String{nips87 = "NIPS'87"}
+
+@String{nips87ed = "D. Z. Anderson"}
+
+@STRING{NIPS1   = "Advances in NIPS'88"}
+
+@STRING{NIPS2ed = "D.S.~Touretzky"}
+
+@STRING{NIPS2   = "NIPS'89"}
+
+@STRING{NIPS3ed = "R.P.~Lippmann and J.M.~Moody and D.S.~Touretzky"}
+
+@STRING{NIPS3   = "NIPS'90"}
+
+@STRING{NIPS4ed = "J.E. Moody and S.J. Hanson and R.P. Lippmann"}
+
+@STRING{NIPS4   = "NIPS'91"}
+
+@STRING{NIPS5ed = "C.L.~Giles and S.J.~Hanson and J.D.~Cowan"}
+
+@STRING{NIPS5   = "NIPS'92"}
+
+@STRING{NIPS6ed = "J.D. Cowan and G. Tesauro and J. Alspector"}
+
+@STRING{NIPS6   = "NIPS'93"}
+
+@STRING{NIPS7ed = "G.~Tesauro and D.S.~Touretzky and T.K.~Leen"}
+
+@STRING{NIPS7   = "NIPS'94"}
+
+@STRING{NIPS8ed = "D.S.~Touretzky and M.C.~Mozer and M.E.~Hasselmo"}
+
+@STRING{NIPS8   = "NIPS'95"}
+
+@STRING{NIPS9ed = "M.C.~Mozer and M.I.~Jordan and T.~Petsche"}
+
+@STRING{NIPS9   = "NIPS'96"}
+
+@STRING{NIPS10ed = "M.I.~Jordan and M.J.~Kearns and S.A.~Solla"}
+
+@STRING{NIPS10  = "NIPS'97"}
+
+@STRING{NIPS11ed = "M.S.~Kearns and S.A.~Solla and D.A.~Cohn"}
+
+@STRING{NIPS11  = "NIPS'98"}
+
+@STRING{NIPS12ed = "S.A.~Solla and T.K.~Leen and K-R.~M{\"u}ller"}
+
+@STRING{NIPS12  = "NIPS'99"}
+
+@STRING{NIPS13ed = "T.K.~Leen and T.G.~Dietterich and V.~Tresp"}
+
+@STRING{NIPS13  = "NIPS'00"}
+
+@STRING{NIPS14ed = "T.G.~Dietterich and S.~Becker and Z.~Ghahramani"}
+
+@STRING{NIPS14  = "NIPS'01"}
+
+@STRING{NIPS15ed = "S.~Becker and S.~Thrun and K.~Obermayer"}
+
+@STRING{NIPS15  = "NIPS'02"}
+
+@STRING{NIPS16ed = ""}
+@string{NIPS16publ = {}}
+@STRING{NIPS16  = "NIPS'03"}
+@string{NIPS16addr = ""}
+
+@STRING{NIPS17ed = "L.K.~Saul and Y.~Weiss and L.~Bottou"}
+
+@STRING{NIPS17  = "NIPS'04"}
+
+@STRING{NIPS18 = "NIPS'05"}
+
+@STRING{NIPS18ed = "Y. Weiss and B. Sch{\"o}lkopf and J. Platt"}
+
+@STRING{NIPS19 = "NIPS'06"}
+
+@STRING{NIPS19ed = "B. Sch{\"o}lkopf and J. Platt and T. Hoffman"}
+
+@STRING{NIPS20 = "NIPS'07"}
+
+@STRING{NIPS20ed = "J.C. Platt and D. Koller and Y. Singer and S. Roweis"}
+
+@STRING{NIPS21 = "NIPS'08"}
+@STRING{NIPS22 = "NIPS'09"}
+
+@STRING{NIPS21ed = "D. Koller and D. Schuurmans and Y. Bengio and L. Bottou"} 
+
+@String{nips87 = "NIPS'87"}
+
+@String{nips89 = "NIPS'89"}
+
+@String{nips89eds = "D.S. Touretzky"}
+
+@String{nips90 = "NIPS'90"}
+
+@String{nips90eds = "D.S. Touretzky"}
+
+@String{nips91 = "NIPS'91"}
+
+@String{nips91eds = "R. P. Lippmann and R. Moody and D. S. Touretzky"}
+
+@String{nips92 = "NIPS'92"}
+
+@String{nips92eds = "Moody, J.E. and S.J. Hanson and R.P. Lippmann"}
+
+@String{nips93 = "NIPS'93"}
+
+@String{nips93eds = "S. J. Hanson and J. D. Cowan and C. L. Giles"}
+
+@String{nips94 = "NIPS'94"}
+
+@String{nips94eds = "J.D. Cowan and G. Tesauro and J. Alspector"}
+
+@String{nips95 = "NIPS'95"}
+
+@String{nips95eds = "G. Tesauro and D.S. Touretzky and T.K. Leen"}
+
+@String{nips96 = "NIPS'96"}
+
+@String{nips96eds = "M. Mozer and D.S. Touretzky and M. Perrone"}
+
+@String{nips97 = "NIPS'97"}
+ 
+@String{nips97eds = "M. Jordan and M. Mozer and T. Petsche"}
+ 
+@String{nips98 = "NIPS'98"}
+ 
+@String{nips98eds = "S. Solla and M. Jordan"}
+ 
+@String{nips2001 = "NIPS'01"}
+ 
+@String{nips2002 = "NIPS'02"}
+
+@String{nips2002eds = "T. G. Dietterich and S. Becker and Z. Ghahramani"}
+
+@String{nips2002publ = "MIT Press, Cambridge, {MA}"}
+
+@String{nips2003 = "NIPS'03"}
+
+@String{iapr =  "Proc. IAPR"}
+@String{ijprai =   "Int. J. Pattern Recognition and AI"}
+@String{jprr = "J. of Pat. Reco. Research"}
+
+
+@String{ICDAR03 =  "Proc. {ICDAR}'03"}
+@String{ICDAR07 =  "Proc. {ICDAR}'07"}
+
+@String{ICML96 = "Proc. {ICML} 1996"}
+@String{ICML97 = "Proc. {ICML} 1997"}
+@String{ICML98 = "Proc. {ICML} 1998"}
+@String{ICML99 = "Proc. {ICML} 1999"}
+@String{ICML00 = "Proc. {ICML} 2000"}
+@String{ICML01 = "Proc. {ICML} 2001"}
+@String{ICML02 = "Proc. {ICML} 2002"}
+@String{ICML03 = "Proc. {ICML} 2003"}
+@String{ICML04 = "Proc. {ICML} 2004"}
+@String{ICML05 = "Proc. {ICML} 2005"}
+@String{ICML06 = "Proc. {ICML} 2006"}
+@String{ICML07 = "Proc. {ICML} 2007"}
+@String{ICML08 = "Proc. {ICML} 2008"}
+@String{ICML09 = "Proc. {ICML} 2009"}
+@String{ICML96ed = ""}
+@String{ICML97ed = ""}
+@String{ICML98ed = ""}
+@String{ICML99ed = ""}
+@String{ICML00ed = ""}
+@String{ICML01ed = ""}
+@String{ICML02ed = ""}
+@String{ICML03ed = ""}
+@String{ICML04ed = ""}
+@String{ICML05ed = ""}
+@String{ICML06ed = ""}
+@String{ICML07ed = ""}
+@String{ICML08ed = ""}
+@String{ICML09ed = ""}
+@String{ICML96publ = ""}
+@String{ICML97publ = ""}
+@String{ICML98publ = ""}
+@String{ICML99publ = ""}
+@String{ICML00publ = ""}
+@String{ICML01publ = ""}
+@String{ICML02publ = ""}
+@String{ICML03publ = ""}
+@String{ICML04publ = ""}
+@String{ICML05publ = ""}
+@String{ICML06publ = ""}
+@String{ICML07publ = ""}
+@String{ICML08publ = ""}
+@String{ICML09publ = ""}
+
+@STRING{aistats05 = "Proc. AISTATS'2005"}
+@STRING{aistats07 = "Proc. AISTATS'2007"}
+@STRING{aistats09 = "Proc. AISTATS'2009"}
+@String{SVM02 = "SVM2002"}
+
+@String{cvpr83 =  "Proc. {CVPR}'83"}
+@String{cvpr96 =  "Proc. {CVPR}'96"}
+@String{cvpr97 =  "Proc. {CVPR}'97"}
+@String{cvpr99 =  "Proc. {CVPR}'99"}
+@String{cvpr04 =  "Proc. {CVPR}'04"}
+@String{cvpr05 =  "Proc. {CVPR}'05"}
+@String{cvpr06 =  "Proc. {CVPR}'06"}
+@String{cvpr07 =  "Proc. {CVPR}'07"}
+@String{cvpr08 =  "Proc. {CVPR}'08"}
+@String{cvpr09 =  "Proc. {CVPR}'09"}
+
+@string{IEEE_trans_biomed = "{IEEE} Trans. Biomed. Eng."}
+@string{IJCAS = "{IJCAS}"}
+@string{DSP = "Digital Signal Process."}
+@string{IEEE_trans_SP = "{IEEE} Trans. Signal Process."}
+@string{BME = "BME"}
+@string{SSDBM2009 = "{SSDBM} 2009"}
+@string{PRL = "Pattern Recognit. Lett."}
+@string{jmlr = "{JMLR}"}
+@string{FTML = "Found. Trends Mach. Learn."}
+@string{Bengio2009FTML_note = ""}
+@string{JEC = "J. Electrocardiol."}
+@string{DMKD = "Data Min. Knowl. Disc."}
+@string{IEEE_trans_NN = "IEEE Trans. Neural Networks"}
+@string{eng_med_bio = "IEEE Eng. Med. Biol. Mag."} 
+@string{ICTAI06 = "ICTAI'06"}
+@String{ieeetpami = "IEEE Trans. Pattern Analysis and Mach. Intelli."}
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/writeup/strings-shorter.bib	Tue Jun 01 12:13:10 2010 -0400
@@ -0,0 +1,110 @@
+@String{cogsci = "Cognitive Science"}
+
+@String{AI06 = "AI 2006"}
+
+@String{JSM02="Proc. of JSM 2002"}
+
+@STRING{NIPS = "NIPS"}
+
+@STRING{NIPS1ed = ""}
+
+@String{nips87 = "NIPS'87"}
+
+@String{nips87ed = ""}
+
+@STRING{NIPS1   = "Advances in NIPS'88"}
+
+@STRING{NIPS2ed = ""}
+
+@STRING{NIPS2   = "NIPS'89"}
+
+@STRING{NIPS3ed = ""}
+
+@STRING{NIPS3   = "NIPS'90"}
+
+@STRING{NIPS4ed = ""}
+
+@STRING{NIPS4   = "NIPS'91"}
+
+@STRING{NIPS5ed = ""}
+
+@STRING{NIPS5   = "NIPS'92"}
+
+@STRING{NIPS6ed = ""}
+
+@STRING{NIPS6   = "NIPS'93"}
+
+@STRING{NIPS7ed = ""}
+
+@STRING{NIPS7   = "NIPS'94"}
+
+@STRING{NIPS8ed = ""}
+
+@STRING{NIPS8   = "NIPS'95"}
+
+@STRING{NIPS9ed = ""}
+
+@STRING{NIPS9   = "NIPS'96"}
+
+@STRING{NIPS10ed = ""}
+
+@STRING{NIPS10  = "NIPS'97"}
+
+@STRING{NIPS11ed = ""}
+
+@STRING{NIPS11  = "NIPS'98"}
+
+@STRING{NIPS12ed = ""}
+@STRING{NIPS12  = "NIPS'99"}
+@STRING{NIPS13ed = ""}
+@STRING{NIPS13  = "NIPS'00"}
+@STRING{NIPS14ed = ""}
+@STRING{NIPS14 = "NIPS'01"}
+@STRING{NIPS15ed = ""}
+@STRING{NIPS15 = "NIPS'02"}
+@STRING{NIPS16ed = ""}
+@STRING{NIPS16 = "NIPS'03"}
+@string{NIPS16addr = ""}
+@STRING{NIPS17ed = ""}
+@STRING{NIPS17 = "NIPS'04"}
+@STRING{NIPS18ed = ""}
+@STRING{NIPS18 = "NIPS'05"}
+@STRING{NIPS19ed = ""}
+@STRING{NIPS19 = "NIPS'06"}
+@STRING{NIPS20ed = ""}
+@STRING{NIPS20 = "NIPS'07"}
+@STRING{NIPS21ed = ""}
+@STRING{NIPS21 = "NIPS'08"}
+@STRING{NIPS22ed = ""}
+@STRING{NIPS22 = "NIPS'09"}
+
+@String{ICDAR03 =  "Proc. {ICDAR}'03"}
+@String{ICDAR07 =  "Proc. {ICDAR}'07"}
+
+@String{ICML96 = "{ICML} 1996"}
+@String{ICML97 = "{ICML} 1997"}
+@String{ICML98 = "{ICML} 1998"}
+@String{ICML99 = "{ICML} 1999"}
+@String{ICML00 = "{ICML} 2000"}
+@String{ICML01 = "{ICML} 2001"}
+@String{ICML02 = "{ICML} 2002"}
+@String{ICML03 = "{ICML} 2003"}
+@String{ICML04 = "{ICML} 2004"}
+@String{ICML05 = "{ICML} 2005"}
+@String{ICML06 = "{ICML} 2006"}
+@String{ICML07 = "{ICML} 2007"}
+@String{ICML08 = "{ICML} 2008"}
+@String{ICML09 = "{ICML} 2009"}
+@string{icml09loc = {}}
+@STRING{aistats05 = "AISTATS'2005"}
+@STRING{aistats07 = "AISTATS'2007"}
+@STRING{aistats09 = "AISTATS'2009"}
+@String{SVM02 = "SVM2002"}
+@String{UAI09 = {UAI'09}}
+
+@String{iapr =  "IAPR"}
+@String{jprr = "JPRR"}
+
+@string{PhysioNetAuthors = {{Goldberger, A.L., et al.}}}
+@string{PhysioNetNote = ""}
+@string{PhysioNetYear = "2000"}
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/writeup/strings.bib	Tue Jun 01 12:13:10 2010 -0400
@@ -0,0 +1,477 @@
+@String{AAAI-85 = "Proceedings of the Fifth National Conference on
+                 Artificial Intelligence"}
+
+@String{AAAI-86 = "Proceedings of the Sixth National Conference on
+                 Artificial Intelligence"}
+
+@String{AAAI-87 = "Proceedings of the Seventh National Conference on
+                 Artificial Intelligence"}
+
+@String{AAAI-88 = "Proceedings of the Eighth National Conference on
+                 Artificial Intelligence"}
+
+@String{AAAI-89 = "Proceedings of the Ninth National Conference on
+                 Artificial Intelligence"}
+
+
+@String{AAAI-90 = "Proceedings of the Tenth National Conference on
+                 Artificial Intelligence"}
+
+@String{AAAI-91 = "Proceedings of the 11th National Conference on
+                 Artificial Intelligence"}
+
+@String{AAAI-92 = "Proceedings of the 12th National Conference on
+                 Artificial Intelligence"}
+
+@String{AAAI-93 = "Proceedings of the 13th National Conference on
+                 Artificial Intelligence"}
+
+@String{AAAI-94 = "Proceedings of the 14th National Conference on
+                 Artificial Intelligence"}
+
+@String{AI06 = "Advances in Artificial Intelligence, Proceedings of the 19th Conference of the Canadian Society for Computational Studies of Intelligence"}
+
+@String{acmtms = "ACM Transactions on Mathematical Software"}
+
+@STRING{aistats01 = "Proceedings of the Eighth International Workshop on Artificial Intelligence and Statistics (AISTATS'01)"}
+
+@STRING{aistats05 = "Proceedings of the Tenth International Workshop on Artificial Intelligence and Statistics (AISTATS'05)"}
+@STRING{aistats05ed = "Robert G. Cowell and Zoubin Ghahramani"}
+
+@STRING{aistats07 = "Proceedings of the Eleventh International Conference on Artificial Intelligence and Statistics (AISTATS'07)"}
+@STRING{aistats07-small = "Proceedings of AISTATS-2007"}
+
+@STRING{aistats09 = "Proceedings of The Twelfth International Conference on Artificial Intelligence and Statistics (AISTATS'09)"}
+
+@String{ams = "Ann. Math. Stat."}
+
+@String{annphys = "Annals of Physics"}
+
+@String{ans = "American Nuclear Society, Illinois, USA"}
+
+@String{applopt = "Applied Optics"}
+
+@String{JSM02="Proceedings of 2002 Joint Statistical Meetings"}
+
+@String{bbs =    "Behavioral and Brain Sciences"}
+
+@String{behbio = "Behavioral Biology"}
+
+@String{biocyb = "Biological Cybernetics"}
+
+@String{bmbiol = "Bulletin of Mathematical Biology"}
+
+@String{bmbiophys = "Bulletin of Mathematical Biophysics"}
+
+@String{brain =  "Brain"}
+
+@String{BYTE =   "BYTE"}
+
+@String{cmss88 = "Proceedings of the 1988 Connectionist Models Summer
+           School"}
+
+@String{cogsci = "Cognitive Science"}
+
+@String{colt94 = "Proceedings of the 7th International Conference on Computational Learning Theory (COLT'94)" }
+@String{colt95 = "Proceedings of the 8th International Conference on Computational Learning Theory (COLT'95)" }
+@String{colt98 = "Proceedings of the 11th International Conference on Computational Learning Theory (COLT'98)" }
+@String{colt99 = "Proceedings of the 12th International Conference on Computational Learning Theory (COLT'99)" }
+@String{colt03 = "Proceedings of the 16th International Conference on Computational Learning Theory (COLT'03)" }
+@String{colt04 = "Proceedings of the 17th International Conference on Computational Learning Theory (COLT'04)" }
+
+@String{computer = "Computer"}
+
+@String{connsci = "Connection Science"}
+
+@String{cpc =    "Computer Physics Communications"}
+
+@String{cs =     "Complex Systems"}
+
+@STRING{CSL = "Computers Speech and Language"}
+
+@String{cspla =  "Computer Speech and Language"}
+
+@String{cvgip =  "Computer Vision, Graphics, and Image Processing"}
+
+@String{cvpr83 =  "Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR'83)"}
+@String{cvpr96 =  "Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR'96)"}
+@String{cvpr97 =  "Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR'97)"}
+@String{cvpr99 =  "Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR'99)"}
+@String{cvpr04 =  "Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR'04)"}
+@String{cvpr05 =  "Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR'05)"}
+@String{cvpr06 =  "Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR'06)"}
+@String{cvpr07 =  "Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR'07)"}
+@String{cvpr08 =  "Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR'08)"}
+@String{cvpr09 =  "Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR'09)"}
+
+@String{DIRO= "D\'epartement d'informatique et recherche op\'erationnelle, Universit\'e de Montr\'eal"}
+
+@String{daed =   "D\ae dalus, Proceedings of the American Academy of
+                 Arts and Sciences"}
+
+@String{ECML97 = "Proceedings of the 9th European Conference on Machine Learning (ECML'97)"}
+
+@String{ECML94 = "Proceedings of the 7th European Conference on Machine Learning (ECML'94)"}
+
+@String{ECML02 = "Proceedings of the 13th European Conference on Machine Learning (ECML'02)"}
+
+@String{EEGCN =  "EEG and Clinical Neurophysiology"}
+
+@String{eul =    "Europhysics Letters"}
+
+@string{euro97 = {Proc. Eurospeech '97}}
+
+@string{euro97addr = {Rhodes, Greece}}
+
+@string{euro97month = sep}
+
+@String{febsl =  "FEBS Letters"}
+
+@String{iapr =  "Proceedings of the Joint International Workshops on Advances in Pattern Recognition (IAPR)"}
+
+@String{icassp =  "International Conference on Acoustics, Speech and Signal Processing (ICASSP)"}
+
+@String{ICCV99 =  "Proceedings {IEEE} International Conference on Computer Vision (ICCV'99)"}
+
+@String{ICCV05 =  "Proceedings {IEEE} International Conference on Computer Vision (ICCV'05)"}
+
+@String{ICCV07 =  "Proceedings of the 11th {IEEE} International Conference on Computer Vision (ICCV'07)"}
+
+@String{ICDAR95 =  "3rd International Conference on Document Analysis and Recognition (ICDAR'95)"}
+
+@String{ICDAR03 =  "International Conference on Document Analysis and Recognition (ICDAR'03)"}
+
+@String{ICDAR07 =  "International Conference on Document Analysis and Recognition (ICDAR'07)"}
+
+@String{ICIP07 = "2007 International Conference on Image Processing"}
+
+@String{icnn =   "IEEE International Conference on Neural Networks"}
+
+@STRING{icpr = "International Conference on Pattern Recognition"}
+@STRING{ICPR94 = "International Conference on Pattern Recognition (ICPR'94)"}
+
+@STRING{icslp = "International Conference on Speech and Language Processing"}
+
+@String{ieeeac = "IEEE Transactions on Automatic Control"}
+
+@String{ieeeassp = "IEEE ASSP Magazine"}
+
+@String{ieeeit = "IEEE Transactions on Information Theory"}
+
+@String{ieeesmc = "IEEE Transactions on Systems, Man, and Cybernetics"}
+
+@String{ieeetrnn = "IEEE Transactions on Neural Networks"}
+
+@String{ieeetrkde = "IEEE Transactions on Knowledge and Data
+                 Engineering"}
+
+@String{ieeetassp = "IEEE Transactions on Acoustics, Speech, and Signal
+                 Processing"}
+
+@String{ieeetc = "IEEE Transactions on Computers"}
+
+@String{ieeetcas = "IEEE Transactions on Circuits and Systems"}
+
+@String{ieeetcomm = "IEEE Transactions on Communications"}
+
+@String{ieeetec = "IEEE Transactions on Electronic Computers"}
+
+@String{ieeetpami = "IEEE Transactions on Pattern Analysis and Machine
+                 Intelligence"}
+
+@String{ieeeproc = "Proceedings of the IEEE"}
+
+%@STRING{ijcnn = "IEEE joint conference on neural networks"}
+
+@String{ijcnn =  "International Joint Conference on Neural Networks (IJCNN)"}
+
+@String{ijns =   "International Journal of Neural Systems"}
+
+@String{jama =   "Journal of Mathematical Analysis and Applications"}
+
+@String{jasa =   "Journal of the Acoustical Society of America"}
+
+@String{jcomp =  "Journal of Complexity"}
+
+@String{jcp =    "Journal of Chemical Physics"}
+
+@String{jmathb = "Journal of Mathematical Biology"}
+
+@String{jmlr = "Journal of Machine Learning Research"}
+
+@String{jmolecb = "Journal of Molecular Biology"}
+
+@String{jmp =    "Journal of Mathematical Physics"}
+
+@String{jmpsych = "Journal of Mathematical Psychology"}
+
+@String{jneuro = "Journal of Neuroscience"}
+
+@String{jprr = "Journal of Pattern Recognition Research"}
+
+@String{jpa =    "Journal of Physics A"}
+
+@String{jphysiol = "Journal of Physiology (London)"}
+
+@String{jpp =    "Journal de Physique (Paris)"}
+
+@String{jppl =   "Journal de Physique Lettres (Paris)"}
+
+@String{jtb =    "Journal of Theoretical Biology"}
+
+@String{kyb =    "Kybernetik"}
+
+@String{mbio =   "Mathematical Biosciences"}
+
+@String{mcss =   "Mathematics of Control, Signals, and Systems"}
+
+@String{mlearn = "Machine Learning"}
+
+@String{nature = "Nature"}
+
+@String{network = "Network"}
+
+@String{nc =     "Neural Computation"}
+
+@String{nipc_hmit96 = "Proceedings of the 1996 American Nuclear Society, 
+                      International Topical Meeting on Nuclear Plant 
+                      Instrumentation, Control and Human-Machine Interface 
+                      Technologies"}
+
+@STRING{NIPS = "Advances in Neural Information Processing Systems (NIPS)"}
+@String{nips87 = "Neural Information Processing Systems (NIPS)"}
+@String{nips87ed = "D. Z. Anderson"}
+
+@STRING{NIPS1ed = "D.S.~Touretzky"}
+@STRING{NIPS1publ = "Morgan Kaufmann"}
+@STRING{NIPS1   = "Advances in Neural Information Processing Systems 1 (NIPS'88)"}
+
+@STRING{NIPS2ed = "D.S.~Touretzky"}
+@STRING{NIPS2publ = "Morgan Kaufmann"}
+@STRING{NIPS2   = "Advances in Neural Information Processing Systems 2 (NIPS'89)"}
+
+@STRING{NIPS3ed = "R.P.~Lippmann and J.M.~Moody and D.S.~Touretzky"}
+@STRING{NIPS3publ = "Morgan Kaufmann"}
+@STRING{NIPS3   = "Advances in Neural Information Processing Systems 3 (NIPS'90)"}
+
+@STRING{NIPS4ed = "J.E. Moody and S.J. Hanson and R.P. Lippmann"}
+@STRING{NIPS4publ = "Morgan Kaufmann"}
+@STRING{NIPS4   = "Advances in Neural Information Processing Systems 4 (NIPS'91)"}
+
+@STRING{NIPS5ed = "C.L.~Giles and S.J.~Hanson and J.D.~Cowan"}
+%editor = {Cowan, Jack  D.  and Tesauro, Gerald   and Alspector, Joshua  },
+@STRING{NIPS5publ = "Morgan Kaufmann"}
+@STRING{NIPS5   = "Advances in Neural Information Processing Systems 5 (NIPS'92)"}
+
+@STRING{NIPS6ed = "J.D. Cowan and G. Tesauro and J. Alspector"}
+@STRING{NIPS6publ = "MIT Press"}
+@STRING{NIPS6   = "Advances in Neural Information Processing Systems 6 (NIPS'93)"}
+
+@STRING{NIPS7ed = "G.~Tesauro and D.S.~Touretzky and T.K.~Leen"}
+@STRING{NIPS7publ = "MIT Press"}
+@STRING{NIPS7   = "Advances in Neural Information Processing Systems 7 (NIPS'94)"}
+
+@STRING{NIPS8ed = "D.S.~Touretzky and M.C.~Mozer and M.E.~Hasselmo"}
+@STRING{NIPS8publ = "MIT Press"}
+@STRING{NIPS8   = "Advances in Neural Information Processing Systems 8 (NIPS'95)"}
+
+@STRING{NIPS9ed = "M.C.~Mozer and M.I.~Jordan and T.~Petsche"}
+@STRING{NIPS9publ = "MIT Press"}
+@STRING{NIPS9   = "Advances in Neural Information Processing Systems 9 (NIPS'96)"}
+
+@STRING{NIPS10ed = "M.I.~Jordan and M.J.~Kearns and S.A.~Solla"}
+@STRING{NIPS10publ = "MIT Press"}
+@STRING{NIPS10  = "Advances in Neural Information Processing Systems 10 (NIPS'97)"}
+
+@STRING{NIPS11ed = "M.S.~Kearns and S.A.~Solla and D.A.~Cohn"}
+@STRING{NIPS11publ = "MIT Press"}
+@STRING{NIPS11  = "Advances in Neural Information Processing Systems 11 (NIPS'98)"}
+
+@string{NIPS12ed   = {S.A.~Solla and T.K.~Leen and K-R.~M{\"u}ller}}
+@string{NIPS12publ = {MIT Press}}
+@string{NIPS12     = {Advances in Neural Information Processing Systems 12 (NIPS'99)}}
+
+@string{NIPS13ed   = {T.K.~Leen and T.G.~Dietterich and V.~Tresp}}
+@string{NIPS13publ = {MIT Press}}
+@string{NIPS13     = {Advances in Neural Information Processing Systems 13 (NIPS'00)}}
+
+@string{NIPS14ed   = {T.G.~Dietterich and S.~Becker and Z.~Ghahramani}}
+@string{NIPS14publ = {MIT Press}}
+@string{NIPS14     = {Advances in Neural Information Processing Systems 14 (NIPS'01)}}
+
+@string{NIPS15ed   = {S.~Becker and S.~Thrun and K.~Obermayer}}
+@string{NIPS15publ = {MIT Press}}
+@string{NIPS15     = {Advances in Neural Information Processing Systems 15 (NIPS'02)}}
+
+@string{NIPS16ed   = {S.~Thrun and L.~Saul and B.~Sch{\"o}lkopf}}
+@string{NIPS16publ = {MIT Press}}
+@string{NIPS16     = {Advances in Neural Information Processing Systems 16 (NIPS'03)}}
+@string{NIPS16addr = {Cambridge, MA}}
+
+@STRING{NIPS17ed = "L.K.~Saul and Y.~Weiss and L.~Bottou"}
+@STRING{NIPS17publ = "MIT Press"}
+@STRING{NIPS17  = "Advances in Neural Information Processing Systems 17 (NIPS'04)"}
+
+@string{NIPS18     = {Advances in Neural Information Processing Systems 18 (NIPS'05)}}
+@string{NIPS18ed   = {Y. Weiss and B. Sch{\"o}lkopf and J. Platt}}
+@string{NIPS18publ = {MIT Press}}
+
+@STRING{NIPS19 = "Advances in Neural Information Processing Systems 19 (NIPS'06)"}
+%%full name  editor =       "Bernhard Schölkopf and John Platt and Thomas Hofmann",
+@STRING{NIPS19ed = "B. Sch{\"o}lkopf and J. Platt and T. Hofmann"}
+@STRING{NIPS19publ = "MIT Press"}
+
+@string{NIPS20     = {Advances in Neural Information Processing Systems 20 (NIPS'07)}}
+@string{NIPS20ed   = {J.C. Platt and D. Koller and Y. Singer and S. Roweis}}
+@string{NIPS20publ = {MIT Press}}
+
+@STRING{NIPS21 = "Advances in Neural Information Processing Systems 21 (NIPS'08)"}
+@STRING{NIPS21ed = "Daphne Koller and Dale Schuurmans and Yoshua Bengio and L{\'e}on Bottou"}
+@STRING{NIPS21publ = ""}  % NOTE(review): publisher is empty here -- confirm and fill in
+
+@string{NIPS22     = {Advances in Neural Information Processing Systems 22 (NIPS'09)}}
+@string{NIPS22ed   = {Yoshua Bengio and Dale Schuurmans and Christopher Williams and John Lafferty and Aron Culotta}}
+@string{NIPS22publ = {}}
+
+@string{ijprai = {International Journal of Pattern Recognition and Artificial Intelligence}}
+
+@string{ICML96     = {Proceedings of the Thirteenth International Conference on Machine Learning (ICML'96)}}
+@string{ICML96ed   = {L. Saitta}}
+@string{ICML96publ = {Morgan Kaufmann}}
+
+@string{ICML97     = {Proceedings of the Fourteenth International Conference on Machine Learning (ICML'97)}}
+@string{ICML97ed   = {Douglas H. Fisher}}
+@string{ICML97publ = {Morgan Kaufmann}}
+
+@string{ICML98     = {Proceedings of the Fifteenth International Conference on Machine Learning (ICML'98)}}
+@string{ICML98ed   = {Jude W. Shavlik}}
+@string{ICML98publ = {Morgan Kaufmann}}
+
+@string{ICML99     = {Proceedings of the Sixteenth International Conference on Machine Learning (ICML'99)}}
+@string{ICML99ed   = {Ivan Bratko and Saso Dzeroski}}
+@string{ICML99publ = {Morgan Kaufmann}}
+
+@string{ICML01     = {Proceedings of the Eighteenth International Conference on Machine Learning (ICML'01)}}
+@string{ICML01ed   = {Carla E. Brodley and Andrea Pohoreckyj Danyluk}}
+@string{ICML01publ = {Morgan Kaufmann}}
+
+@string{ICML02     = {Proceedings of the Nineteenth International Conference on Machine Learning (ICML'02)}}
+@string{ICML02ed   = {Claude Sammut and Achim G. Hoffmann}}
+@string{ICML02publ = {Morgan Kaufmann}}
+
+@String{ICML03 = "Proceedings of the Twentieth International Conference on Machine Learning (ICML'03)"}
+@String{ICML03ed = {Tom Fawcett and Nina Mishra}}
+@String{ICML03publ = "AAAI Press"}
+
+@string{ICML04     = {Proceedings of the Twenty-first International Conference on Machine Learning (ICML'04)}}
+@string{ICML04ed   = {Carla E. Brodley}}
+@string{ICML04publ = {ACM}}
+
+@string{ICML05     = {Proceedings of the Twenty-second International Conference on Machine Learning (ICML'05)}}
+@string{ICML05ed   = {Luc De Raedt and Stefan Wrobel}}
+@string{ICML05publ = {ACM}}
+
+@String{ICML06 = "Proceedings of the Twenty-third International Conference on Machine Learning (ICML'06)"}
+@String{ICML06ed = {William W. Cohen and Andrew Moore}}
+@String{ICML06publ = "ACM"}
+
+@string{ICML07     = {Proceedings of the Twenty-fourth International Conference on Machine Learning (ICML'07)}}
+@string{ICML07ed   = {Zoubin Ghahramani}}
+@string{ICML07publ = {ACM}}
+
+@string{ICML08     = {Proceedings of the Twenty-fifth International Conference on Machine Learning (ICML'08)}}
+@string{ICML08ed   = {William W. Cohen and Andrew McCallum and Sam T. Roweis}}
+@string{ICML08publ = {ACM}}
+
+@String{ICML09 = "Proceedings of the Twenty-sixth International Conference on Machine Learning (ICML'09)"}
+@String{ICML09ed = {L{\'e}on Bottou and Michael Littman}}
+@String{ICML09publ = "ACM"}
+@string{icml09loc = {Montreal, Quebec, Canada}}
+
+@string{nipc-hmit96 = {The 1996 American Nuclear Society International Topical Meeting on Nuclear Plant Instrumentation, Control and Human Machine Interface Technologies}}
+
+@string{nn = {Neural Networks}}
+
+@string{nnsupp = {Neural Networks Supplement}}
+
+@string{opteng = {Optical Engineering}}
+
+@string{optlett = {Optics Letters}}
+
+@string{opres = {Operations Research}}
+
+@string{pdp = {Parallel Distributed Processing}}
+
+@string{percep = {Perception (London)}}
+
+@string{physicaA = {Physica A}}
+
+@string{physicaD = {Physica D}}
+
+@string{plettA = {Physics Letters A}}
+
+@string{PNAS = {Proceedings of the National Academy of Sciences, USA}}
+
+@string{prA = {Physical Review A}}
+
+@string{prB = {Physical Review B}}
+
+@string{prel = {Pattern Recognition Letters}}
+
+@string{prl = {Physical Review Letters}}
+
+@string{PRSLB = {Proceedings of the Royal Society of London B}}
+
+@string{pscrip = {Physica Scripta}}
+
+@string{psyrev = {Psychological Review}}
+
+@string{PTRSL = {Philosophical Transactions of the Royal Society of London B}}
+
+@string{qrb = {Quarterly Reviews of Biophysics}}
+
+@string{rmp = {Reviews of Modern Physics}}
+
+@string{SAML = {Skrifter for Anvendt Matematik og Lingvistik}}
+
+@string{sciam = {Scientific American}}
+
+@string{science = {Science}}
+
+@string{SIGLEX97 = {Proceedings of the ACL SIGLEX Workshop on Tagging Text with Lexical Semantics: Why, What, and How?}}
+
+@string{snowbird = {Neural Networks for Computing}}
+
+@string{spcomm = {Speech Communication}}
+@string{SVM02 = {Pattern Recognition with Support Vector Machines}}
+@string{tprobapp = {Theory of Probability and Its Applications}}
+
+@String{UAI00 = {Proceedings of the 16th Conference in Uncertainty in Artificial Intelligence (UAI'00)} }
+@String{UAI03 = {Proceedings of the 19th Conference in Uncertainty in Artificial Intelligence (UAI'03)} }
+@String{UAI05 = {Proceedings of the 21st Conference in Uncertainty in Artificial Intelligence (UAI'05)} }
+@String{UAI07 = {Proceedings of the 23rd Conference in Uncertainty in Artificial Intelligence (UAI'07)} }
+@String{UAI09 = {Proceedings of the 25th Conference in Uncertainty in Artificial Intelligence (UAI'09)} }
+
+@String{zpb =    "Zeitschrift f{\"u}r Physik B"}
+
+@string{PhysioNetAuthors = {Goldberger, A. L. and Amaral, L. A. N. and Glass, L. and
+	   Hausdorff, J. M. and Ivanov, P. Ch. and Mark, R. G. and
+	   Mietus, J. E. and Moody, G. B. and Peng, C.-K. and Stanley, H. E.}}
+@string{PhysioNetNote = {Circulation Electronic Pages:
+         http://circ.ahajournals.org/cgi/content/full/101/23/e215}}
+@string{PhysioNetYear = {2000 (June 13)}}
+
+@string{IEEE_trans_biomed = "{IEEE} Transactions on Bio-medical Engineering"}
+@string{IJCAS = "International Journal of Control, Automation, and Systems"}
+@string{DSP = "Digital Signal Processing"}
+@string{IEEE_trans_SP = "{IEEE} Transactions on Signal Processing"}
+@string{BME = "Biomedical Engineering: Applications, Basis \& Communications"}
+@string{SSDBM2009 = "{SSDBM} 2009: Proceedings of the 21st International Conference on Scientific and Statistical Database Management"}
+@string{PRL = "Pattern Recognition Letters"}  % NOTE(review): macro names are case-insensitive, so this overrides prl ("Physical Review Letters") for later entries; prel already holds this value -- confirm intent
+@string{FTML = "Foundations and Trends in Machine Learning"}
+@string{Bengio2009FTML_note = "Also published as a book. Now Publishers, 2009."}
+@string{JEC = "Journal of Electrocardiology"}
+@string{DMKD = "Data Mining and Knowledge Discovery"}
+@string{IEEE_trans_NN = "{IEEE} Transactions on Neural Networks"}
+@string{eng_med_bio = "Engineering in Medicine and Biology Magazine, {IEEE}"}
+@string{ICTAI06 = "{IEEE} International Conference on Tools with Artificial Intelligence"}
\ No newline at end of file