comparison writeup/aigaion-shorter.bib @ 583:ae77edb9df67

DIRO techreport, sent to arXiv
author Yoshua Bengio <bengioy@iro.umontreal.ca>
date Sat, 18 Sep 2010 16:44:46 -0400
parents 7ff00c27c976
children 51213beaed8b
comparison
equal deleted inserted replaced
582:9ebb335ca904 583:ae77edb9df67
407 author = {Bengio, Yoshua and Grandvalet, Yves}, 407 author = {Bengio, Yoshua and Grandvalet, Yves},
408 title = {No Unbiased Estimator of the Variance of K-Fold Cross-Validation}, 408 title = {No Unbiased Estimator of the Variance of K-Fold Cross-Validation},
409 volume = {5}, 409 volume = {5},
410 year = {2004}, 410 year = {2004},
411 pages = {1089--1105}, 411 pages = {1089--1105},
412 crossref = {JMLR-shorter}, 412 journal = {Journal of Machine Learning Research},
413 abstract = {Most machine learning researchers perform quantitative experiments to estimate generalization error and compare the performance of different algorithms (in particular, their proposed algorithm). In order to be able to draw statistically convincing conclusions, it is important to estimate the uncertainty of such estimates. This paper studies the very commonly used K-fold cross-validation estimator of generalization performance. The main theorem shows that there exists no universal (valid under all distributions) unbiased estimator of the variance of K-fold cross-validation. The analysis that accompanies this result is based on the eigen-decomposition of the covariance matrix of errors, which has only three different eigenvalues corresponding to three degrees of freedom of the matrix and three components of the total variance. This analysis helps to better understand the nature of the problem and how it can make naive estimators (that don’t take into account the error correlations due to the overlap between training and test sets) grossly underestimate variance. This is confirmed by numerical experiments in which the three components of the variance are compared when the difficulty of the learning problem and the number of folds are varied.}, 413 abstract = {Most machine learning researchers perform quantitative experiments to estimate generalization error and compare the performance of different algorithms (in particular, their proposed algorithm). In order to be able to draw statistically convincing conclusions, it is important to estimate the uncertainty of such estimates. This paper studies the very commonly used K-fold cross-validation estimator of generalization performance. The main theorem shows that there exists no universal (valid under all distributions) unbiased estimator of the variance of K-fold cross-validation. 
The analysis that accompanies this result is based on the eigen-decomposition of the covariance matrix of errors, which has only three different eigenvalues corresponding to three degrees of freedom of the matrix and three components of the total variance. This analysis helps to better understand the nature of the problem and how it can make naive estimators (that don’t take into account the error correlations due to the overlap between training and test sets) grossly underestimate variance. This is confirmed by numerical experiments in which the three components of the variance are compared when the difficulty of the learning problem and the number of folds are varied.},
414 topics={Comparative},cat={J}, 414 topics={Comparative},cat={J},
415 } 415 }
416 416
417 @TECHREPORT{bengio-hyper-TR99, 417 @TECHREPORT{bengio-hyper-TR99,
1087 author = {Bengio, Yoshua and Ducharme, R{\'{e}}jean and Vincent, Pascal and Jauvin, Christian}, 1087 author = {Bengio, Yoshua and Ducharme, R{\'{e}}jean and Vincent, Pascal and Jauvin, Christian},
1088 title = {A Neural Probabilistic Language Model}, 1088 title = {A Neural Probabilistic Language Model},
1089 volume = {3}, 1089 volume = {3},
1090 year = {2003}, 1090 year = {2003},
1091 pages = {1137--1155}, 1091 pages = {1137--1155},
1092 crossref = {JMLR-shorter}, 1092 journal = {Journal of Machine Learning Research},
1093 abstract = {A goal of statistical language modeling is to learn the joint probability function of sequences of words in a language. This is intrinsically difficult because of the curse of dimensionality: a word sequence on which the model will be tested is likely to be different from all the word sequences seen during training. Traditional but very successful approaches based on n-grams obtain generalization by concatenating very short overlapping sequences seen in the training set. We propose to fight the curse of dimensionality by learning a distributed representation for words which allows each training sentence to inform the model about an exponential number of semantically neighboring sentences. The model learns simultaneously (1) a distributed representation for each word along with (2) the probability function for word sequences, expressed in terms of these representations. Generalization is obtained because a sequence of words that has never been seen before gets high probability if it is made of words that are similar (in the sense of having a nearby representation) to words forming an already seen sentence. Training such large models (with millions of parameters) within a reasonable time is itself a significant challenge. We report on experiments using neural networks for the probability function, showing on two text corpora that the proposed approach significantly improves on state-of-the-art n-gram models, and that the proposed approach allows to take advantage of longer contexts.}, 1093 abstract = {A goal of statistical language modeling is to learn the joint probability function of sequences of words in a language. This is intrinsically difficult because of the curse of dimensionality: a word sequence on which the model will be tested is likely to be different from all the word sequences seen during training. 
Traditional but very successful approaches based on n-grams obtain generalization by concatenating very short overlapping sequences seen in the training set. We propose to fight the curse of dimensionality by learning a distributed representation for words which allows each training sentence to inform the model about an exponential number of semantically neighboring sentences. The model learns simultaneously (1) a distributed representation for each word along with (2) the probability function for word sequences, expressed in terms of these representations. Generalization is obtained because a sequence of words that has never been seen before gets high probability if it is made of words that are similar (in the sense of having a nearby representation) to words forming an already seen sentence. Training such large models (with millions of parameters) within a reasonable time is itself a significant challenge. We report on experiments using neural networks for the probability function, showing on two text corpora that the proposed approach significantly improves on state-of-the-art n-gram models, and that the proposed approach allows to take advantage of longer contexts.},
1094 topics={Markov,Unsupervised,Language},cat={J}, 1094 topics={Markov,Unsupervised,Language},cat={J},
1095 } 1095 }
1096 1096
1097 @TECHREPORT{bengio:socs-1990, 1097 @TECHREPORT{bengio:socs-1990,
1567 1567
1568 @ARTICLE{chapados:2003, 1568 @ARTICLE{chapados:2003,
1569 author = {Bengio, Yoshua and Chapados, Nicolas}, 1569 author = {Bengio, Yoshua and Chapados, Nicolas},
1570 title = {Extensions to Metric-Based Model Selection}, 1570 title = {Extensions to Metric-Based Model Selection},
1571 year = {2003}, 1571 year = {2003},
1572 crossref = {JMLR-shorter}, 1572 journal = {Journal of Machine Learning Research},
1573 abstract = {Metric-based methods have recently been introduced for model selection and regularization, often yielding very significant improvements over the alternatives tried (including cross-validation). All these methods require unlabeled data over which to compare functions and detect gross differences in behavior away from the training points. We introduce three new extensions of the metric model selection methods and apply them to feature selection. The first extension takes advantage of the particular case of time-series data in which the task involves prediction with a horizon h. The idea is to use at t the h unlabeled examples that precede t for model selection. The second extension takes advantage of the different error distributions of cross-validation and the metric methods: cross-validation tends to have a larger variance and is unbiased. A hybrid combining the two model selection methods is rarely beaten by any of the two methods. The third extension deals with the case when unlabeled data is not available at all, using an estimated input density. Experiments are described to study these extensions in the context of capacity control and feature subset selection.}, 1573 abstract = {Metric-based methods have recently been introduced for model selection and regularization, often yielding very significant improvements over the alternatives tried (including cross-validation). All these methods require unlabeled data over which to compare functions and detect gross differences in behavior away from the training points. We introduce three new extensions of the metric model selection methods and apply them to feature selection. The first extension takes advantage of the particular case of time-series data in which the task involves prediction with a horizon h. The idea is to use at t the h unlabeled examples that precede t for model selection. 
The second extension takes advantage of the different error distributions of cross-validation and the metric methods: cross-validation tends to have a larger variance and is unbiased. A hybrid combining the two model selection methods is rarely beaten by any of the two methods. The third extension deals with the case when unlabeled data is not available at all, using an estimated input density. Experiments are described to study these extensions in the context of capacity control and feature subset selection.},
1574 topics={ModelSelection,Finance},cat={J}, 1574 topics={ModelSelection,Finance},cat={J},
1575 } 1575 }
1576 1576
1577 @ARTICLE{chapelle:2001, 1577 @ARTICLE{chapelle:2001,
2253 author = {Erhan, Dumitru and Bengio, Yoshua and Courville, Aaron and Manzagol, Pierre-Antoine and Vincent, Pascal and Bengio, Samy}, 2253 author = {Erhan, Dumitru and Bengio, Yoshua and Courville, Aaron and Manzagol, Pierre-Antoine and Vincent, Pascal and Bengio, Samy},
2254 title = {Why Does Unsupervised Pre-training Help Deep Learning?}, 2254 title = {Why Does Unsupervised Pre-training Help Deep Learning?},
2255 volume = {11}, 2255 volume = {11},
2256 year = {2010}, 2256 year = {2010},
2257 pages = {625--660}, 2257 pages = {625--660},
2258 crossref = {JMLR-shorter}, 2258 journal = {Journal of Machine Learning Research},
2259 abstract = {Much recent research has been devoted to learning algorithms for deep architectures such as Deep Belief Networks and stacks of auto-encoder variants, with impressive results obtained in several areas, mostly on vision and language datasets. The best results obtained on supervised learning tasks involve an unsupervised learning component, usually in an unsupervised pre-training phase. Even though these new algorithms have enabled training deep models, many questions remain as to the nature of this difficult learning problem. The main question investigated here is the following: why does unsupervised pre-training work and why does it work so well? Answering these questions is important if learning in deep architectures is to be further improved. We propose several explanatory hypotheses and test them through extensive simulations. We empirically show the influence of pre-training with respect to architecture depth, model capacity, and number of training examples. The experiments confirm and clarify the advantage of unsupervised pre-training. The results suggest that unsupervised pre-training guides the learning towards basins of attraction of minima that are better in terms of the underlying data distribution; the evidence from these results supports a regularization explanation for the effect of pre-training.} 2259 abstract = {Much recent research has been devoted to learning algorithms for deep architectures such as Deep Belief Networks and stacks of auto-encoder variants, with impressive results obtained in several areas, mostly on vision and language datasets. The best results obtained on supervised learning tasks involve an unsupervised learning component, usually in an unsupervised pre-training phase. Even though these new algorithms have enabled training deep models, many questions remain as to the nature of this difficult learning problem. 
The main question investigated here is the following: why does unsupervised pre-training work and why does it work so well? Answering these questions is important if learning in deep architectures is to be further improved. We propose several explanatory hypotheses and test them through extensive simulations. We empirically show the influence of pre-training with respect to architecture depth, model capacity, and number of training examples. The experiments confirm and clarify the advantage of unsupervised pre-training. The results suggest that unsupervised pre-training guides the learning towards basins of attraction of minima that are better in terms of the underlying data distribution; the evidence from these results supports a regularization explanation for the effect of pre-training.}
2260 } 2260 }
2261 2261
2262 @INPROCEEDINGS{Erhan-aistats-2010, 2262 @INPROCEEDINGS{Erhan-aistats-2010,
2263 author = {Erhan, Dumitru and Courville, Aaron and Bengio, Yoshua and Vincent, Pascal}, 2263 author = {Erhan, Dumitru and Courville, Aaron and Bengio, Yoshua and Vincent, Pascal},
2750 author = {Bergstra, James and Bengio, Yoshua and Louradour, Jerome}, 2750 author = {Bergstra, James and Bengio, Yoshua and Louradour, Jerome},
2751 title = {Image Classification using Higher-Order Neural Models}, 2751 title = {Image Classification using Higher-Order Neural Models},
2752 year = {2008}, 2752 year = {2008},
2753 howpublished = {The Learning Workshop (Snowbird, Utah)}, 2753 howpublished = {The Learning Workshop (Snowbird, Utah)},
2754 url = {http://snowbird.djvuzone.org/2007/abstracts/161.pdf} 2754 url = {http://snowbird.djvuzone.org/2007/abstracts/161.pdf}
2755 }
2756
2757 @ARTICLE{JMLR-short,
2758 journal = {JMLR},
2759 year = {-1}
2760 } 2755 }
2761 2756
2762 2757
2763 @INPROCEEDINGS{Kegl+Bertin+Eck-2008, 2758 @INPROCEEDINGS{Kegl+Bertin+Eck-2008,
2764 author = {K{\'{e}}gl, Bal{\'{a}}zs and Bertin-Mahieux, Thierry and Eck, Douglas}, 2759 author = {K{\'{e}}gl, Bal{\'{a}}zs and Bertin-Mahieux, Thierry and Eck, Douglas},
2868 author = {Larochelle, Hugo and Bengio, Yoshua and Louradour, Jerome and Lamblin, Pascal}, 2863 author = {Larochelle, Hugo and Bengio, Yoshua and Louradour, Jerome and Lamblin, Pascal},
2869 title = {Exploring Strategies for Training Deep Neural Networks}, 2864 title = {Exploring Strategies for Training Deep Neural Networks},
2870 volume = {10}, 2865 volume = {10},
2871 year = {2009}, 2866 year = {2009},
2872 pages = {1--40}, 2867 pages = {1--40},
2873 crossref = {JMLR-shorter}, 2868 journal = {Journal of Machine Learning Research},
2874 abstract = {Deep multi-layer neural networks have many levels of non-linearities allowing them to compactly represent highly non-linear and highly-varying functions. However, until recently it was not clear how to train such deep networks, since gradient-based optimization starting from random initialization often appears to get stuck in poor solutions. Hinton et al. recently proposed a greedy layer-wise unsupervised learning procedure relying on the training algorithm of restricted {Boltzmann} machines (RBM) to initialize the parameters of a deep belief network (DBN), a generative model with many layers of hidden causal variables. This was followed by the proposal of another greedy layer-wise procedure, relying on the usage of autoassociator networks. In the context of the above optimization problem, we study these algorithms empirically to better understand their success. Our experiments confirm the hypothesis that the greedy layer-wise unsupervised training strategy helps the optimization by initializing weights in a region near a good local minimum, but also implicitly acts as a sort of regularization that brings better generalization and encourages internal distributed representations that are high-level abstractions of the input. We also present a series of experiments aimed at evaluating the link between the performance of deep neural networks and practical aspects of their topology, for example, demonstrating cases where the addition of more depth helps. Finally, we empirically explore simple variants of these training algorithms, such as the use of different RBM input unit distributions, a simple way of combining gradient estimators to improve performance, as well as on-line versions of those algorithms.} 2869 abstract = {Deep multi-layer neural networks have many levels of non-linearities allowing them to compactly represent highly non-linear and highly-varying functions. 
However, until recently it was not clear how to train such deep networks, since gradient-based optimization starting from random initialization often appears to get stuck in poor solutions. Hinton et al. recently proposed a greedy layer-wise unsupervised learning procedure relying on the training algorithm of restricted {Boltzmann} machines (RBM) to initialize the parameters of a deep belief network (DBN), a generative model with many layers of hidden causal variables. This was followed by the proposal of another greedy layer-wise procedure, relying on the usage of autoassociator networks. In the context of the above optimization problem, we study these algorithms empirically to better understand their success. Our experiments confirm the hypothesis that the greedy layer-wise unsupervised training strategy helps the optimization by initializing weights in a region near a good local minimum, but also implicitly acts as a sort of regularization that brings better generalization and encourages internal distributed representations that are high-level abstractions of the input. We also present a series of experiments aimed at evaluating the link between the performance of deep neural networks and practical aspects of their topology, for example, demonstrating cases where the addition of more depth helps. Finally, we empirically explore simple variants of these training algorithms, such as the use of different RBM input unit distributions, a simple way of combining gradient estimators to improve performance, as well as on-line versions of those algorithms.}
2875 } 2870 }
2876 2871
2877 @PHDTHESIS{Larochelle-PhD-2009, 2872 @PHDTHESIS{Larochelle-PhD-2009,
2878 author = {Larochelle, Hugo}, 2873 author = {Larochelle, Hugo},
3989 @ARTICLE{Sonnenburg+al-2007, 3984 @ARTICLE{Sonnenburg+al-2007,
3990 author = {Sonnenburg, Soeren and et al. and Vincent, Pascal}, 3985 author = {Sonnenburg, Soeren and et al. and Vincent, Pascal},
3991 title = {The Need for Open Source Software in Machine Learning.}, 3986 title = {The Need for Open Source Software in Machine Learning.},
3992 year = {2007}, 3987 year = {2007},
3993 note = {institution: Fraunhofer Publica [http://publica.fraunhofer.de/oai.har] (Germany)}, 3988 note = {institution: Fraunhofer Publica [http://publica.fraunhofer.de/oai.har] (Germany)},
3994 crossref = {JMLR-shorter}, 3989 journal = {Journal of Machine Learning Research},
3995 abstract = {all authors: Sonnenburg, S. and Braun, M.L. and Ong, C.S. and Bengio, S. and Bottou, L. and Holmes, G. and {LeCun}, Y. and M{\"{u}}ller, K.-R. and Pereira, F. and Rasmussen, C.E. and R{\"{a}}tsch, G. and Sch{\"{o}}lkopf, B. and Smola, A. and Vincent, P. and Weston, J. and Williamson, R.C. 3990 abstract = {all authors: Sonnenburg, S. and Braun, M.L. and Ong, C.S. and Bengio, S. and Bottou, L. and Holmes, G. and {LeCun}, Y. and M{\"{u}}ller, K.-R. and Pereira, F. and Rasmussen, C.E. and R{\"{a}}tsch, G. and Sch{\"{o}}lkopf, B. and Smola, A. and Vincent, P. and Weston, J. and Williamson, R.C.
3996 3991
3997 Open source tools have recently reached a level of maturity which makes them suitable for building large-scale real-world systems. At the same time, the field of machine learning has developed a large body of powerful learning algorithms for diverse applications. However, the true potential of these methods is not used, since existing implementations are not openly shared, resulting in software with low usability, and weak interoperability. We argue that this situation can be significantly improved by increasing incentives for researchers to publish their software under an open source model. Additionally, we outline the problems authors are faced with when trying to publish algorithmic implementations of machine learning methods. We believe that a resource of peer reviewed software accompanied by short articles would be highly valuable to both the machine learning and the general scientific community.} 3992 Open source tools have recently reached a level of maturity which makes them suitable for building large-scale real-world systems. At the same time, the field of machine learning has developed a large body of powerful learning algorithms for diverse applications. However, the true potential of these methods is not used, since existing implementations are not openly shared, resulting in software with low usability, and weak interoperability. We argue that this situation can be significantly improved by increasing incentives for researchers to publish their software under an open source model. Additionally, we outline the problems authors are faced with when trying to publish algorithmic implementations of machine learning methods. We believe that a resource of peer reviewed software accompanied by short articles would be highly valuable to both the machine learning and the general scientific community.}
3998 } 3993 }
3999 3994
4464 booktitle = {Advances in Neural Information Processing Systems 8 (NIPS'95)}, 4459 booktitle = {Advances in Neural Information Processing Systems 8 (NIPS'95)},
4465 year = {-1}, 4460 year = {-1},
4466 publisher = {MIT Press} 4461 publisher = {MIT Press}
4467 } 4462 }
4468 4463
4469 @ARTICLE{JMLR,
4470 journal = {Journal of Machine Learning Research},
4471 year = {-1}
4472 }
4473
4474 @INPROCEEDINGS{NIPS19, 4464 @INPROCEEDINGS{NIPS19,
4475 editor = {{Sch{\"{o}}lkopf}, Bernhard and Platt, John and Hoffman, Thomas}, 4465 editor = {{Sch{\"{o}}lkopf}, Bernhard and Platt, John and Hoffman, Thomas},
4476 title = {Advances in Neural Information Processing Systems 19 (NIPS'06)}, 4466 title = {Advances in Neural Information Processing Systems 19 (NIPS'06)},
4477 booktitle = {Advances in Neural Information Processing Systems 19 (NIPS'06)}, 4467 booktitle = {Advances in Neural Information Processing Systems 19 (NIPS'06)},
4478 year = {-1}, 4468 year = {-1},
4700 title = {ICML'99}, 4690 title = {ICML'99},
4701 booktitle = {ICML'99}, 4691 booktitle = {ICML'99},
4702 year = {-1}, 4692 year = {-1},
4703 publisher = {Morgan Kaufmann} 4693 publisher = {Morgan Kaufmann}
4704 } 4694 }
4705 @ARTICLE{JMLR-shorter,
4706 journal = {JMLR},
4707 year = {-1}
4708 }
4709 @INPROCEEDINGS{NIPS1-shorter, 4695 @INPROCEEDINGS{NIPS1-shorter,
4710 title = {NIPS'88}, 4696 title = {NIPS'88},
4711 booktitle = {NIPS 1}, 4697 booktitle = {NIPS 1},
4712 year = {-1}, 4698 year = {-1},
4713 publisher = {Morgan Kaufmann} 4699 publisher = {Morgan Kaufmann}
4838 @INPROCEEDINGS{xAISTATS2009-shorter, 4824 @INPROCEEDINGS{xAISTATS2009-shorter,
4839 title = {AISTATS'2009}, 4825 title = {AISTATS'2009},
4840 booktitle = {AISTATS'2009}, 4826 booktitle = {AISTATS'2009},
4841 year = {-1} 4827 year = {-1}
4842 } 4828 }
4829
4830