comparison writeup/aigaion-shorter.bib @ 604:51213beaed8b

draft of NIPS 2010 workshop camera-ready version
author Yoshua Bengio <bengioy@iro.umontreal.ca>
date Mon, 22 Nov 2010 14:52:33 -0500
parents ae77edb9df67
children
comparison
equal deleted inserted replaced
603:eb6244c6d861 604:51213beaed8b
1 %Aigaion2 BibTeX export from LISA - Publications 1 %Aigaion2 BibTeX export from LISA - Publications
2 %Tuesday 01 June 2010 10:46:52 AM 2 %Tuesday 02 November 2010 04:10:50 PM
3 @MASTERSTHESIS{,
4 author = {Breuleux, Olivier},
5 title = {{\'{E}}chantillonnage dynamique de champs markoviens},
6 year = {2010},
7 school = {Universit{\'{e}} de Montr{\'{e}}al}
8 }
9
10 @PHDTHESIS{,
11 author = {Rivest, Fran{\c c}ois},
12 title = {Mod{\`{e}}le informatique du coapprentissage des ganglions de la base et du cortex : L'apprentissage par renforcement et le d{\'{e}}veloppement de repr{\'{e}}sentations},
13 year = {2009},
14 school = {Universit{\'{e}} de Montr{\'{e}}al, D{\'{e}}partement d'informatique et de recherche op{\'{e}}rationnelle},
15 abstract = {English follow:
16
17 Tout au long de la vie, le cerveau d{\'{e}}veloppe des repr{\'{e}}sentations de son
18 environnement permettant {\`{a}} l’individu d’en tirer meilleur profit. Comment ces
19 repr{\'{e}}sentations se d{\'{e}}veloppent-elles pendant la qu{\^{e}}te de r{\'{e}}compenses demeure un
20 myst{\`{e}}re. Il est raisonnable de penser que le cortex est le si{\`{e}}ge de ces repr{\'{e}}sentations
21 et que les ganglions de la base jouent un r{\^{o}}le important dans la maximisation des
22 r{\'{e}}compenses. En particulier, les neurones dopaminergiques semblent coder un signal
23 d’erreur de pr{\'{e}}diction de r{\'{e}}compense. Cette th{\`{e}}se {\'{e}}tudie le probl{\`{e}}me en construisant,
24 {\`{a}} l’aide de l’apprentissage machine, un mod{\`{e}}le informatique int{\'{e}}grant de nombreuses
25 {\'{e}}vidences neurologiques.
26 Apr{\`{e}}s une introduction au cadre math{\'{e}}matique et {\`{a}} quelques algorithmes de
27 l’apprentissage machine, un survol de l’apprentissage en psychologie et en
28 neuroscience et une revue des mod{\`{e}}les de l’apprentissage dans les ganglions de la
29 base, la th{\`{e}}se comporte trois articles. Le premier montre qu’il est possible
30 d’apprendre {\`{a}} maximiser ses r{\'{e}}compenses tout en d{\'{e}}veloppant de meilleures
31 repr{\'{e}}sentations des entr{\'{e}}es. Le second article porte sur l'important probl{\`{e}}me toujours
32 non r{\'{e}}solu de la repr{\'{e}}sentation du temps. Il d{\'{e}}montre qu’une repr{\'{e}}sentation du temps
33 peut {\^{e}}tre acquise automatiquement dans un r{\'{e}}seau de neurones artificiels faisant
34 office de m{\'{e}}moire de travail. La repr{\'{e}}sentation d{\'{e}}velopp{\'{e}}e par le mod{\`{e}}le ressemble
35 beaucoup {\`{a}} l’activit{\'{e}} de neurones corticaux dans des t{\^{a}}ches similaires. De plus, le
36 mod{\`{e}}le montre que l’utilisation du signal d’erreur de r{\'{e}}compense peut acc{\'{e}}l{\'{e}}rer la
37 construction de ces repr{\'{e}}sentations temporelles. Finalement, il montre qu’une telle
38 repr{\'{e}}sentation acquise automatiquement dans le cortex peut fournir l’information
39 n{\'{e}}cessaire aux ganglions de la base pour expliquer le signal dopaminergique. Enfin,
40 le troisi{\`{e}}me article {\'{e}}value le pouvoir explicatif et pr{\'{e}}dictif du mod{\`{e}}le sur diff{\'{e}}rentes
41 situations comme la pr{\'{e}}sence ou l’absence d’un stimulus (conditionnement classique
42 ou de trace) pendant l’attente de la r{\'{e}}compense. En plus de faire des pr{\'{e}}dictions tr{\`{e}}s
43 int{\'{e}}ressantes en lien avec la litt{\'{e}}rature sur les intervalles de temps, l’article r{\'{e}}v{\`{e}}le
44 certaines lacunes du mod{\`{e}}le qui devront {\^{e}}tre am{\'{e}}lior{\'{e}}es.
45 Bref, cette th{\`{e}}se {\'{e}}tend les mod{\`{e}}les actuels de l’apprentissage des ganglions de
46 la base et du syst{\`{e}}me dopaminergique au d{\'{e}}veloppement concurrent de
47 repr{\'{e}}sentations temporelles dans le cortex et aux interactions de ces deux structures.
48
49 Throughout lifetime, the brain develops abstract representations of its
50 environment that allow the individual to maximize his benefits. How these
51 representations are developed while trying to acquire rewards remains a mystery. It is
52 reasonable to assume that these representations arise in the cortex and that the basal
53 ganglia are playing an important role in reward maximization. In particular,
54 dopaminergic neurons appear to code a reward prediction error signal. This thesis
55 studies the problem by constructing, using machine learning tools, a computational
56 model that incorporates a number of relevant neurophysiological findings.
57 After an introduction to the machine learning framework and to some of its
58 algorithms, an overview of learning in psychology and neuroscience, and a review of
59 models of learning in the basal ganglia, the thesis comprises three papers. The first
60 article shows that it is possible to learn a better representation of the inputs while
61 learning to maximize reward. The second paper addresses the important and still
62 unresolved problem of the representation of time in the brain. The paper shows that a
63 time representation can be acquired automatically in an artificial neural network
64 acting like a working memory. The representation learned by the model closely
65 resembles the activity of cortical neurons in similar tasks. Moreover, the model shows
66 that the reward prediction error signal could accelerate the development of the
67 temporal representation. Finally, it shows that if such a learned representation exists
68 in the cortex, it could provide the necessary information to the basal ganglia to
69 explain the dopaminergic signal. The third article evaluates the explanatory and
70 predictive power of the model on the effects of differences in task conditions such as
71 the presence or absence of a stimulus (classical versus trace conditioning) while
72 waiting for the reward. Beyond making interesting predictions relevant to the timing
73 literature, the paper reveals some shortcomings of the model that will need to be
74 resolved.
75 In summary, this thesis extends current models of reinforcement learning of
76 the basal ganglia and the dopaminergic system to the concurrent development of
77 representation in the cortex and to the interactions between these two regions.}
78 }
79
80 @MASTERSTHESIS{,
81 author = {Wood, Sean},
82 title = {Non-negative matrix decomposition approaches to frequency domain analysis of music audio signals},
83 year = {2010},
84 school = {Universit{\'{e}} de Montr{\'{e}}al}
85 }
86
87 @TECHREPORT{ARXIV-2010,
88 author = {Bastien, Fr{\'{e}}d{\'{e}}ric and Bengio, Yoshua and Bergeron, Arnaud and Boulanger-Lewandowski, Nicolas and Breuel, Thomas and Chherawala, Youssouf and Cisse, Moustapha and C{\^{o}}t{\'{e}}, Myriam and Erhan, Dumitru and Eustache, Jeremy and Glorot, Xavier and Muller, Xavier and Pannetier Lebeuf, Sylvain and Pascanu, Razvan and Rifai, Salah and Savard, Fran{\c c}ois and Sicard, Guillaume},
89 keywords = {Computer Vision and Pattern Recognition, Learning, Neural and Evolutionary Computing},
90 title = {Deep Self-Taught Learning for Handwritten Character Recognition},
91 number = {1353},
92 year = {2010},
93 institution = {University of Montr{\'{e}}al},
94 abstract = {Recent theoretical and empirical work in statistical machine learning has demonstrated the importance of learning algorithms for deep architectures, i.e., function classes obtained by composing multiple non-linear transformations. Self-taught learning (exploiting unlabeled examples or examples from other distributions) has already been applied to deep learners, but mostly to show the advantage of unlabeled examples. Here we explore the advantage brought by {\em out-of-distribution examples}. For this purpose we developed a powerful generator of stochastic variations and noise processes for character images, including not only affine transformations but also slant, local elastic deformations, changes in thickness, background images, grey level changes, contrast, occlusion, and various types of noise. The out-of-distribution examples are obtained from these highly distorted images or by including examples of object classes different from those in the target test set. We show that {\em deep learners benefit more from out-of-distribution examples than a corresponding shallow learner}, at least in the area of handwritten character recognition. In fact, we show that they beat previously published results and reach human-level performance on both handwritten digit classification and 62-class handwritten character recognition.}
95 }
96
3 @INPROCEEDINGS{Attardi+al-2009, 97 @INPROCEEDINGS{Attardi+al-2009,
4 author = {Attardi, Giuseppe and Dell'Orletta, Felice and Simi, Maria and Turian, Joseph}, 98 author = {Attardi, Giuseppe and Dell'Orletta, Felice and Simi, Maria and Turian, Joseph},
5 keywords = {classifier, dependency parsing, natural language, parser, perceptron}, 99 keywords = {classifier, dependency parsing, natural language, parser, perceptron},
6 title = {Accurate Dependency Parsing with a Stacked Multilayer Perceptron}, 100 title = {Accurate Dependency Parsing with a Stacked Multilayer Perceptron},
7 booktitle = {Proceedings of Evalita 2009}, 101 booktitle = {Proceedings of Evalita 2009},
407 author = {Bengio, Yoshua and Grandvalet, Yves}, 501 author = {Bengio, Yoshua and Grandvalet, Yves},
408 title = {No Unbiased Estimator of the Variance of K-Fold Cross-Validation}, 502 title = {No Unbiased Estimator of the Variance of K-Fold Cross-Validation},
409 volume = {5}, 503 volume = {5},
410 year = {2004}, 504 year = {2004},
411 pages = {1089--1105}, 505 pages = {1089--1105},
412 journal = {Journal of Machine Learning Research}, 506 crossref = {JMLR-shorter},
413 abstract = {Most machine learning researchers perform quantitative experiments to estimate generalization error and compare the performance of different algorithms (in particular, their proposed algorithm). In order to be able to draw statistically convincing conclusions, it is important to estimate the uncertainty of such estimates. This paper studies the very commonly used K-fold cross-validation estimator of generalization performance. The main theorem shows that there exists no universal (valid under all distributions) unbiased estimator of the variance of K-fold cross-validation. The analysis that accompanies this result is based on the eigen-decomposition of the covariance matrix of errors, which has only three different eigenvalues corresponding to three degrees of freedom of the matrix and three components of the total variance. This analysis helps to better understand the nature of the problem and how it can make naive estimators (that don’t take into account the error correlations due to the overlap between training and test sets) grossly underestimate variance. This is confirmed by numerical experiments in which the three components of the variance are compared when the difficulty of the learning problem and the number of folds are varied.}, 507 abstract = {Most machine learning researchers perform quantitative experiments to estimate generalization error and compare the performance of different algorithms (in particular, their proposed algorithm). In order to be able to draw statistically convincing conclusions, it is important to estimate the uncertainty of such estimates. This paper studies the very commonly used K-fold cross-validation estimator of generalization performance. The main theorem shows that there exists no universal (valid under all distributions) unbiased estimator of the variance of K-fold cross-validation. 
The analysis that accompanies this result is based on the eigen-decomposition of the covariance matrix of errors, which has only three different eigenvalues corresponding to three degrees of freedom of the matrix and three components of the total variance. This analysis helps to better understand the nature of the problem and how it can make naive estimators (that don’t take into account the error correlations due to the overlap between training and test sets) grossly underestimate variance. This is confirmed by numerical experiments in which the three components of the variance are compared when the difficulty of the learning problem and the number of folds are varied.},
414 topics={Comparative},cat={J}, 508 topics={Comparative},cat={J},
415 } 509 }
416 510
417 @TECHREPORT{bengio-hyper-TR99, 511 @TECHREPORT{bengio-hyper-TR99,
1087 author = {Bengio, Yoshua and Ducharme, R{\'{e}}jean and Vincent, Pascal and Jauvin, Christian}, 1181 author = {Bengio, Yoshua and Ducharme, R{\'{e}}jean and Vincent, Pascal and Jauvin, Christian},
1088 title = {A Neural Probabilistic Language Model}, 1182 title = {A Neural Probabilistic Language Model},
1089 volume = {3}, 1183 volume = {3},
1090 year = {2003}, 1184 year = {2003},
1091 pages = {1137--1155}, 1185 pages = {1137--1155},
1092 journal = {Journal of Machine Learning Research}, 1186 crossref = {JMLR-shorter},
1093 abstract = {A goal of statistical language modeling is to learn the joint probability function of sequences of words in a language. This is intrinsically difficult because of the curse of dimensionality: a word sequence on which the model will be tested is likely to be different from all the word sequences seen during training. Traditional but very successful approaches based on n-grams obtain generalization by concatenating very short overlapping sequences seen in the training set. We propose to fight the curse of dimensionality by learning a distributed representation for words which allows each training sentence to inform the model about an exponential number of semantically neighboring sentences. The model learns simultaneously (1) a distributed representation for each word along with (2) the probability function for word sequences, expressed in terms of these representations. Generalization is obtained because a sequence of words that has never been seen before gets high probability if it is made of words that are similar (in the sense of having a nearby representation) to words forming an already seen sentence. Training such large models (with millions of parameters) within a reasonable time is itself a significant challenge. We report on experiments using neural networks for the probability function, showing on two text corpora that the proposed approach significantly improves on state-of-the-art n-gram models, and that the proposed approach allows to take advantage of longer contexts.}, 1187 abstract = {A goal of statistical language modeling is to learn the joint probability function of sequences of words in a language. This is intrinsically difficult because of the curse of dimensionality: a word sequence on which the model will be tested is likely to be different from all the word sequences seen during training. 
Traditional but very successful approaches based on n-grams obtain generalization by concatenating very short overlapping sequences seen in the training set. We propose to fight the curse of dimensionality by learning a distributed representation for words which allows each training sentence to inform the model about an exponential number of semantically neighboring sentences. The model learns simultaneously (1) a distributed representation for each word along with (2) the probability function for word sequences, expressed in terms of these representations. Generalization is obtained because a sequence of words that has never been seen before gets high probability if it is made of words that are similar (in the sense of having a nearby representation) to words forming an already seen sentence. Training such large models (with millions of parameters) within a reasonable time is itself a significant challenge. We report on experiments using neural networks for the probability function, showing on two text corpora that the proposed approach significantly improves on state-of-the-art n-gram models, and that the proposed approach allows to take advantage of longer contexts.},
1094 topics={Markov,Unsupervised,Language},cat={J}, 1188 topics={Markov,Unsupervised,Language},cat={J},
1095 } 1189 }
1096 1190
1097 @TECHREPORT{bengio:socs-1990, 1191 @TECHREPORT{bengio:socs-1990,
1230 On all three tasks, both the quadratic interactions and the gentler non-linearity 1324 On all three tasks, both the quadratic interactions and the gentler non-linearity
1231 lead to significantly better generalization. 1325 lead to significantly better generalization.
1232 The advantage of quadratic units was strongest in conjunction with sparse and convolutional hidden units.} 1326 The advantage of quadratic units was strongest in conjunction with sparse and convolutional hidden units.}
1233 } 1327 }
1234 1328
1235 @MISC{bergstra+al:2010-scipy, 1329 @ARTICLE{Bergstra+al-2010,
1236 author = {Bergstra, James}, 1330 author = {Bergstra, James and Bengio, Yoshua and Louradour, Jerome},
1237 title = {Optimized Symbolic Expressions and {GPU} Metaprogramming with Theano}, 1331 title = {Suitability of V1 Energy Models for Object Classification},
1238 year = {2010}, 1332 journal = {Neural Computation},
1239 howpublished = {{SciPy}}, 1333 year = {2010},
1240 note = {Oral} 1334 note = {to appear}
1335 }
1336
1337 @INPROCEEDINGS{bergstra+al:2010-scipy,
1338 author = {Bergstra, James and Breuleux, Olivier and Bastien, Fr{\'{e}}d{\'{e}}ric and Lamblin, Pascal and Pascanu, Razvan and Desjardins, Guillaume and Turian, Joseph and Bengio, Yoshua},
1339 title = {Theano: a {CPU} and {GPU} Math Expression Compiler},
1340 booktitle = {Proceedings of the Python for Scientific Computing Conference ({SciPy})},
1341 year = {2010},
1342 note = {Oral}
1241 } 1343 }
1242 1344
1243 @MISC{bergstra+al:2010-sharcnet, 1345 @MISC{bergstra+al:2010-sharcnet,
1244 author = {Bergstra, James and Bengio, Yoshua}, 1346 author = {Bergstra, James and Bengio, Yoshua},
1245 title = {{GPU} Programming with Theano}, 1347 title = {{GPU} Programming with Theano},
1255 year = {2010}, 1357 year = {2010},
1256 note = {Oral} 1358 note = {Oral}
1257 } 1359 }
1258 1360
1259 @INPROCEEDINGS{Bergstra+Bengio-2009, 1361 @INPROCEEDINGS{Bergstra+Bengio-2009,
1260 author = {Bergstra, James and Bengio, Yoshua}, 1362 author = {Bergstra, James and Bengio, Yoshua},
1261 title = {Slow, Decorrelated Features for Pretraining Complex Cell-like Networks}, 1363 title = {Slow, Decorrelated Features for Pretraining Complex Cell-like Networks},
1262 year = {2009}, 1364 year = {2009},
1263 crossref = {NIPS22} 1365 pages = {99--107},
1366 publisher = {MIT Press},
1367 url = {http://books.nips.cc/papers/files/nips22/NIPS2009_0933.pdf},
1368 crossref = {NIPS22}
1264 } 1369 }
1265 1370
1266 @ARTICLE{bergstra+casagrande+erhan+eck+kegl:2006, 1371 @ARTICLE{bergstra+casagrande+erhan+eck+kegl:2006,
1267 author = {Bergstra, James and Casagrande, Norman and Erhan, Dumitru and Eck, Douglas and K{\'{e}}gl, Bal{\'{a}}zs}, 1372 author = {Bergstra, James and Casagrande, Norman and Erhan, Dumitru and Eck, Douglas and K{\'{e}}gl, Bal{\'{a}}zs},
1268 title = {Aggregate Features and AdaBoost for Music Classification}, 1373 title = {Aggregate Features and AdaBoost for Music Classification},
1277 } 1382 }
1278 1383
1279 @INPROCEEDINGS{bergstra+lacoste+eck:2006, 1384 @INPROCEEDINGS{bergstra+lacoste+eck:2006,
1280 author = {Bergstra, James and Lacoste, Alexandre and Eck, Douglas}, 1385 author = {Bergstra, James and Lacoste, Alexandre and Eck, Douglas},
1281 title = {Predicting Genre Labels for Artists using FreeDB}, 1386 title = {Predicting Genre Labels for Artists using FreeDB},
1282 booktitle = {Proc. 7th International Conference on Music Information Retrieval (ISMIR)}, 1387 booktitle = {Proc. 7th International Conference on Music Information Retrieval ({ISMIR})},
1283 year = {2006}, 1388 year = {2006},
1389 pages = {85--88},
1390 publisher = {University of Victoria},
1284 SOURCE = {OwnPublication}, 1391 SOURCE = {OwnPublication},
1285 PDF = {papers/2006_ismir_freedb.pdf}, 1392 PDF = {papers/2006_ismir_freedb.pdf},
1286 } 1393 }
1287 1394
1288 @INPROCEEDINGS{bergstra+mandel+eck:2010, 1395 @INPROCEEDINGS{bergstra+mandel+eck:2010,
1289 author = {Bergstra, James and Mandel, Michael and Eck, Douglas}, 1396 author = {Bergstra, James and Mandel, Michael and Eck, Douglas},
1290 title = {Scalable Genre and Tag Prediction with Spectral Covariance}, 1397 title = {Scalable Genre and Tag Prediction with Spectral Covariance},
1291 booktitle = {{ISMIR}}, 1398 booktitle = {{ISMIR}},
1292 year = {2010}, 1399 year = {2010},
1293 note = {accepted} 1400 pages = {507--512},
1294 } 1401 }
1295 1402
1296 @MASTERSTHESIS{Bergstra-Msc-2006, 1403 @MASTERSTHESIS{Bergstra-Msc-2006,
1297 author = {Bergstra, James}, 1404 author = {Bergstra, James},
1298 keywords = {apprentissage statistique, classification de musique par genre, extraction de caract{\'{e}}ristiques sonores, recherche d'information musicale}, 1405 keywords = {apprentissage statistique, classification de musique par genre, extraction de caract{\'{e}}ristiques sonores, recherche d'information musicale},
1389 title = {Global Training of Document Processing Systems using Graph Transformer Networks}, 1496 title = {Global Training of Document Processing Systems using Graph Transformer Networks},
1390 booktitle = {Proc. of Computer Vision and Pattern Recognition}, 1497 booktitle = {Proc. of Computer Vision and Pattern Recognition},
1391 year = {1997}, 1498 year = {1997},
1392 pages = {490--494}, 1499 pages = {490--494},
1393 publisher = {IEEE}, 1500 publisher = {IEEE},
1394 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bottou-lecun-bengio-97.ps.gz}, 1501 url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bottou-lecun-bengio-97.pdf},
1395 topics={PriorKnowledge,Speech},cat={C}, 1502 topics={PriorKnowledge,Speech},cat={C},
1396 } 1503 }
1397 1504
1398 @TECHREPORT{bottou96TR, 1505 @TECHREPORT{bottou96TR,
1399 author = {Bottou, {L{\'{e}}on} and Bengio, Yoshua and {LeCun}, Yann}, 1506 author = {Bottou, {L{\'{e}}on} and Bengio, Yoshua and {LeCun}, Yann},
1427 @PHDTHESIS{Boufaden-Phd-2005, 1534 @PHDTHESIS{Boufaden-Phd-2005,
1428 author = {Boufaden, Narj{\`{e}}s}, 1535 author = {Boufaden, Narj{\`{e}}s},
1429 title = {Extraction d'information {\`{a}} partir de transcriptions de conversations t{\'{e}}l{\'{e}}phoniques sp{\'{e}}cialis{\'{e}}es}, 1536 title = {Extraction d'information {\`{a}} partir de transcriptions de conversations t{\'{e}}l{\'{e}}phoniques sp{\'{e}}cialis{\'{e}}es},
1430 year = {2005}, 1537 year = {2005},
1431 school = {Universit{\'{e}} de Montr{\'{e}}al, D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle} 1538 school = {Universit{\'{e}} de Montr{\'{e}}al, D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle}
1539 }
1540
1541 @TECHREPORT{Breuleux+al-TR-2010,
1542 author = {Breuleux, Olivier and Bengio, Yoshua and Vincent, Pascal},
1543 title = {Unlearning for Better Mixing},
1544 number = {1349},
1545 year = {2010},
1546 institution = {Universit{\'{e}} de Montr{\'{e}}al/DIRO},
1547 abstract = {Two learning algorithms were recently proposed – Herding and Fast Persistent Contrastive Divergence (FPCD) – which share the following interesting characteristic: they exploit changes in the model parameters while sampling in order to escape modes and mix better, during the sampling process that is part of the learning algorithm. We first justify such approaches as ways to escape modes while approximately keeping the same asymptotic distribution of the {Markov} chain. We then extend FPCD using an idea borrowed from Herding in order to obtain a pure sampling algorithm and show empirically that this FPCD-sampler yields substantially better samples than Gibbs sampling. Because these algorithms entangle the model and the sampling algorithm and we want to evaluate both (but particularly how well the sampling schemes mix), it is not always easy to evaluate them, so we propose a “black-box” approach based on how well and how quickly the samples generated by a model “cover” the test set examples. We empirically study these algorithms and variations with this perspective and these new evaluation tools in order to better understand their strengths and limitations.}
1432 } 1548 }
1433 1549
1434 @INPROCEEDINGS{Carreau+Bengio-2007, 1550 @INPROCEEDINGS{Carreau+Bengio-2007,
1435 author = {Carreau, Julie and Bengio, Yoshua}, 1551 author = {Carreau, Julie and Bengio, Yoshua},
1436 title = {A Hybrid {Pareto} Model for Conditional Density Estimation of Asymmetric Fat-Tail Data}, 1552 title = {A Hybrid {Pareto} Model for Conditional Density Estimation of Asymmetric Fat-Tail Data},
1442 date={21-24} 1558 date={21-24}
1443 } 1559 }
1444 1560
1445 @ARTICLE{Carreau+Bengio-2009, 1561 @ARTICLE{Carreau+Bengio-2009,
1446 author = {Carreau, Julie and Bengio, Yoshua}, 1562 author = {Carreau, Julie and Bengio, Yoshua},
1447 title = {A Hybrid {Pareto} Mixture for Conditional Asymmetric Fat-Tailed Distributio\ n}, 1563 title = {A Hybrid {Pareto} Mixture for Conditional Asymmetric Fat-Tailed Distribution},
1448 journal = {IEEE Transactions on Neural Networks}, 1564 journal = {IEEE Transactions on Neural Networks},
1449 volume = {20}, 1565 volume = {20},
1450 number = {7}, 1566 number = {7},
1451 year = {2009}, 1567 year = {2009},
1452 pages = {1087--1101}, 1568 pages = {1087--1101},
1567 1683
1568 @ARTICLE{chapados:2003, 1684 @ARTICLE{chapados:2003,
1569 author = {Bengio, Yoshua and Chapados, Nicolas}, 1685 author = {Bengio, Yoshua and Chapados, Nicolas},
1570 title = {Extensions to Metric-Based Model Selection}, 1686 title = {Extensions to Metric-Based Model Selection},
1571 year = {2003}, 1687 year = {2003},
1572 journal = {Journal of Machine Learning Research}, 1688 crossref = {JMLR-shorter},
1573 abstract = {Metric-based methods have recently been introduced for model selection and regularization, often yielding very significant improvements over the alternatives tried (including cross-validation). All these methods require unlabeled data over which to compare functions and detect gross differences in behavior away from the training points. We introduce three new extensions of the metric model selection methods and apply them to feature selection. The first extension takes advantage of the particular case of time-series data in which the task involves prediction with a horizon h. The idea is to use at t the h unlabeled examples that precede t for model selection. The second extension takes advantage of the different error distributions of cross-validation and the metric methods: cross-validation tends to have a larger variance and is unbiased. A hybrid combining the two model selection methods is rarely beaten by any of the two methods. The third extension deals with the case when unlabeled data is not available at all, using an estimated input density. Experiments are described to study these extensions in the context of capacity control and feature subset selection.}, 1689 abstract = {Metric-based methods have recently been introduced for model selection and regularization, often yielding very significant improvements over the alternatives tried (including cross-validation). All these methods require unlabeled data over which to compare functions and detect gross differences in behavior away from the training points. We introduce three new extensions of the metric model selection methods and apply them to feature selection. The first extension takes advantage of the particular case of time-series data in which the task involves prediction with a horizon h. The idea is to use at t the h unlabeled examples that precede t for model selection. 
The second extension takes advantage of the different error distributions of cross-validation and the metric methods: cross-validation tends to have a larger variance and is unbiased. A hybrid combining the two model selection methods is rarely beaten by any of the two methods. The third extension deals with the case when unlabeled data is not available at all, using an estimated input density. Experiments are described to study these extensions in the context of capacity control and feature subset selection.},
1574 topics={ModelSelection,Finance},cat={J}, 1690 topics={ModelSelection,Finance},cat={J},
1575 } 1691 }
1576 1692
1577 @ARTICLE{chapelle:2001, 1693 @ARTICLE{chapelle:2001,
1787 } 1903 }
1788 1904
1789 @INPROCEEDINGS{Desjardins+al-2010, 1905 @INPROCEEDINGS{Desjardins+al-2010,
1790 author = {Desjardins, Guillaume and Courville, Aaron and Bengio, Yoshua}, 1906 author = {Desjardins, Guillaume and Courville, Aaron and Bengio, Yoshua},
1791 title = {Tempered {Markov} Chain Monte Carlo for training of Restricted {Boltzmann} Machine}, 1907 title = {Tempered {Markov} Chain Monte Carlo for training of Restricted {Boltzmann} Machine},
1792 booktitle = {Proceedings of AISTATS 2010}, 1908 booktitle = {JMLR W\&CP: Proceedings of the Thirteenth International Conference on Artificial Intelligence and Statistics (AISTATS 2010)},
1793 volume = {9}, 1909 volume = {9},
1794 year = {2010}, 1910 year = {2010},
1795 pages = {145-152}, 1911 pages = {145--152},
1796 abstract = {Alternating Gibbs sampling is the most common scheme used for sampling from Restricted {Boltzmann} Machines (RBM), a crucial component in deep architectures such as Deep Belief Networks. However, we find that it often does a very poor job of rendering the diversity of modes captured by the trained model. We suspect that this hinders the advantage that could in principle be brought by training algorithms relying on Gibbs sampling for uncovering spurious modes, such as the Persistent Contrastive Divergence algorithm. To alleviate this problem, we explore the use of tempered {Markov} Chain Monte-Carlo for sampling in RBMs. We find both through visualization of samples and measures of likelihood on a toy dataset that it helps both sampling and learning.} 1912 abstract = {Alternating Gibbs sampling is the most common scheme used for sampling from Restricted {Boltzmann} Machines (RBM), a crucial component in deep architectures such as Deep Belief Networks. However, we find that it often does a very poor job of rendering the diversity of modes captured by the trained model. We suspect that this hinders the advantage that could in principle be brought by training algorithms relying on Gibbs sampling for uncovering spurious modes, such as the Persistent Contrastive Divergence algorithm. To alleviate this problem, we explore the use of tempered {Markov} Chain Monte-Carlo for sampling in RBMs. We find both through visualization of samples and measures of likelihood on a toy dataset that it helps both sampling and learning.}
1797 } 1913 }
1798 1914
1799 @TECHREPORT{Desjardins-2008, 1915 @TECHREPORT{Desjardins-2008,
1800 author = {Desjardins, Guillaume and Bengio, Yoshua}, 1916 author = {Desjardins, Guillaume and Bengio, Yoshua},
2253 author = {Erhan, Dumitru and Bengio, Yoshua and Courville, Aaron and Manzagol, Pierre-Antoine and Vincent, Pascal and Bengio, Samy}, 2369 author = {Erhan, Dumitru and Bengio, Yoshua and Courville, Aaron and Manzagol, Pierre-Antoine and Vincent, Pascal and Bengio, Samy},
2254 title = {Why Does Unsupervised Pre-training Help Deep Learning?}, 2370 title = {Why Does Unsupervised Pre-training Help Deep Learning?},
2255 volume = {11}, 2371 volume = {11},
2256 year = {2010}, 2372 year = {2010},
2257 pages = {625--660}, 2373 pages = {625--660},
2258 journal = {Journal of Machine Learning Research}, 2374 crossref = {JMLR-shorter},
2259 abstract = {Much recent research has been devoted to learning algorithms for deep architectures such as Deep Belief Networks and stacks of auto-encoder variants, with impressive results obtained in several areas, mostly on vision and language datasets. The best results obtained on supervised learning tasks involve an unsupervised learning component, usually in an unsupervised pre-training phase. Even though these new algorithms have enabled training deep models, many questions remain as to the nature of this difficult learning problem. The main question investigated here is the following: why does unsupervised pre-training work and why does it work so well? Answering these questions is important if learning in deep architectures is to be further improved. We propose several explanatory hypotheses and test them through extensive simulations. We empirically show the influence of pre-training with respect to architecture depth, model capacity, and number of training examples. The experiments confirm and clarify the advantage of unsupervised pre-training. The results suggest that unsupervised pre-training guides the learning towards basins of attraction of minima that are better in terms of the underlying data distribution; the evidence from these results supports a regularization explanation for the effect of pre-training.} 2375 abstract = {Much recent research has been devoted to learning algorithms for deep architectures such as Deep Belief Networks and stacks of auto-encoder variants, with impressive results obtained in several areas, mostly on vision and language datasets. The best results obtained on supervised learning tasks involve an unsupervised learning component, usually in an unsupervised pre-training phase. Even though these new algorithms have enabled training deep models, many questions remain as to the nature of this difficult learning problem. 
The main question investigated here is the following: why does unsupervised pre-training work and why does it work so well? Answering these questions is important if learning in deep architectures is to be further improved. We propose several explanatory hypotheses and test them through extensive simulations. We empirically show the influence of pre-training with respect to architecture depth, model capacity, and number of training examples. The experiments confirm and clarify the advantage of unsupervised pre-training. The results suggest that unsupervised pre-training guides the learning towards basins of attraction of minima that are better in terms of the underlying data distribution; the evidence from these results supports a regularization explanation for the effect of pre-training.}
2260 } 2376 }
2261 2377
2262 @INPROCEEDINGS{Erhan-aistats-2010, 2378 @INPROCEEDINGS{Erhan-aistats-2010,
2263 author = {Erhan, Dumitru and Courville, Aaron and Bengio, Yoshua and Vincent, Pascal}, 2379 author = {Erhan, Dumitru and Courville, Aaron and Bengio, Yoshua and Vincent, Pascal},
2296 concepts de l'apprentissage multit{\^{a}}che. Du point de vue pratique, notre 2412 concepts de l'apprentissage multit{\^{a}}che. Du point de vue pratique, notre
2297 apport est l'utilisation de notre algorithme par les compagnies 2413 apport est l'utilisation de notre algorithme par les compagnies
2298 pharmaceutiques dans leur d{\'{e}}couverte de nouveaux m{\'{e}}dicaments.} 2414 pharmaceutiques dans leur d{\'{e}}couverte de nouveaux m{\'{e}}dicaments.}
2299 } 2415 }
2300 2416
2417 @TECHREPORT{Erhan-vis-techreport-2010,
2418 author = {Erhan, Dumitru and Courville, Aaron and Bengio, Yoshua},
2419 title = {Understanding Representations Learned in Deep Architectures},
2420 number = {1355},
2421 year = {2010},
2422 institution = {Universit{\'{e}} de Montr{\'{e}}al/DIRO},
2423 abstract = {Deep architectures have demonstrated state-of-the-art performance in a variety of
2424 settings, especially with vision datasets. Deep learning algorithms are based on learning
2425 several levels of representation of the input. Beyond test-set performance, there
2426 is a need for qualitative comparisons of the solutions learned by various deep architectures,
2427 focused on those learned representations. One of the goals of our research
2428 is to improve tools for finding good qualitative interpretations of high level features
2429 learned by such models. We also seek to gain insight into the invariances learned by
2430 deep networks. To this end, we contrast and compare several techniques for finding
2431 such interpretations. We applied our techniques on Stacked Denoising Auto-Encoders
2432 and Deep Belief Networks, trained on several vision datasets. We show that consistent
2433 filter-like interpretation is possible and simple to accomplish at the unit level. The tools
2434 developed make it possible to analyze deep models in more depth and accomplish the
2435 tracing of invariance manifolds for each of the hidden units. We hope that such techniques
2436 will allow researchers in deep architectures to understand more of how and why
2437 deep architectures work.}
2438 }
2439
2301 @INPROCEEDINGS{Erhan2009, 2440 @INPROCEEDINGS{Erhan2009,
2302 author = {Erhan, Dumitru and Manzagol, Pierre-Antoine and Bengio, Yoshua and Bengio, Samy and Vincent, Pascal}, 2441 author = {Erhan, Dumitru and Manzagol, Pierre-Antoine and Bengio, Yoshua and Bengio, Samy and Vincent, Pascal},
2303 keywords = {Deep Networks}, 2442 keywords = {Deep Networks},
2304 title = {The Difficulty of Training Deep Architectures and the Effect of Unsupervised Pre-Training}, 2443 title = {The Difficulty of Training Deep Architectures and the Effect of Unsupervised Pre-Training},
2305 year = {2009}, 2444 year = {2009},
2750 author = {Bergstra, James and Bengio, Yoshua and Louradour, Jerome}, 2889 author = {Bergstra, James and Bengio, Yoshua and Louradour, Jerome},
2751 title = {Image Classification using Higher-Order Neural Models}, 2890 title = {Image Classification using Higher-Order Neural Models},
2752 year = {2008}, 2891 year = {2008},
2753 howpublished = {The Learning Workshop (Snowbird, Utah)}, 2892 howpublished = {The Learning Workshop (Snowbird, Utah)},
2754 url = {http://snowbird.djvuzone.org/2007/abstracts/161.pdf} 2893 url = {http://snowbird.djvuzone.org/2007/abstracts/161.pdf}
2894 }
2895
2896 @ARTICLE{JMLR-short,
2897 journal = {JMLR},
2898 year = {-1}
2755 } 2899 }
2756 2900
2757 2901
2758 @INPROCEEDINGS{Kegl+Bertin+Eck-2008, 2902 @INPROCEEDINGS{Kegl+Bertin+Eck-2008,
2759 author = {K{\'{e}}gl, Bal{\'{a}}zs and Bertin-Mahieux, Thierry and Eck, Douglas}, 2903 author = {K{\'{e}}gl, Bal{\'{a}}zs and Bertin-Mahieux, Thierry and Eck, Douglas},
2831 2975
2832 @ARTICLE{Larochelle+al-2010, 2976 @ARTICLE{Larochelle+al-2010,
2833 author = {Larochelle, Hugo and Bengio, Yoshua and Turian, Joseph}, 2977 author = {Larochelle, Hugo and Bengio, Yoshua and Turian, Joseph},
2834 title = {Tractable Multivariate Binary Density Estimation and the Restricted {Boltzmann} Forest}, 2978 title = {Tractable Multivariate Binary Density Estimation and the Restricted {Boltzmann} Forest},
2835 journal = {Neural Computation}, 2979 journal = {Neural Computation},
2980 volume = {22},
2981 number = {9},
2836 year = {2010}, 2982 year = {2010},
2837 note = {To appear} 2983 pages = {2285--2307}
2838 } 2984 }
2839 2985
2840 @INPROCEEDINGS{Larochelle+Bengio-2008, 2986 @INPROCEEDINGS{Larochelle+Bengio-2008,
2841 author = {Larochelle, Hugo and Bengio, Yoshua}, 2987 author = {Larochelle, Hugo and Bengio, Yoshua},
2842 title = {Classification using Discriminative Restricted {B}oltzmann Machines}, 2988 title = {Classification using Discriminative Restricted {B}oltzmann Machines},
2863 author = {Larochelle, Hugo and Bengio, Yoshua and Louradour, Jerome and Lamblin, Pascal}, 3009 author = {Larochelle, Hugo and Bengio, Yoshua and Louradour, Jerome and Lamblin, Pascal},
2864 title = {Exploring Strategies for Training Deep Neural Networks}, 3010 title = {Exploring Strategies for Training Deep Neural Networks},
2865 volume = {10}, 3011 volume = {10},
2866 year = {2009}, 3012 year = {2009},
2867 pages = {1--40}, 3013 pages = {1--40},
2868 journal = {Journal of Machine Learning Research}, 3014 crossref = {JMLR-shorter},
2869 abstract = {Deep multi-layer neural networks have many levels of non-linearities allowing them to compactly represent highly non-linear and highly-varying functions. However, until recently it was not clear how to train such deep networks, since gradient-based optimization starting from random initialization often appears to get stuck in poor solutions. Hinton et al. recently proposed a greedy layer-wise unsupervised learning procedure relying on the training algorithm of restricted {Boltzmann} machines (RBM) to initialize the parameters of a deep belief network (DBN), a generative model with many layers of hidden causal variables. This was followed by the proposal of another greedy layer-wise procedure, relying on the usage of autoassociator networks. In the context of the above optimization problem, we study these algorithms empirically to better understand their success. Our experiments confirm the hypothesis that the greedy layer-wise unsupervised training strategy helps the optimization by initializing weights in a region near a good local minimum, but also implicitly acts as a sort of regularization that brings better generalization and encourages internal distributed representations that are high-level abstractions of the input. We also present a series of experiments aimed at evaluating the link between the performance of deep neural networks and practical aspects of their topology, for example, demonstrating cases where the addition of more depth helps. Finally, we empirically explore simple variants of these training algorithms, such as the use of different RBM input unit distributions, a simple way of combining gradient estimators to improve performance, as well as on-line versions of those algorithms.} 3015 abstract = {Deep multi-layer neural networks have many levels of non-linearities allowing them to compactly represent highly non-linear and highly-varying functions. 
However, until recently it was not clear how to train such deep networks, since gradient-based optimization starting from random initialization often appears to get stuck in poor solutions. Hinton et al. recently proposed a greedy layer-wise unsupervised learning procedure relying on the training algorithm of restricted {Boltzmann} machines (RBM) to initialize the parameters of a deep belief network (DBN), a generative model with many layers of hidden causal variables. This was followed by the proposal of another greedy layer-wise procedure, relying on the usage of autoassociator networks. In the context of the above optimization problem, we study these algorithms empirically to better understand their success. Our experiments confirm the hypothesis that the greedy layer-wise unsupervised training strategy helps the optimization by initializing weights in a region near a good local minimum, but also implicitly acts as a sort of regularization that brings better generalization and encourages internal distributed representations that are high-level abstractions of the input. We also present a series of experiments aimed at evaluating the link between the performance of deep neural networks and practical aspects of their topology, for example, demonstrating cases where the addition of more depth helps. Finally, we empirically explore simple variants of these training algorithms, such as the use of different RBM input unit distributions, a simple way of combining gradient estimators to improve performance, as well as on-line versions of those algorithms.}
2870 } 3016 }
2871 3017
2872 @PHDTHESIS{Larochelle-PhD-2009, 3018 @PHDTHESIS{Larochelle-PhD-2009,
2873 author = {Larochelle, Hugo}, 3019 author = {Larochelle, Hugo},
3027 3173
3028 @ARTICLE{LeRoux+Bengio-2010, 3174 @ARTICLE{LeRoux+Bengio-2010,
3029 author = {Le Roux, Nicolas and Bengio, Yoshua}, 3175 author = {Le Roux, Nicolas and Bengio, Yoshua},
3030 title = {Deep Belief Networks are Compact Universal Approximators}, 3176 title = {Deep Belief Networks are Compact Universal Approximators},
3031 journal = {Neural Computation}, 3177 journal = {Neural Computation},
3178 volume = {22},
3179 number = {8},
3032 year = {2010}, 3180 year = {2010},
3033 note = {To appear} 3181 pages = {2192--2207},
3182 issn = {0899-7667},
3183 abstract = {Deep Belief Networks (DBN) are generative models with many layers of hidden causal variables, recently introduced by Hinton et al. (2006), along with a greedy layer-wise unsupervised learning algorithm. Building on Le Roux and Bengio (2008) and Sutskever and Hinton (2008), we show that deep but narrow generative networks do not require more parameters than shallow ones to achieve universal approximation. Exploiting the proof technique, we prove that deep but narrow feed-forward neural networks with sigmoidal units can represent any Boolean expression.}
3034 } 3184 }
3035 3185
3036 @TECHREPORT{LeRoux-Bengio-2007-TR, 3186 @TECHREPORT{LeRoux-Bengio-2007-TR,
3037 author = {Le Roux, Nicolas and Bengio, Yoshua}, 3187 author = {Le Roux, Nicolas and Bengio, Yoshua},
3038 title = {Representational Power of Restricted {B}oltzmann Machines and Deep Belief Networks}, 3188 title = {Representational Power of Restricted {B}oltzmann Machines and Deep Belief Networks},
3984 @ARTICLE{Sonnenburg+al-2007, 4134 @ARTICLE{Sonnenburg+al-2007,
3985 author = {Sonnenburg, Soeren and et al. and Vincent, Pascal}, 4135 author = {Sonnenburg, Soeren and et al. and Vincent, Pascal},
3986 title = {The Need for Open Source Software in Machine Learning.}, 4136 title = {The Need for Open Source Software in Machine Learning.},
3987 year = {2007}, 4137 year = {2007},
3988 note = {institution: Fraunhofer Publica [http://publica.fraunhofer.de/oai.har] (Germany)}, 4138 note = {institution: Fraunhofer Publica [http://publica.fraunhofer.de/oai.har] (Germany)},
3989 journal = {Journal of Machine Learning Research}, 4139 crossref = {JMLR-shorter},
3990 abstract = {all authors: Sonnenburg, S. and Braun, M.L. and Ong, C.S. and Bengio, S. and Bottou, L. and Holmes, G. and {LeCun}, Y. and M{\"{u}}ller, K.-R. and Pereira, F. and Rasmussen, C.E. and R{\"{a}}tsch, G. and Sch{\"{o}}lkopf, B. and Smola, A. and Vincent, P. and Weston, J. and Williamson, R.C. 4140 abstract = {all authors: Sonnenburg, S. and Braun, M.L. and Ong, C.S. and Bengio, S. and Bottou, L. and Holmes, G. and {LeCun}, Y. and M{\"{u}}ller, K.-R. and Pereira, F. and Rasmussen, C.E. and R{\"{a}}tsch, G. and Sch{\"{o}}lkopf, B. and Smola, A. and Vincent, P. and Weston, J. and Williamson, R.C.
3991 4141
3992 Open source tools have recently reached a level of maturity which makes them suitable for building large-scale real-world systems. At the same time, the field of machine learning has developed a large body of powerful learning algorithms for diverse applications. However, the true potential of these methods is not used, since existing implementations are not openly shared, resulting in software with low usability, and weak interoperability. We argue that this situation can be significantly improved by increasing incentives for researchers to publish their software under an open source model. Additionally, we outline the problems authors are faced with when trying to publish algorithmic implementations of machine learning methods. We believe that a resource of peer reviewed software accompanied by short articles would be highly valuable to both the machine learning and the general scientific community.} 4142 Open source tools have recently reached a level of maturity which makes them suitable for building large-scale real-world systems. At the same time, the field of machine learning has developed a large body of powerful learning algorithms for diverse applications. However, the true potential of these methods is not used, since existing implementations are not openly shared, resulting in software with low usability, and weak interoperability. We argue that this situation can be significantly improved by increasing incentives for researchers to publish their software under an open source model. Additionally, we outline the problems authors are faced with when trying to publish algorithmic implementations of machine learning methods. We believe that a resource of peer reviewed software accompanied by short articles would be highly valuable to both the machine learning and the general scientific community.}
3993 } 4143 }
3994 4144
4000 number = {10}, 4150 number = {10},
4001 year = {2002}, 4151 year = {2002},
4002 pages = {2469--2496}, 4152 pages = {2469--2496},
4003 abstract = {In the presence of a heavy-tail noise distribution, regression becomes much more difficult. Traditional robust regression methods assume that the noise distribution is symmetric and they down-weight the influence of so-called outliers. When the noise distribution is asymmetric these methods yield biased regression estimators. Motivated by data-mining problems for the insurance industry, we propose in this paper a new approach to robust regression that is tailored to deal with the case where the noise distribution is asymmetric. The main idea is to learn most of the parameters of the model using conditional quantile estimators (which are biased but robust estimators of the regression), and to learn a few remaining parameters to combine and correct these estimators, to unbiasedly minimize the average squared error. Theoretical analysis and experiments show the clear advantages of the approach. Results are on artificial data as well as real insurance data, using both linear and neural-network predictors.}, 4153 abstract = {In the presence of a heavy-tail noise distribution, regression becomes much more difficult. Traditional robust regression methods assume that the noise distribution is symmetric and they down-weight the influence of so-called outliers. When the noise distribution is asymmetric these methods yield biased regression estimators. Motivated by data-mining problems for the insurance industry, we propose in this paper a new approach to robust regression that is tailored to deal with the case where the noise distribution is asymmetric. The main idea is to learn most of the parameters of the model using conditional quantile estimators (which are biased but robust estimators of the regression), and to learn a few remaining parameters to combine and correct these estimators, to unbiasedly minimize the average squared error. Theoretical analysis and experiments show the clear advantages of the approach. 
Results are on artificial data as well as real insurance data, using both linear and neural-network predictors.},
4004 topics={Mining},cat={J}, 4154 topics={Mining},cat={J},
4155 }
4156
4157 @PHDTHESIS{ThesisChapados2010,
4158 author = {Chapados, Nicolas},
4159 title = {Sequential Machine Learning Approaches for Portfolio Management},
4160 year = {2010},
4161 school = {Universit{\'{e}} de Montr{\'{e}}al},
4162 abstract = {[English follow]
4163 Cette th{\`{e}}se envisage un ensemble de m{\'{e}}thodes permettant aux algorithmes d'apprentissage statistique de mieux traiter la nature s{\'{e}}quentielle des probl{\`{e}}mes de gestion de portefeuilles financiers. Nous d{\'{e}}butons par une consid{\'{e}}ration du probl{\`{e}}me g{\'{e}}n{\'{e}}ral de la composition d'algorithmes d'apprentissage devant g{\'{e}}rer des t{\^{a}}ches s{\'{e}}quentielles, en particulier celui de la mise-{\`{a}}-jour efficace des ensembles d'apprentissage dans un cadre de validation s{\'{e}}quentielle. Nous {\'{e}}num{\'{e}}rons les desiderata que des primitives de composition doivent satisfaire, et faisons ressortir la difficult{\'{e}} de les atteindre de fa{\c c}on rigoureuse et efficace. Nous poursuivons en pr{\'{e}}sentant un ensemble d'algorithmes qui atteignent ces objectifs et pr{\'{e}}sentons une {\'{e}}tude de cas d'un syst{\`{e}}me complexe de prise de d{\'{e}}cision financi{\`{e}}re utilisant ces techniques. Nous d{\'{e}}crivons ensuite une m{\'{e}}thode g{\'{e}}n{\'{e}}rale permettant de transformer un probl{\`{e}}me de d{\'{e}}cision s{\'{e}}quentielle non-Markovien en un probl{\`{e}}me d'apprentissage supervis{\'{e}} en employant un algorithme de recherche bas{\'{e}} sur les K meilleurs chemins. Nous traitons d'une application en gestion de portefeuille o{\`{u}} nous entra{\^{\i}}nons un algorithme d'apprentissage {\`{a}} optimiser directement un ratio de Sharpe (ou autre crit{\`{e}}re non-additif incorporant une aversion au risque). Nous illustrons l'approche par une {\'{e}}tude exp{\'{e}}rimentale approfondie, proposant une architecture de r{\'{e}}seaux de neurones sp{\'{e}}cialis{\'{e}}e {\`{a}} la gestion de portefeuille et la comparant {\`{a}} plusieurs alternatives. 
Finalement, nous introduisons une repr{\'{e}}sentation fonctionnelle de s{\'{e}}ries chronologiques permettant {\`{a}} des pr{\'{e}}visions d'{\^{e}}tre effectu{\'{e}}es sur un horizon variable, tout en utilisant un ensemble informationnel r{\'{e}}v{\'{e}}l{\'{e}} de mani{\`{e}}re progressive. L'approche est bas{\'{e}}e sur l'utilisation des processus Gaussiens, lesquels fournissent une matrice de covariance compl{\`{e}}te entre tous les points pour lesquels une pr{\'{e}}vision est demand{\'{e}}e. Cette information est utilis{\'{e}}e {\`{a}} bon escient par un algorithme qui transige activement des {\'{e}}carts de cours (price spreads) entre des contrats {\`{a}} terme sur commodit{\'{e}}s. L'approche propos{\'{e}}e produit, hors {\'{e}}chantillon, un rendement ajust{\'{e}} pour le risque significatif, apr{\`{e}}s frais de transactions, sur un portefeuille de 30 actifs.
4164 This thesis considers a number of approaches to make machine learning algorithms better suited to the sequential nature of financial portfolio management tasks. We start by considering the problem of the general composition of learning algorithms that must handle temporal learning tasks, in particular that of creating and efficiently updating the training sets in a sequential simulation framework. We enumerate the desiderata that composition primitives should satisfy, and underscore the difficulty of rigorously and efficiently reaching them. We follow by introducing a set of algorithms that accomplish the desired objectives, presenting a case-study of a real-world complex learning system for financial decision-making that uses those techniques. We then describe a general method to transform a non-Markovian sequential decision problem into a supervised learning problem using a K-best paths search algorithm. We consider an application in financial portfolio management where we train a learning algorithm to directly optimize a Sharpe Ratio (or other risk-averse non-additive) utility function. We illustrate the approach by demonstrating extensive experimental results using a neural network architecture specialized for portfolio management and compare against well-known alternatives. Finally, we introduce a functional representation of time series which allows forecasts to be performed over an unspecified horizon with progressively-revealed information sets. By virtue of using Gaussian processes, a complete covariance matrix between forecasts at several time-steps is available. This information is put to use in an application to actively trade price spreads between commodity futures contracts. The approach delivers impressive out-of-sample risk-adjusted returns after transaction costs on a portfolio of 30 spreads.}
4005 } 4165 }
4006 4166
4007 @ARTICLE{Thierry+al-2008, 4167 @ARTICLE{Thierry+al-2008,
4008 author = {Bertin-Mahieux, Thierry and Eck, Douglas and Maillet, Fran{\c c}ois and Lamere, Paul}, 4168 author = {Bertin-Mahieux, Thierry and Eck, Douglas and Maillet, Fran{\c c}ois and Lamere, Paul},
4009 title = {Autotagger: A Model For Predicting Social Tags from Acoustic Features on Large Music Databases}, 4169 title = {Autotagger: A Model For Predicting Social Tags from Acoustic Features on Large Music Databases},
4322 @INPROCEEDINGS{Turian+al-2009, 4482 @INPROCEEDINGS{Turian+al-2009,
4323 author = {Turian, Joseph and Bergstra, James and Bengio, Yoshua}, 4483 author = {Turian, Joseph and Bergstra, James and Bengio, Yoshua},
4324 title = {Quadratic Features and Deep Architectures for Chunking}, 4484 title = {Quadratic Features and Deep Architectures for Chunking},
4325 booktitle = {North American Chapter of the Association for Computational Linguistics - Human Language Technologies (NAACL HLT)}, 4485 booktitle = {North American Chapter of the Association for Computational Linguistics - Human Language Technologies (NAACL HLT)},
4326 year = {2009}, 4486 year = {2009},
4487 pages = {245--248},
4488 publisher = {Association for Computational Linguistics},
4489 url = {http://www.aclweb.org/anthology/N/N09/N09-2062},
4327 abstract = {We experiment with several chunking models. Deeper architectures achieve better generalization. Quadratic filters, a simplification of theoretical model of V1 complex cells, reliably increase accuracy. In fact, logistic regression with quadratic filters outperforms a standard single hidden layer neural network. Adding quadratic filters to logistic regression is almost as effective as feature engineering. Despite predicting each output label independently, our model is competitive with ones that use previous decisions.} 4490 abstract = {We experiment with several chunking models. Deeper architectures achieve better generalization. Quadratic filters, a simplification of theoretical model of V1 complex cells, reliably increase accuracy. In fact, logistic regression with quadratic filters outperforms a standard single hidden layer neural network. Adding quadratic filters to logistic regression is almost as effective as feature engineering. Despite predicting each output label independently, our model is competitive with ones that use previous decisions.}
4328 } 4491 }
4329 4492
4330 @INPROCEEDINGS{Turian+al-2010, 4493 @INPROCEEDINGS{Turian+al-2010,
4331 author = {Turian, Joseph and Ratinov, Lev and Bengio, Yoshua and Roth, Dan}, 4494 author = {Turian, Joseph and Ratinov, Lev and Bengio, Yoshua and Roth, Dan},
4337 } 4500 }
4338 4501
4339 @INPROCEEDINGS{Turian+Ratinov+Bengio-2010, 4502 @INPROCEEDINGS{Turian+Ratinov+Bengio-2010,
4340 author = {Turian, Joseph and Ratinov, Lev and Bengio, Yoshua}, 4503 author = {Turian, Joseph and Ratinov, Lev and Bengio, Yoshua},
4341 title = {Word representations: A simple and general method for semi-supervised learning}, 4504 title = {Word representations: A simple and general method for semi-supervised learning},
4342 booktitle = {Association for Computational Linguistics (ACL2010)}, 4505 booktitle = {Proceedings of the 48th Annual Meeting of the Association for Computational Linguistics (ACL2010)},
4343 year = {2010} 4506 year = {2010},
4507 pages = {384--394},
4508 publisher = {Association for Computational Linguistics},
4344 } 4509 }
4345 4510
4346 @INPROCEEDINGS{Vincent-Bengio-2003, 4511 @INPROCEEDINGS{Vincent-Bengio-2003,
4347 author = {Vincent, Pascal and Bengio, Yoshua}, 4512 author = {Vincent, Pascal and Bengio, Yoshua},
4348 title = {Manifold Parzen Windows}, 4513 title = {Manifold Parzen Windows},
4349 year = {2003}, 4514 year = {2003},
4350 pages = {825--832}, 4515 pages = {825--832},
4351 crossref = {NIPS15-shorter}, 4516 crossref = {NIPS15-shorter},
4352 abstract = {The similarity between objects is a fundamental element of many learning algorithms. Most non-parametric methods take this similarity to be fixed, but much recent work has shown the advantages of learning it, in particular to exploit the local invariances in the data or to capture the possibly non-linear manifold on which most of the data lies. We propose a new non-parametric kernel density estimation method which captures the local structure of an underlying manifold through the leading eigenvectors of regularized local covariance matrices. Experiments in density estimation show significant improvements with respect to Parzen density estimators. The density estimators can also be used within Bayes classifiers, yielding classification rates similar to {SVM}s and much superior to the Parzen classifier.}, 4517 abstract = {The similarity between objects is a fundamental element of many learning algorithms. Most non-parametric methods take this similarity to be fixed, but much recent work has shown the advantages of learning it, in particular to exploit the local invariances in the data or to capture the possibly non-linear manifold on which most of the data lies. We propose a new non-parametric kernel density estimation method which captures the local structure of an underlying manifold through the leading eigenvectors of regularized local covariance matrices. Experiments in density estimation show significant improvements with respect to Parzen density estimators. The density estimators can also be used within Bayes classifiers, yielding classification rates similar to {SVM}s and much superior to the Parzen classifier.},
4353 topics={HighDimensional,Kernel,Unsupervised},cat={C}, 4518 topics={HighDimensional,Kernel,Unsupervised},cat={C},
4519 }
4520
4521 @ARTICLE{Vincent-JMLR-2010,
4522 author = {Vincent, Pascal and Larochelle, Hugo and Lajoie, Isabelle and Bengio, Yoshua and Manzagol, Pierre-Antoine},
4523 title = {Stacked Denoising Autoencoders: learning useful representations in a deep network with a local denoising criterion},
4524 journal = {JMLR},
4525 year = {2010},
4526 note = {to appear}
4354 } 4527 }
4355 4528
4356 @TECHREPORT{Vincent-TR1316, 4529 @TECHREPORT{Vincent-TR1316,
4357 author = {Vincent, Pascal and Larochelle, Hugo and Bengio, Yoshua and Manzagol, Pierre-Antoine}, 4530 author = {Vincent, Pascal and Larochelle, Hugo and Bengio, Yoshua and Manzagol, Pierre-Antoine},
4358 title = {Extracting and Composing Robust Features with Denoising Autoencoders}, 4531 title = {Extracting and Composing Robust Features with Denoising Autoencoders},
4459 booktitle = {Advances in Neural Information Processing Systems 8 (NIPS'95)}, 4632 booktitle = {Advances in Neural Information Processing Systems 8 (NIPS'95)},
4460 year = {-1}, 4633 year = {-1},
4461 publisher = {MIT Press} 4634 publisher = {MIT Press}
4462 } 4635 }
4463 4636
4637 @ARTICLE{JMLR,
4638 journal = {Journal of Machine Learning Research},
4639 year = {-1}
4640 }
4641
4464 @INPROCEEDINGS{NIPS19, 4642 @INPROCEEDINGS{NIPS19,
4465 editor = {{Sch{\"{o}}lkopf}, Bernhard and Platt, John and Hoffman, Thomas}, 4643 editor = {{Sch{\"{o}}lkopf}, Bernhard and Platt, John and Hoffman, Thomas},
4466 title = {Advances in Neural Information Processing Systems 19 (NIPS'06)}, 4644 title = {Advances in Neural Information Processing Systems 19 (NIPS'06)},
4467 booktitle = {Advances in Neural Information Processing Systems 19 (NIPS'06)}, 4645 booktitle = {Advances in Neural Information Processing Systems 19 (NIPS'06)},
4468 year = {-1}, 4646 year = {-1},
4550 title = {Advances in Neural Information Processing Systems 17 (NIPS'04)}, 4728 title = {Advances in Neural Information Processing Systems 17 (NIPS'04)},
4551 booktitle = {Advances in Neural Information Processing Systems 17 (NIPS'04)}, 4729 booktitle = {Advances in Neural Information Processing Systems 17 (NIPS'04)},
4552 year = {-1} 4730 year = {-1}
4553 } 4731 }
4554 4732
4555 @INPROCEEDINGS{ICML08, 4733 @PROCEEDINGS{ICML08,
4556 editor = {Cohen, William W. and McCallum, Andrew and Roweis, Sam T.}, 4734 editor = {Cohen, William W. and McCallum, Andrew and Roweis, Sam T.},
4557 title = {Proceedings of the Twenty-fifth International Conference on Machine Learning (ICML'08)}, 4735 title = {Proceedings of the Twenty-fifth International Conference on Machine Learning (ICML'08)},
4558 booktitle = {Proceedings of the Twenty-fifth International Conference on Machine Learning (ICML'08)}, 4736 booktitle = {Proceedings of the Twenty-fifth International Conference on Machine Learning (ICML'08)},
4559 year = {-1}, 4737 year = {2008},
4560 publisher = {ACM} 4738 publisher = {ACM}
4561 } 4739 }
4562 4740
4563 @INPROCEEDINGS{ICML07, 4741 @PROCEEDINGS{ICML07,
4564 editor = {Ghahramani, Zoubin}, 4742 editor = {Ghahramani, Zoubin},
4565 title = {Proceedings of the 24th International Conference on Machine Learning (ICML'07)}, 4743 title = {Proceedings of the 24th International Conference on Machine Learning (ICML'07)},
4566 booktitle = {Proceedings of the 24th International Conference on Machine Learning (ICML'07)}, 4744 booktitle = {Proceedings of the 24th International Conference on Machine Learning (ICML'07)},
4567 year = {-1}, 4745 year = {2007},
4568 publisher = {ACM} 4746 publisher = {ACM}
4569 } 4747 }
4570 4748
4571 @TECHREPORT{DIRO, 4749 @TECHREPORT{DIRO,
4572 title = {DIRO}, 4750 title = {DIRO},
4690 title = {ICML'99}, 4868 title = {ICML'99},
4691 booktitle = {ICML'99}, 4869 booktitle = {ICML'99},
4692 year = {-1}, 4870 year = {-1},
4693 publisher = {Morgan Kaufmann} 4871 publisher = {Morgan Kaufmann}
4694 } 4872 }
4873 @ARTICLE{JMLR-shorter,
4874 journal = {JMLR},
4875 year = {-1}
4876 }
4695 @INPROCEEDINGS{NIPS1-shorter, 4877 @INPROCEEDINGS{NIPS1-shorter,
4696 title = {NIPS'88}, 4878 title = {NIPS'88},
4697 booktitle = {NIPS 1}, 4879 booktitle = {NIPS 1},
4698 year = {-1}, 4880 year = {-1},
4699 publisher = {Morgan Kaufmann} 4881 publisher = {Morgan Kaufmann}
4824 @INPROCEEDINGS{xAISTATS2009-shorter, 5006 @INPROCEEDINGS{xAISTATS2009-shorter,
4825 title = {AISTATS'2009}, 5007 title = {AISTATS'2009},
4826 booktitle = {AISTATS'2009}, 5008 booktitle = {AISTATS'2009},
4827 year = {-1} 5009 year = {-1}
4828 } 5010 }
4829
4830