# HG changeset patch # User gdesjardins # Date 1283378351 14400 # Node ID 3977ecd49431679145177b2fb16f4ee9b1ebc793 # Parent d19e3cb809c194bf31d3ea4dcf54aa5e6ea4df69# Parent f82093bf4405713f974548173ac7ed54eb7b8f00 merge diff -r d19e3cb809c1 -r 3977ecd49431 doc/v2_planning.txt --- a/doc/v2_planning.txt Wed Sep 01 17:58:43 2010 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,241 +0,0 @@ - -Motivation -========== - -Yoshua: -------- - -We are missing a *Theano Machine Learning library*. - -The deep learning tutorials do a good job but they lack the following features, which I would like to see in a ML library: - - - a well-organized collection of Theano symbolic expressions (formulas) for handling most of - what is needed either in implementing existing well-known ML and deep learning algorithms or - for creating new variants (without having to start from scratch each time), that is the - mathematical core, - - - a well-organized collection of python modules to help with the following: - - several data-access models that wrap around learning algorithms for interfacing with various types of data (static vectors, images, sound, video, generic time-series, etc.) - - generic utility code for optimization - - stochastic gradient descent variants - - early stopping variants - - interfacing to generic 2nd order optimization methods - - 2nd order methods tailored to work on minibatches - - optimizers for sparse coefficients / parameters - - generic code for model selection and hyper-parameter optimization (including the use and coordination of multiple jobs running on different machines, e.g. using jobman) - - generic code for performance estimation and experimental statistics - - visualization tools (using existing python libraries) and examples for all of the above - - learning algorithm conventions and meta-learning algorithms (bagging, boosting, mixtures of experts, etc.) which use them - - [Note that many of us already use some instance of all the above, but each one tends to reinvent the wheel and newbies don't benefit from a knowledge base.] - - - a well-documented set of python scripts using the above library to show how to run the most - common ML algorithms (possibly with examples showing how to run multiple experiments with - many different models and collect statistical comparative results). This is particularly - important for pure users to adopt Theano in the ML application work. - -Ideally, there would be one person in charge of this project, making sure a coherent and -easy-to-read design is developed, along with many helping hands (to implement the various -helper modules, formulae, and learning algorithms). - - -James: -------- - -I am interested in the design and implementation of the "well-organized collection of Theano -symbolic expressions..." - -I would like to explore algorithms for hyper-parameter optimization, following up on some -"high-throughput" work. I'm most interested in the "generic code for model selection and -hyper-parameter optimization..." and "generic code for performance estimation...". - -I have some experiences with the data-access requirements, and some lessons I'd like to share -on that, but no time to work on that aspect of things. - -I will continue to contribute to the "well-documented set of python scripts using the above to -showcase common ML algorithms...". I have an Olshausen&Field-style sparse coding script that -could be polished up. I am also implementing the mcRBM and I'll be able to add that when it's -done. 
- - - -Suggestions for how to tackle various desiderata -================================================ - - -Theano Symbolic Expressions for ML ----------------------------------- - -We could make this a submodule of pylearn: ``pylearn.nnet``. - -Yoshua: I would use a different name, e.g., "pylearn.formulas" to emphasize that it is not just -about neural nets, and that this is a collection of formulas (expressions), rather than -completely self-contained classes for learners. We could have a "nnet.py" file for -neural nets, though. - -There are a number of ideas floating around for how to handle classes / -modules (LeDeepNet, pylearn.shared.layers, pynnet, DeepAnn) so lets implement as much -math as possible in global functions with no classes. There are no models in -the wish list that require than a few vectors and matrices to parametrize. -Global functions are more reusable than classes. - - -Data access ------------ - -A general interface to datasets from the perspective of an experiment driver -(e.g. kfold) is to see them as a function that maps index (typically integer) -to example (whose type and nature depends on the dataset, it could for -instance be an (image, label) pair). This interface permits iterating over -the dataset, shuffling the dataset, and splitting it into folds. For -efficiency, it is nice if the dataset interface supports looking up several -index values at once, because looking up many examples at once can sometimes -be faster than looking each one up in turn. In particular, looking up -a consecutive block of indices, or a slice, should be well supported. - -Some datasets may not support random access (e.g. a random number stream) and -that's fine if an exception is raised. The user will see a NotImplementedError -or similar, and try something else. We might want to have a way to test -that a dataset is random-access or not without having to load an example. - - -A more intuitive interface for many datasets (or subsets) is to load them as -matrices or lists of examples. This format is more convenient to work with at -an ipython shell, for example. It is not good to provide only the "dataset -as a function" view of a dataset. Even if a dataset is very large, it is nice -to have a standard way to get some representative examples in a convenient -structure, to be able to play with them in ipython. - - -Another thing to consider related to datasets is that there are a number of -other efforts to have standard ML datasets, and we should be aware of them, -and compatible with them when it's easy: - - mldata.org (they have a file format, not sure how many use it) - - weka (ARFF file format) - - scikits.learn - - hdf5 / pytables - - -pylearn.datasets uses a DATA_ROOT environment variable to locate a filesystem -folder that is assumed to have a standard form across different installations. -That's where the data files are. The correct format of this folder is currently -defined implicitly by the contents of /data/lisa/data at DIRO, but it would be -better to document in pylearn what the contents of this folder should be as -much as possible. It should be possible to rebuild this tree from information -found in pylearn. - -Yoshua (about ideas proposed by Pascal Vincent a while ago): - - - we may want to distinguish between datasets and tasks: a task defines - not just the data but also things like what is the input and what is the - target (for supervised learning), and *importantly* a set of performance metrics - that make sense for this task (e.g. 
those used by papers solving a particular - task, or reported for a particular benchmark) - - - we should discuss about a few "standards" that datasets and tasks may comply to, such as - - "input" and "target" fields inside each example, for supervised or semi-supervised learning tasks - (with a convention for the semi-supervised case when only the input or only the target is observed) - - "input" for unsupervised learning - - conventions for missing-valued components inside input or target - - how examples that are sequences are treated (e.g. the input or the target is a sequence) - - how time-stamps are specified when appropriate (e.g., the sequences are asynchronous) - - how error metrics are specified - * example-level statistics (e.g. classification error) - * dataset-level statistics (e.g. ROC curve, mean and standard error of error) - - -Model Selection & Hyper-Parameter Optimization ----------------------------------------------- - -Driving a distributed computing job for a long time to optimize -hyper-parameters using one or more clusters is the goal here. -Although there might be some library-type code to write here, I think of this -more as an application template. The user would use python code to describe -the experiment to run and the hyper-parameter space to search. Then this -application-driver would take control of scheduling jobs and running them on -various computers... I'm imagining a potentially ugly brute of a hack that's -not necessarily something we will want to expose at a low-level for reuse. - -Yoshua: We want both the library-defined driver that takes instructions about how to generate -new hyper-parameter combinations (e.g. implicitly providing a prior distribution from which -to sample them), and examples showing how to use it in typical cases. -Note that sometimes we just want to find the best configuration of hyper-parameters, -but sometimes we want to do more subtle analysis. Often a combination of both. -In this respect it could be useful for the user to define hyper-parameters over -which scientific questions are sought (e.g. depth of an architecture) vs -hyper-parameters that we would like to marginalize/maximize over (e.g. learning rate). -This can influence both the sampling of configurations (we want to make sure that all -combinations of question-driving hyper-parameters are covered) and the analysis -of results (we may be willing to estimate ANOVAs or averaging or quantiles over -the non-question-driving hyper-parameters). - -Python scripts for common ML algorithms ---------------------------------------- - -The script aspect of this feature request makes me think that what would be -good here is more tutorial-type scripts. And the existing tutorials could -potentially be rewritten to use some of the pylearn.nnet expressions. More -tutorials / demos would be great. - -Yoshua: agreed that we could write them as tutorials, but note how the -spirit would be different from the current deep learning tutorials: we would -not mind using library code as much as possible instead of trying to flatten -out everything in the interest of pedagogical simplicity. Instead, these -tutorials should be meant to illustrate not the algorithms but *how to take -advantage of the library*. They could also be used as *BLACK BOX* implementations -by people who don't want to dig lower and just want to run experiments. - -Functional Specifications -========================= - -TODO: -Put these into different text files so that this one does not become a monster. 
-For each thing with a functional spec (e.g. datasets library, optimization library) make a -separate file. - - - -pylearn.formulas ----------------- - -Directory with functions for building layers, calculating classification -errors, cross-entropies with various distributions, free energies, etc. This -module would include for the most part global functions, Theano Ops and Theano -optimizations. - -Yoshua: I would break it down in module files, e.g.: - -pylearn.formulas.costs: generic / common cost functions, e.g. various cross-entropies, squared error, -abs. error, various sparsity penalties (L1, Student) - -pylearn.formulas.linear: formulas for linear classifier, linear regression, factor analysis, PCA - -pylearn.formulas.nnet: formulas for building layers of various kinds, various activation functions, -layers which could be plugged with various costs & penalties, and stacked - -pylearn.formulas.ae: formulas for auto-encoders and denoising auto-encoder variants - -pylearn.formulas.noise: formulas for corruption processes - -pylearn.formulas.rbm: energies, free energies, conditional distributions, Gibbs sampling - -pylearn.formulas.trees: formulas for decision trees - -pylearn.formulas.boosting: formulas for boosting variants - -etc. - -Fred: It seam that the DeepANN git repository by Xavier G. have part of this as function. - -Indexing Convention -~~~~~~~~~~~~~~~~~~~ - -Something to decide on - Fortran-style or C-style indexing. Although we have -often used c-style indexing in the past (for efficiency in c!) this is no -longer an issue with numpy because the physical layout is independent of the -indexing order. The fact remains that Fortran-style indexing follows linear -algebra conventions, while c-style indexing does not. If a global function -includes a lot of math derivations, it would be *really* nice if the code used -the same convention for the orientation of matrices, and endlessly annoying to -have to be always transposing everything. - diff -r d19e3cb809c1 -r 3977ecd49431 doc/v2_planning/dataset.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/doc/v2_planning/dataset.txt Wed Sep 01 17:59:11 2010 -0400 @@ -0,0 +1,3 @@ +Discussion of Function Specification for Dataset Types +====================================================== + diff -r d19e3cb809c1 -r 3977ecd49431 doc/v2_planning/learner.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/doc/v2_planning/learner.txt Wed Sep 01 17:59:11 2010 -0400 @@ -0,0 +1,78 @@ + +Discussion of Function Specification for Learner Types +====================================================== + +In its most abstract form, a learner is an object with the +following semantics: + +* A learner has named hyper-parameters that control how it learns (these can be viewed +as options of the constructor, or might be set directly by a user) + +* A learner also has an internal state that depends on what it has learned. + +* A learner reads and produces data, so the definition of learner is +intimately linked to the definition of dataset (and task). + +* A learner has one or more 'train' or 'adapt' functions by which +it is given a sample of data (typically either the whole training set, or +a mini-batch, which contains as a special case a single 'example'). Learners +interface with datasets in order to obtain data. These functions cause the +learner to change its internal state and take advantage to some extent +of the data provided. 
The 'train' function should take charge of +completely exploiting the dataset, as specified per the hyper-parameters, +so that it would typically be called only once. An 'adapt' function +is meant for learners that can operate in an 'online' setting where +data continually arrive and the control loop (when to stop) is to +be managed outside of it. For most intents and purposes, the +'train' function could also handle the 'online' case by providing +the controlled iterations over the dataset (which would then be +seen as a stream of examples). + * learner.train(dataset) + * learner.adapt(data) + +* Different types of learners can then exploit their internal state +in order to perform various computations after training is completed, +or in the middle of training, e.g., + + * y=learner.predict(x) + for learners that see (x,y) pairs during training and predict y given x, + or for learners that see only x's and learn a transformation of it (i.e. feature extraction). + Here and below, x and y are tensor-like objects whose first index iterates + over particular examples in a batch or minibatch of examples. + + * p=learner.probability(examples) + p=learner.log_probability(examples) + for learners that can estimate probability density or probability functions, + note that example could be a pair (x,y) for learners that expect each example + to represent such a pair. The second form is provided in case the example + is high-dimensional and computations in the log-domain are numerically preferable. + The first dimension of examples or of x and y is an index over a minibatch or a dataset. + + * p=learner.free_energy(x) + for learners that can estimate a log unnormalized probability; the output has the same length as the input. + + * c=learner.costs(examples) + returns a matrix of costs (one row per example, i.e., again the output has the same length + as the input), the first column of which represents the cost whose expectation + we wish to minimize over new samples from the unknown underlying data distribution. + + +Some learners may be able to handle x's and y's that contain missing values. + +* For convenience, some of these operations could be bundled, e.g. + + * [prediction,costs] = learner.predict_and_adapt((x,y)) + +* Some learners could include in their internal state not only what they +have learned but some information about recently seen examples that conditions +the expected distribution of upcoming examples. In that case, they might +be used, e.g. in an online setting as follows: + for (x,y) in data_stream: + [prediction,costs]=learner.predict((x,y)) + accumulate_statistics(prediction,costs) + +* In some cases, each example is itself a (possibly variable-size) sequence +or other variable-size object (e.g. an image, or a video) + + + diff -r d19e3cb809c1 -r 3977ecd49431 doc/v2_planning/main_plan.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/doc/v2_planning/main_plan.txt Wed Sep 01 17:59:11 2010 -0400 @@ -0,0 +1,241 @@ + +Motivation +========== + +Yoshua: +------- + +We are missing a *Theano Machine Learning library*. 
+ +The deep learning tutorials do a good job but they lack the following features, which I would like to see in a ML library: + + - a well-organized collection of Theano symbolic expressions (formulas) for handling most of + what is needed either in implementing existing well-known ML and deep learning algorithms or + for creating new variants (without having to start from scratch each time), that is the + mathematical core, + + - a well-organized collection of python modules to help with the following: + - several data-access models that wrap around learning algorithms for interfacing with various types of data (static vectors, images, sound, video, generic time-series, etc.) + - generic utility code for optimization + - stochastic gradient descent variants + - early stopping variants + - interfacing to generic 2nd order optimization methods + - 2nd order methods tailored to work on minibatches + - optimizers for sparse coefficients / parameters + - generic code for model selection and hyper-parameter optimization (including the use and coordination of multiple jobs running on different machines, e.g. using jobman) + - generic code for performance estimation and experimental statistics + - visualization tools (using existing python libraries) and examples for all of the above + - learning algorithm conventions and meta-learning algorithms (bagging, boosting, mixtures of experts, etc.) which use them + + [Note that many of us already use some instance of all the above, but each one tends to reinvent the wheel and newbies don't benefit from a knowledge base.] + + - a well-documented set of python scripts using the above library to show how to run the most + common ML algorithms (possibly with examples showing how to run multiple experiments with + many different models and collect statistical comparative results). This is particularly + important for pure users to adopt Theano in the ML application work. + +Ideally, there would be one person in charge of this project, making sure a coherent and +easy-to-read design is developed, along with many helping hands (to implement the various +helper modules, formulae, and learning algorithms). + + +James: +------- + +I am interested in the design and implementation of the "well-organized collection of Theano +symbolic expressions..." + +I would like to explore algorithms for hyper-parameter optimization, following up on some +"high-throughput" work. I'm most interested in the "generic code for model selection and +hyper-parameter optimization..." and "generic code for performance estimation...". + +I have some experiences with the data-access requirements, and some lessons I'd like to share +on that, but no time to work on that aspect of things. + +I will continue to contribute to the "well-documented set of python scripts using the above to +showcase common ML algorithms...". I have an Olshausen&Field-style sparse coding script that +could be polished up. I am also implementing the mcRBM and I'll be able to add that when it's +done. + + + +Suggestions for how to tackle various desiderata +================================================ + + +Theano Symbolic Expressions for ML +---------------------------------- + +We could make this a submodule of pylearn: ``pylearn.nnet``. + +Yoshua: I would use a different name, e.g., "pylearn.formulas" to emphasize that it is not just +about neural nets, and that this is a collection of formulas (expressions), rather than +completely self-contained classes for learners. 
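For illustration only (a hypothetical sketch, not an agreed-upon API), a "formula" in this
sense would simply be a global function mapping Theano variables to Theano variables, owning
no parameters or state of its own, e.g.:

    from theano import tensor as TT

    def binary_crossentropy(output, target):
        # mean binary cross-entropy between symbolic `output` and `target`;
        # both arguments are Theano variables, so any learner could reuse this
        return -TT.mean(target * TT.log(output)
                        + (1. - target) * TT.log(1. - output))
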
We could have a "nnet.py" file for +neural nets, though. + +There are a number of ideas floating around for how to handle classes / +modules (LeDeepNet, pylearn.shared.layers, pynnet, DeepAnn) so lets implement as much +math as possible in global functions with no classes. There are no models in +the wish list that require than a few vectors and matrices to parametrize. +Global functions are more reusable than classes. + + +Data access +----------- + +A general interface to datasets from the perspective of an experiment driver +(e.g. kfold) is to see them as a function that maps index (typically integer) +to example (whose type and nature depends on the dataset, it could for +instance be an (image, label) pair). This interface permits iterating over +the dataset, shuffling the dataset, and splitting it into folds. For +efficiency, it is nice if the dataset interface supports looking up several +index values at once, because looking up many examples at once can sometimes +be faster than looking each one up in turn. In particular, looking up +a consecutive block of indices, or a slice, should be well supported. + +Some datasets may not support random access (e.g. a random number stream) and +that's fine if an exception is raised. The user will see a NotImplementedError +or similar, and try something else. We might want to have a way to test +that a dataset is random-access or not without having to load an example. + + +A more intuitive interface for many datasets (or subsets) is to load them as +matrices or lists of examples. This format is more convenient to work with at +an ipython shell, for example. It is not good to provide only the "dataset +as a function" view of a dataset. Even if a dataset is very large, it is nice +to have a standard way to get some representative examples in a convenient +structure, to be able to play with them in ipython. + + +Another thing to consider related to datasets is that there are a number of +other efforts to have standard ML datasets, and we should be aware of them, +and compatible with them when it's easy: + - mldata.org (they have a file format, not sure how many use it) + - weka (ARFF file format) + - scikits.learn + - hdf5 / pytables + + +pylearn.datasets uses a DATA_ROOT environment variable to locate a filesystem +folder that is assumed to have a standard form across different installations. +That's where the data files are. The correct format of this folder is currently +defined implicitly by the contents of /data/lisa/data at DIRO, but it would be +better to document in pylearn what the contents of this folder should be as +much as possible. It should be possible to rebuild this tree from information +found in pylearn. + +Yoshua (about ideas proposed by Pascal Vincent a while ago): + + - we may want to distinguish between datasets and tasks: a task defines + not just the data but also things like what is the input and what is the + target (for supervised learning), and *importantly* a set of performance metrics + that make sense for this task (e.g. 
those used by papers solving a particular + task, or reported for a particular benchmark) + + - we should discuss about a few "standards" that datasets and tasks may comply to, such as + - "input" and "target" fields inside each example, for supervised or semi-supervised learning tasks + (with a convention for the semi-supervised case when only the input or only the target is observed) + - "input" for unsupervised learning + - conventions for missing-valued components inside input or target + - how examples that are sequences are treated (e.g. the input or the target is a sequence) + - how time-stamps are specified when appropriate (e.g., the sequences are asynchronous) + - how error metrics are specified + * example-level statistics (e.g. classification error) + * dataset-level statistics (e.g. ROC curve, mean and standard error of error) + + +Model Selection & Hyper-Parameter Optimization +---------------------------------------------- + +Driving a distributed computing job for a long time to optimize +hyper-parameters using one or more clusters is the goal here. +Although there might be some library-type code to write here, I think of this +more as an application template. The user would use python code to describe +the experiment to run and the hyper-parameter space to search. Then this +application-driver would take control of scheduling jobs and running them on +various computers... I'm imagining a potentially ugly brute of a hack that's +not necessarily something we will want to expose at a low-level for reuse. + +Yoshua: We want both the library-defined driver that takes instructions about how to generate +new hyper-parameter combinations (e.g. implicitly providing a prior distribution from which +to sample them), and examples showing how to use it in typical cases. +Note that sometimes we just want to find the best configuration of hyper-parameters, +but sometimes we want to do more subtle analysis. Often a combination of both. +In this respect it could be useful for the user to define hyper-parameters over +which scientific questions are sought (e.g. depth of an architecture) vs +hyper-parameters that we would like to marginalize/maximize over (e.g. learning rate). +This can influence both the sampling of configurations (we want to make sure that all +combinations of question-driving hyper-parameters are covered) and the analysis +of results (we may be willing to estimate ANOVAs or averaging or quantiles over +the non-question-driving hyper-parameters). + +Python scripts for common ML algorithms +--------------------------------------- + +The script aspect of this feature request makes me think that what would be +good here is more tutorial-type scripts. And the existing tutorials could +potentially be rewritten to use some of the pylearn.nnet expressions. More +tutorials / demos would be great. + +Yoshua: agreed that we could write them as tutorials, but note how the +spirit would be different from the current deep learning tutorials: we would +not mind using library code as much as possible instead of trying to flatten +out everything in the interest of pedagogical simplicity. Instead, these +tutorials should be meant to illustrate not the algorithms but *how to take +advantage of the library*. They could also be used as *BLACK BOX* implementations +by people who don't want to dig lower and just want to run experiments. + +Functional Specifications +========================= + +TODO: +Put these into different text files so that this one does not become a monster. 
+For each thing with a functional spec (e.g. datasets library, optimization library) make a +separate file. + + + +pylearn.formulas +---------------- + +Directory with functions for building layers, calculating classification +errors, cross-entropies with various distributions, free energies, etc. This +module would include for the most part global functions, Theano Ops and Theano +optimizations. + +Yoshua: I would break it down in module files, e.g.: + +pylearn.formulas.costs: generic / common cost functions, e.g. various cross-entropies, squared error, +abs. error, various sparsity penalties (L1, Student) + +pylearn.formulas.linear: formulas for linear classifier, linear regression, factor analysis, PCA + +pylearn.formulas.nnet: formulas for building layers of various kinds, various activation functions, +layers which could be plugged with various costs & penalties, and stacked + +pylearn.formulas.ae: formulas for auto-encoders and denoising auto-encoder variants + +pylearn.formulas.noise: formulas for corruption processes + +pylearn.formulas.rbm: energies, free energies, conditional distributions, Gibbs sampling + +pylearn.formulas.trees: formulas for decision trees + +pylearn.formulas.boosting: formulas for boosting variants + +etc. + +Fred: It seam that the DeepANN git repository by Xavier G. have part of this as function. + +Indexing Convention +~~~~~~~~~~~~~~~~~~~ + +Something to decide on - Fortran-style or C-style indexing. Although we have +often used c-style indexing in the past (for efficiency in c!) this is no +longer an issue with numpy because the physical layout is independent of the +indexing order. The fact remains that Fortran-style indexing follows linear +algebra conventions, while c-style indexing does not. If a global function +includes a lot of math derivations, it would be *really* nice if the code used +the same convention for the orientation of matrices, and endlessly annoying to +have to be always transposing everything. + diff -r d19e3cb809c1 -r 3977ecd49431 pylearn/algorithms/mcRBM.py --- a/pylearn/algorithms/mcRBM.py Wed Sep 01 17:58:43 2010 -0400 +++ b/pylearn/algorithms/mcRBM.py Wed Sep 01 17:59:11 2010 -0400 @@ -5,7 +5,10 @@ Modeling pixel means and covariances using factored third-order Boltzmann machines. IEEE Conference on Computer Vision and Pattern Recognition. -and performs one of the experiments on CIFAR-10 discussed in that paper. +and performs one of the experiments on CIFAR-10 discussed in that paper. There are some minor +discrepancies between the paper and the accompanying code (train_mcRBM.py), and the +accompanying code has been taken to be correct in those cases because I couldn't get things to +work otherwise. Math @@ -24,25 +27,71 @@ -Full Energy of mean and Covariance RBM, with +Version in paper +---------------- + +Full Energy of the Mean and Covariance RBM, with :math:`h_k = h_k^{(c)}`, :math:`g_j = h_j^{(m)}`, :math:`b_k = b_k^{(c)}`, :math:`c_j = b_j^{(m)}`, :math:`U_{if} = C_{if}`, -: + E (v, h, g) = + - 0.5 \sum_f \sum_k P_{fk} h_k ( \sum_i (U_{if} v_i) / |U_{.f}|*|v| )^2 + - \sum_k b_k h_k + + 0.5 \sum_i v_i^2 + - \sum_j \sum_i W_{ij} g_j v_i + - \sum_j c_j g_j + +For the energy function to correspond to a probability distribution, P must be non-positive. P +is initialized to be a diagonal, and in our experience it can be left as such because even in +the paper it has a very low learning rate, and is only allowed to be updated after the filters +in U are learned (in effect). 
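For concreteness, a direct numpy sketch of this paper-version energy for a batch of visible
row-vectors could look like the following. It is illustrative only: `paper_energy` is a
hypothetical helper, it assumes per-example hidden configurations `h` (N x K) and `g` (N x J),
and it is not the energy function that the code in this file actually implements.

    import numpy as np

    def paper_energy(v, h, g, U, W, P, b, c):
        # v: (N, I) visibles; U: (I, F); P: (F, K); W: (I, J); b: (K,); c: (J,)
        Un = U / np.sqrt((U ** 2).sum(axis=0))            # unit-norm filter columns
        vn = v / np.sqrt((v ** 2).sum(axis=1))[:, None]   # unit-norm visible rows
        cos2 = np.dot(vn, Un) ** 2                        # squared cosines, shape (N, F)
        e_cov = -0.5 * (np.dot(cos2, P) * h).sum(axis=1) - np.dot(h, b)
        e_mean = -(np.dot(v, W) * g).sum(axis=1) - np.dot(g, c)
        return e_cov + e_mean + 0.5 * (v ** 2).sum(axis=1)
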
+ +Version in published train_mcRBM code +------------------------------------- + +The train_mcRBM file implements learning in a similar but technically different Energy function: E (v, h, g) = - - 0.5 \sum_f \sum_k P_{fk} h_k ( \sum_i U_{if} v_i )^2 / |U_{*f}|^2 |v|^2 + - 0.5 \sum_f \sum_k P_{fk} h_k (\sum_i U_{if} v_i / sqrt(\sum_i v_i^2/I + 0.5))^2 - \sum_k b_k h_k + 0.5 \sum_i v_i^2 - \sum_j \sum_i W_{ij} g_j v_i - \sum_j c_j g_j -For the energy function to correspond to a probability distribution, P must be non-positive. +There are two differences with respect to the paper: + + - 'v' is not normalized by its length, but rather it is normalized to have length close to + the square root of the number of its components. The variable called 'small' that + "avoids division by zero" is orders larger than machine precision, and is on the order of + the normalized sum-of-squares, so I've included it in the Energy function. + + - 'U' is also not normalized by its length. U is initialized to have columns that are + shorter than unit-length (approximately 0.2 with the 105 principle components in the + train_mcRBM data). During training, the columns of U are constrained manually to have + equal lengths (see the use of normVF), but Euclidean norm is allowed to change. During + learning it quickly converges towards 1 and then exceeds 1. It does not seem like this + column-wise normalization of U is justified by maximum-likelihood, I have no intuition + for why it is used. +Version in this code +-------------------- + +This file implements the same algorithm as the train_mcRBM code, except that the P matrix is +omitted for clarity, and replaced analytically with a negative identity matrix. + + E (v, h, g) = + + 0.5 \sum_k h_k (\sum_i U_{ik} v_i / sqrt(\sum_i v_i^2/I + 0.5))^2 + - \sum_k b_k h_k + + 0.5 \sum_i v_i^2 + - \sum_j \sum_i W_{ij} g_j v_i + - \sum_j c_j g_j + + + Conventions in this file ======================== @@ -58,16 +107,20 @@ K covariance variables. The mcRBM is parametrized by 5 variables: - - `P`, a matrix (probably sparse) of pooling (F x K) - `U`, a matrix whose rows are visible covariance directions (I x F) - `W`, a matrix whose rows are visible mean directions (I x J) - `b`, a vector of hidden covariance biases (K) - `c`, a vector of hidden mean biases (J) -Matrices are generally layed out according to a C-order convention. +Matrices are generally layed out and accessed according to a C-order convention. """ +# +# WORKING NOTES +# THIS DERIVATION IS BASED ON THE ** PAPER ** ENERGY FUNCTION +# NOT THE ENERGY FUNCTION IN THE CODE!!! +# # Free energy is the marginal energy of visible units # Recall: # Q(x) = exp(-E(x))/Z ==> -log(Q(x)) - log(Z) = E(x) @@ -137,92 +190,162 @@ # + 0.5 \sum_i v_i^2 # - \sum_i a_i v_i -import sys -import logging +import sys, os, logging import numpy as np import numpy + +import theano from theano import function, shared, dot from theano import tensor as TT -import theano.sparse #installs the sparse shared var handler floatX = theano.config.floatX +import pylearn +#TODO: clean up the HMC_sampler code +#TODO: think of naming convention for acronyms + suffix? 
from pylearn.sampling.hmc import HMC_sampler from pylearn.io import image_tiling - -from sparse_coding import numpy_project_onto_ball +from pylearn.gd.sgd import sgd_updates +import pylearn.dataset_ops.image_patches -print >> sys.stderr, "mcRBM IS NOT READY YET" - +########################################### +# +# Candidates for factoring +# +########################################### -#TODO: This should be in the nnet part of the library -def sgd_updates(params, grads, lr): - try: - float(lr) - lr = [lr for p in params] - except TypeError: - pass - updates = [(p, p - plr * gp) for (plr, p, gp) in zip(lr, params, grads)] - return updates +#TODO: Document, move to pylearn's math lib +def l1(X): + return abs(X).sum() + +#TODO: Document, move to pylearn's math lib +def l2(X): + return TT.sqrt((X**2).sum()) + +#TODO: Document, move to pylearn's math lib +def contrastive_cost(free_energy_fn, pos_v, neg_v): + return (free_energy_fn(pos_v) - free_energy_fn(neg_v)).sum() -def as_shared(x, name=None, dtype=floatX): - if hasattr(x, 'type'): - return x - else: - if 'float' in str(x.dtype): - return shared(x.astype(floatX), name=name) - else: - return shared(x, name=name) +#TODO: Typical use of contrastive_cost is to later use tensor.grad, but in that case we want to +# block gradient going through neg_v +def contrastive_grad(free_energy_fn, pos_v, neg_v, params, other_cost=0): + """ + :param pos_v: positive-phase sample of visible units + :param neg_v: negative-phase sample of visible units + """ + #block the grad through neg_v + cost=contrastive_cost(free_energy_fn, pos_v, neg_v) + if other_cost: + cost = cost + other_cost + return theano.tensor.grad(cost, + wrt=params, + consider_constant=[neg_v]) -def hidden_cov_units_preactivation_given_v(rbm, v, small=1e-8): +########################################### +# +# Expressions that are mcRBM-specific +# +########################################### + +#TODO: make global function to initialize parameter + +def hidden_cov_units_preactivation_given_v(rbm, v, small=0.5): + """Return argument to the sigmoid that would give mean of covariance hid units + + See the math at the top of this file for what 'adjusted' means. + + return b - 0.5 * dot(adjusted(v), U)**2 + """ (U,W,a,b,c) = rbm - unit_v = v / (TT.sqrt(TT.sum(v**2, axis=1))+small).dimshuffle(0,'x') # unit rows - unit_U = U # assuming unit cols! - #unit_U = U / (TT.sqrt(TT.sum(U**2, axis=0))+small) #unit cols - return b - 0.5 * dot(unit_v, unit_U)**2 + unit_v = v / (TT.sqrt(TT.mean(v**2, axis=1)+small)).dimshuffle(0,'x') # adjust row norm + return b - 0.5 * dot(unit_v, U)**2 -def free_energy_given_v(rbm, v): - """Returns theano expression for free energy of visible vector `v` in an mcRBM - - An mcRBM is parametrized - by `U`, `W`, `b`, `c`. - See module - level documentation for explanations of the `U`, `W`, `b` and `c` parameters. +def free_energy_terms_given_v(rbm, v): + """Returns theano expression for the terms that are added to form the free energy of + visible vector `v` in an mcRBM. - - The free energy of v is what we need for learning and hybrid Monte-carlo negative-phase - sampling. - + 1. Free energy related to covariance hiddens + 2. Free energy related to mean hiddens + 3. Free energy related to L2-Norm of `v` + 4. 
Free energy related to projection of `v` onto biases `a` """ U, W, a, b, c = rbm - t0 = -TT.sum(TT.nnet.softplus(hidden_cov_units_preactivation_given_v(rbm, v)),axis=1) t1 = -TT.sum(TT.nnet.softplus(c + dot(v,W)), axis=1) t2 = 0.5 * TT.sum(v**2, axis=1) t3 = -TT.dot(v, a) - return t0 + t1 + t2 + t3, (t0, t1, t2, t3) + return [t0, t1, t2, t3] -def expected_h_g_given_v(P, U, W, b, c, v): - """Returns theano expression conditional expectations (`h`, `g`) in an mcRBM. - - An mcRBM is parametrized - by `U`, `W`, `b`, `c`. - See module - level documentation for explanations of the `U`, `W`, `b` and `c` parameters. - +def free_energy_given_v(rbm, v): + """Returns theano expression for free energy of visible vector `v` in an mcRBM + """ + return sum(free_energy_terms_given_v(rbm,v)) - The conditional E[h, g | v] is what we need to classify images. - """ - raise NotImplementedError() +def expected_h_g_given_v(rbm, v): + """Returns tuple (`h`, `g`) of theano expression conditional expectations in an mcRBM. - #TODO: check to see if these args should be negated? - - if P is None: - h = nnet.sigmoid(b + 0.5 * cosines(v,U)) - else: - h = nnet.sigmoid(b + 0.5 * dot(cosines(v,U), P)) + `h` is the conditional on the covariance units. + `g` is the conditional on the mean units. + + """ + (U, W, a, b, c) = rbm + h = TT.nnet.sigmoid(hidden_cov_units_preactivation_given_v(rbm, v)) g = nnet.sigmoid(c + dot(v,W)) return (h, g) +def n_visible_units(rbm): + """Return the number of visible units of this RBM + + For an RBM made from shared variables, this will return an integer, + for a purely symbolic RBM this will return a theano expression. + + """ + W = rbm[1] + try: + return W.value.shape[0] + except AttributeError: + return W.shape[0] + +def sampler(rbm, n_particles, n_visible=None, rng=7823748): + """Return an `HMC_sampler` that will draw samples from the distribution over visible + units specified by this RBM. + + :param n_particles: this many parallel chains will be simulated. + :param rng: seed or numpy RandomState object to initialize particles, and to drive the simulation. + """ + if not hasattr(rng, 'randn'): + rng = np.random.RandomState(rng) + if n_visible is None: + n_visible = n_visible_units(rbm) + rval = HMC_sampler( + positions = [shared( + rng.randn( + n_particles, + n_visible).astype(floatX), + name='particles')], + energy_fn = lambda p : free_energy_given_v(rbm, p[0]), + seed=int(rng.randint(2**30))) + return rval + +############################# +# +# Convenient data container +# +############################# + class MeanCovRBM(object): - """Container for mcRBM parameters that gives more convenient access to mcRBM methods. + """Container for mcRBM parameters + + It provides parameter lookup by name, as well as a heuristic for initializing the + parameters for effective learning. 
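    A hypothetical usage sketch (dimensions are arbitrary and purely illustrative; `v_batch`
    is assumed to be a symbolic matrix supplied by the caller):

        rbm = MeanCovRBM.new_from_dims(n_I=105, n_K=256, n_J=100)
        U, W, a, b, c = rbm                     # the container unpacks like a tuple
        fe = free_energy_given_v(rbm, v_batch)  # v_batch: symbolic (n_examples, n_I) matrix
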
+ + Attributes: + + - U - the covariance filters (theano shared variable) + - W - the mean filters (theano shared variable) + - a - the visible bias (theano shared variable) + - b - the covariance bias (theano shared variable) + - c - the mean bias (theano shared variable) + """ params = property(lambda s: [s.U, s.W, s.a, s.b, s.c]) @@ -230,51 +353,40 @@ n_visible = property(lambda s: s.W.value.shape[0]) def __init__(self, U, W, a, b, c): - self.U = as_shared(U, 'U') - self.W = as_shared(W, 'W') - self.a = as_shared(a, 'a') - self.b = as_shared(b, 'b') - self.c = as_shared(c, 'c') + self.__dict__.update(locals()) + del self.__dict__['self'] - assert self.b.type.dtype == 'float32' + def __getitem__(self, idx): + # support unpacking of this container as if it were a tuple + return self.params[idx] @classmethod - def new_from_dims(cls, - n_I, # input dimensionality - n_K, # number of covariance hidden units - n_F, # number of covariance filters (squared) - n_J, # number of mean filters (linear) - seed = 8923402190, - ): + def new_from_dims(cls, n_I, n_K, n_J, rng = 8923402190): """ Return a MeanCovRBM instance with randomly-initialized parameters. + + :param n_I: input dimensionality + :param n_K: number of covariance hidden units + :param n_J: number of mean filters (linear) + :param rng: seed or numpy RandomState object to initialize params """ - - - if 0: - if P_init == 'diag': - if n_K != n_F: - raise ValueError('cannot use diagonal initialization of non-square P matrix') - import scipy.sparse - P = -scipy.sparse.identity(n_K).tocsr() - else: - raise NotImplementedError() + if not hasattr(rng, 'randn'): + rng = np.random.RandomState(rng) - rng = np.random.RandomState(seed) + def shrd(X,name): + return shared(X.astype(floatX), name=name) - # initialization taken from Marc'Aurelio - + # initialization taken from train_mcRBM.py return cls( - #U = numpy_project_onto_ball(rng.randn(n_I, n_F).T).T, - U = 0.2 * rng.randn(n_I, n_F), - W = rng.randn(n_I, n_J)/np.sqrt((n_I+n_J)/2), - a = np.ones(n_I)*(-2), - b = np.ones(n_K)*2, - c = np.zeros(n_J),) + U = shrd(0.02 * rng.randn(n_I, n_K),'U'), + W = shrd(0.05 * rng.randn(n_I, n_J),'W'), + a = shrd(np.ones(n_I)*(0),'a'), + b = shrd(np.ones(n_K)*2,'b'), + c = shrd(np.ones(n_J)*(-2),'c')) def __getstate__(self): - # unpack shared containers, which may have references to Theano stuff - # and are not a long-term stable data type. + # unpack shared containers, which may have references to Theano stuff and are not a + # long-term stable data type. 
return dict( U = self.U.value, W = self.W.value, @@ -282,526 +394,17 @@ c = self.c.value) def __setstate__(self, dct): - self.__init__(**dct) # calls as_shared on pickled arrays - - def hmc_sampler(self, n_particles=100, seed=7823748): - return HMC_sampler( - positions = [as_shared( - np.random.RandomState(seed^20893).randn( - n_particles, - self.n_visible ))], - energy_fn = lambda p : self.free_energy_given_v(p[0]), - seed=seed) - - def free_energy_given_v(self, v, extra=False): - rval = free_energy_given_v(self.params, v) - if extra: - return rval - else: - return rval[0] - - def contrastive_gradient(self, pos_v, neg_v, U_l1_penalty=0, W_l1_penalty=0): - """Return a list of gradient expressions for self.params - - :param pos_v: positive-phase sample of visible units - :param neg_v: negative-phase sample of visible units - """ - pos_FE = self.free_energy_given_v(pos_v) - neg_FE = self.free_energy_given_v(neg_v) - - gpos_FE = theano.tensor.grad(pos_FE.sum(), self.params) - gneg_FE = theano.tensor.grad(neg_FE.sum(), self.params) - rval = [ gp - gn for (gp,gn) in zip(gpos_FE, gneg_FE)] - rval[0] = rval[0] - TT.sign(self.U)*U_l1_penalty - rval[1] = rval[1] - TT.sign(self.W)*W_l1_penalty - return rval - -from pylearn.dataset_ops.protocol import TensorFnDataset -from pylearn.dataset_ops.memo import memo -import scipy.io -@memo -def load_mcRBM_demo_patches(): - d = scipy.io.loadmat('/u/bergstrj/cvs/articles/2010/spike_slab_RBM/src/marcaurelio/training_colorpatches_16x16_demo.mat') - totnumcases = d["whitendata"].shape[0] - #d = d["whitendata"][0:np.floor(totnumcases/batch_size)*batch_size,:].copy() - d = d["whitendata"].copy() - return d - - -if __name__ == '__main__': - - print >> sys.stderr, "TODO: use P matrix (aka FH matrix)" - - dataset='MAR' - if dataset == 'MAR': - R,C= 21,5 - n_patches=10240 - demodata = scipy.io.loadmat('/u/bergstrj/cvs/articles/2010/spike_slab_RBM/src/marcaurelio/training_colorpatches_16x16_demo.mat') - else: - R,C= 16,16 # the size of image patches - n_patches=100000 - - n_train_iters=30000 - - n_burnin_steps=10000 - - l1_penalty=1e-3 - no_l1_epochs = 10 - effective_l1_penalty=0.0 - - epoch_size=50000 - batchsize = 128 - lr = 0.075 / batchsize - s_lr = TT.scalar() - s_l1_penalty=TT.scalar() - n_K=256 - n_F=256 - n_J=100 - - rbm = MeanCovRBM.new_from_dims(n_I=R*C, - n_K=n_K, - n_J=n_J, - n_F=n_F, - ) - - sampler = rbm.hmc_sampler(n_particles=batchsize) - - def l2(X): - return (X**2).sum() - def tile(X, fname): - if dataset == 'MAR': - X = np.dot(X, demodata['invpcatransf'].T) - R=16 - C=16 - #X = X.reshape((X.shape[0], 3, 16, 16)).transpose([0,2,3,1]).copy() - X = (X[:,:256], X[:,256:512], X[:,512:], None) - _img = image_tiling.tile_raster_images(X, - img_shape=(R,C), - min_dynamic_range=1e-2) - image_tiling.save_tiled_raster_images(_img, fname) - #print "Burning in..." - #for burnin in xrange(n_burnin_steps): - #sampler.simulate() - - if 0: - print "Just SAMPLING..." - for jj in xrange(n_burnin_steps): - if 0 == jj % 100: - tile(sampler.positions[0].value, "sampler_%06i.png"%jj) - tile(numpy.random.randn(100, 105), "random_%06i.png"%jj) - print "burning in... 
", jj - sys.stdout.flush() - sampler.simulate() - - sys.exit() - - batch_idx = TT.iscalar() - - if 0: - from pylearn.dataset_ops import image_patches - train_batch = image_patches.image_patches( - s_idx = (batch_idx * batchsize + np.arange(batchsize)), - dims = (n_patches,R,C), - center=True, - unitvar=True, - dtype=floatX, - rasterized=True) - else: - op = TensorFnDataset(floatX, - bcast=(False,), - fn=load_mcRBM_demo_patches, - single_shape=(105,)) - train_batch = op((batch_idx * batchsize + np.arange(batchsize))%n_patches) - - imgs_fn = function([batch_idx], outputs=train_batch) - - grads = rbm.contrastive_gradient( - pos_v=train_batch, - neg_v=sampler.positions[0], - U_l1_penalty=s_l1_penalty, - W_l1_penalty=s_l1_penalty) - - learn_fn = function([batch_idx, s_lr, s_l1_penalty], - outputs=[ - grads[0].norm(2), - rbm.free_energy_given_v(train_batch).sum(), - rbm.free_energy_given_v(train_batch,extra=1)[1][0].sum(), - rbm.free_energy_given_v(train_batch,extra=1)[1][1].sum(), - rbm.free_energy_given_v(train_batch,extra=1)[1][2].sum(), - rbm.free_energy_given_v(train_batch,extra=1)[1][3].sum(), - ], - updates = sgd_updates( - rbm.params, - grads, - lr=[2*s_lr, .2*s_lr, .02*s_lr, .1*s_lr, .02*s_lr ])) - theano.printing.pydotprint(learn_fn, 'learn_fn.png') - - print "Learning..." - normVF=1 - for jj in xrange(n_train_iters): - - print_jj = ((1 and jj < 100) - or (0 and jj < 100 and 0==jj%10) - or (jj < 1000 and 0==jj%100) - or (1 and jj < 10000 and 0==jj%1000)) - - - if print_jj: - tile(imgs_fn(jj), "imgs_%06i.png"%jj) - tile(sampler.positions[0].value, "sample_%06i.png"%jj) - tile(rbm.U.value.T, "U_%06i.png"%jj) - tile(rbm.W.value.T, "W_%06i.png"%jj) - - print 'saving samples', jj, 'epoch', jj/(epoch_size/batchsize), - print 'l2(U)', l2(rbm.U.value), - print 'l2(W)', l2(rbm.W.value), - print 'U min max', rbm.U.value.min(), rbm.U.value.max(), - print 'W min max', rbm.W.value.min(), rbm.W.value.max(), - print 'a min max', rbm.a.value.min(), rbm.a.value.max(), - print 'b min max', rbm.b.value.min(), rbm.b.value.max(), - print 'c min max', rbm.c.value.min(), rbm.c.value.max(), - - print 'parts min', sampler.positions[0].value.min(), - print 'max',sampler.positions[0].value.max(), - print 'HMC step', sampler.stepsize, - print 'arate', sampler.avg_acceptance_rate - - sampler.simulate() - - l2_of_Ugrad = learn_fn(jj, - lr/max(1, jj/(20*epoch_size/batchsize)), - effective_l1_penalty) - - if print_jj: - print 'l2(gU)', float(l2_of_Ugrad[0]), - print 'FE+', float(l2_of_Ugrad[1]), - print 'FE+[0]', float(l2_of_Ugrad[2]), - print 'FE+[1]', float(l2_of_Ugrad[3]), - print 'FE+[2]', float(l2_of_Ugrad[4]), - print 'FE+[3]', float(l2_of_Ugrad[5]), - - if jj == no_l1_epochs * epoch_size/batchsize: - print "Activating L1 weight decay" - effective_l1_penalty = 1e-3 - - if 0: - rbm.U.value = numpy_project_onto_ball(rbm.U.value.T).T - else: - # weird normalization technique... - # It constrains all the columns of the matrix to have the same length - # But the matrix itself is re-scaled to have an arbitrary abslute size. 
- U = rbm.U.value - U_norms = np.sqrt((U*U).sum(axis=0)) - assert len(U_norms) == n_F - normVF = .95 * normVF + .05 * np.mean(U_norms) - rbm.U.value = rbm.U.value * normVF/U_norms + d = dict(dct) + for key in ['U', 'W', 'a', 'b', 'c']: + d[key] = shared(d[key], name=key) + self.__init__(**d) -# -# -# Marc'Aurelio Ranzato's code -# -###################################################################### -# compute the value of the free energy at a given input -# F = - sum log(1+exp(- .5 FH (VF data/norm(data))^2 + bias_cov)) +... -# - sum log(1+exp(w_mean data + bias_mean)) + ... -# - bias_vis data + 0.5 data^2 -# NOTE: FH is constrained to be positive -# (in the paper the sign is negative but the sign in front of it is also flipped) -def compute_energy_mcRBM(data,normdata,vel,energy,VF,FH,bias_cov,bias_vis,w_mean,bias_mean,t1,t2,t6,feat,featsq,feat_mean,length,lengthsq,normcoeff,small,num_vis): - # normalize input data vectors - data.mult(data, target = t6) # DxP (nr input dims x nr samples) - t6.sum(axis = 0, target = lengthsq) # 1xP - lengthsq.mult(0.5, target = energy) # energy of quadratic regularization term - lengthsq.mult(1./num_vis) # normalize by number of components (like std) - - lengthsq.add(small) # small prevents division by 0 - # energy_j = \sum_i 0.5 data_ij ^2 - # lengthsq_j = 1/ (\sum_i data_ij ^2 + small) - cmt.sqrt(lengthsq, target = length) - # length_j = sqrt(lengthsq_j) - length.reciprocal(target = normcoeff) # 1xP - # normcoef_j = 1/sqrt(lengthsq_j) - data.mult_by_row(normcoeff, target = normdata) # normalized data - # normdata is like data, but cols have unit L2 norm - - ## potential - # covariance contribution - cmt.dot(VF.T, normdata, target = feat) # HxP (nr factors x nr samples) - feat.mult(feat, target = featsq) # HxP - - # featsq is the squared cosines (VF with data) - cmt.dot(FH.T,featsq, target = t1) # OxP (nr cov hiddens x nr samples) - t1.mult(-0.5) - t1.add_col_vec(bias_cov) # OxP - cmt.exp(t1) # OxP - t1.add(1, target = t2) # OxP - cmt.log(t2) - t2.mult(-1) - energy.add_sums(t2, axis=0) - # mean contribution - cmt.dot(w_mean.T, data, target = feat_mean) # HxP (nr mean hiddens x nr samples) - feat_mean.add_col_vec(bias_mean) # HxP - cmt.exp(feat_mean) - feat_mean.add(1) - cmt.log(feat_mean) - feat_mean.mult(-1) - energy.add_sums(feat_mean, axis=0) - # visible bias term - data.mult_by_col(bias_vis, target = t6) - t6.mult(-1) # DxP - energy.add_sums(t6, axis=0) # 1xP - # kinetic - vel.mult(vel, target = t6) - energy.add_sums(t6, axis = 0, mult = .5) +#TODO: put the normalization of U as a global function -###################################################### -# mcRBM trainer: sweeps over the training set. -# For each batch of samples compute derivatives to update the parameters -# at the training samples and at the negative samples drawn calling HMC sampler. 
-def train_mcRBM(): - - config = ConfigParser() - config.read('input_configuration') - - verbose = config.getint('VERBOSITY','verbose') - - num_epochs = config.getint('MAIN_PARAMETER_SETTING','num_epochs') - batch_size = config.getint('MAIN_PARAMETER_SETTING','batch_size') - startFH = config.getint('MAIN_PARAMETER_SETTING','startFH') - startwd = config.getint('MAIN_PARAMETER_SETTING','startwd') - doPCD = config.getint('MAIN_PARAMETER_SETTING','doPCD') - - # model parameters - num_fac = config.getint('MODEL_PARAMETER_SETTING','num_fac') - num_hid_cov = config.getint('MODEL_PARAMETER_SETTING','num_hid_cov') - num_hid_mean = config.getint('MODEL_PARAMETER_SETTING','num_hid_mean') - apply_mask = config.getint('MODEL_PARAMETER_SETTING','apply_mask') - - # load data - data_file_name = config.get('DATA','data_file_name') - d = loadmat(data_file_name) # input in the format PxD (P vectorized samples with D dimensions) - totnumcases = d["whitendata"].shape[0] - d = d["whitendata"][0:floor(totnumcases/batch_size)*batch_size,:].copy() - totnumcases = d.shape[0] - num_vis = d.shape[1] - num_batches = int(totnumcases/batch_size) - dev_dat = cmt.CUDAMatrix(d.T) # VxP - - # training parameters - epsilon = config.getfloat('OPTIMIZER_PARAMETERS','epsilon') - epsilonVF = 2*epsilon - epsilonFH = 0.02*epsilon - epsilonb = 0.02*epsilon - epsilonw_mean = 0.2*epsilon - epsilonb_mean = 0.1*epsilon - weightcost_final = config.getfloat('OPTIMIZER_PARAMETERS','weightcost_final') - - # HMC setting - hmc_step_nr = config.getint('HMC_PARAMETERS','hmc_step_nr') - hmc_step = 0.01 - hmc_target_ave_rej = config.getfloat('HMC_PARAMETERS','hmc_target_ave_rej') - hmc_ave_rej = hmc_target_ave_rej - - # initialize weights - VF = cmt.CUDAMatrix(np.array(0.02 * np.random.randn(num_vis, num_fac), dtype=np.float32, order='F')) # VxH - if apply_mask == 0: - FH = cmt.CUDAMatrix( np.array( np.eye(num_fac,num_hid_cov), dtype=np.float32, order='F') ) # HxO - else: - dd = loadmat('your_FHinit_mask_file.mat') # see CVPR2010paper_material/topo2D_3x3_stride2_576filt.mat for an example - FH = cmt.CUDAMatrix( np.array( dd["FH"], dtype=np.float32, order='F') ) - bias_cov = cmt.CUDAMatrix( np.array(2.0*np.ones((num_hid_cov, 1)), dtype=np.float32, order='F') ) - bias_vis = cmt.CUDAMatrix( np.array(np.zeros((num_vis, 1)), dtype=np.float32, order='F') ) - w_mean = cmt.CUDAMatrix( np.array( 0.05 * np.random.randn(num_vis, num_hid_mean), dtype=np.float32, order='F') ) # VxH - bias_mean = cmt.CUDAMatrix( np.array( -2.0*np.ones((num_hid_mean,1)), dtype=np.float32, order='F') ) - - # initialize variables to store derivatives - VFinc = cmt.CUDAMatrix( np.array(np.zeros((num_vis, num_fac)), dtype=np.float32, order='F')) - FHinc = cmt.CUDAMatrix( np.array(np.zeros((num_fac, num_hid_cov)), dtype=np.float32, order='F')) - bias_covinc = cmt.CUDAMatrix( np.array(np.zeros((num_hid_cov, 1)), dtype=np.float32, order='F')) - bias_visinc = cmt.CUDAMatrix( np.array(np.zeros((num_vis, 1)), dtype=np.float32, order='F')) - w_meaninc = cmt.CUDAMatrix( np.array(np.zeros((num_vis, num_hid_mean)), dtype=np.float32, order='F')) - bias_meaninc = cmt.CUDAMatrix( np.array(np.zeros((num_hid_mean, 1)), dtype=np.float32, order='F')) - # initialize temporary storage - data = cmt.CUDAMatrix( np.array(np.empty((num_vis, batch_size)), dtype=np.float32, order='F')) # VxP - normdata = cmt.CUDAMatrix( np.array(np.empty((num_vis, batch_size)), dtype=np.float32, order='F')) # VxP - negdataini = cmt.CUDAMatrix( np.array(np.empty((num_vis, batch_size)), dtype=np.float32, order='F')) # VxP - 
feat = cmt.CUDAMatrix( np.array(np.empty((num_fac, batch_size)), dtype=np.float32, order='F')) - featsq = cmt.CUDAMatrix( np.array(np.empty((num_fac, batch_size)), dtype=np.float32, order='F')) - negdata = cmt.CUDAMatrix( np.array(np.random.randn(num_vis, batch_size), dtype=np.float32, order='F')) - old_energy = cmt.CUDAMatrix( np.array(np.zeros((1, batch_size)), dtype=np.float32, order='F')) - new_energy = cmt.CUDAMatrix( np.array(np.zeros((1, batch_size)), dtype=np.float32, order='F')) - gradient = cmt.CUDAMatrix( np.array(np.empty((num_vis, batch_size)), dtype=np.float32, order='F')) # VxP - normgradient = cmt.CUDAMatrix( np.array(np.empty((num_vis, batch_size)), dtype=np.float32, order='F')) # VxP - thresh = cmt.CUDAMatrix( np.array(np.zeros((1, batch_size)), dtype=np.float32, order='F')) - feat_mean = cmt.CUDAMatrix( np.array(np.empty((num_hid_mean, batch_size)), dtype=np.float32, order='F')) - vel = cmt.CUDAMatrix( np.array(np.random.randn(num_vis, batch_size), dtype=np.float32, order='F')) - length = cmt.CUDAMatrix( np.array(np.zeros((1, batch_size)), dtype=np.float32, order='F')) # 1xP - lengthsq = cmt.CUDAMatrix( np.array(np.zeros((1, batch_size)), dtype=np.float32, order='F')) # 1xP - normcoeff = cmt.CUDAMatrix( np.array(np.zeros((1, batch_size)), dtype=np.float32, order='F')) # 1xP - if apply_mask==1: # this used to constrain very large FH matrices only allowing to change values in a neighborhood - dd = loadmat('your_FHinit_mask_file.mat') - mask = cmt.CUDAMatrix( np.array(dd["mask"], dtype=np.float32, order='F')) - normVF = 1 - small = 0.5 - - # other temporary vars - t1 = cmt.CUDAMatrix( np.array(np.empty((num_hid_cov, batch_size)), dtype=np.float32, order='F')) - t2 = cmt.CUDAMatrix( np.array(np.empty((num_hid_cov, batch_size)), dtype=np.float32, order='F')) - t3 = cmt.CUDAMatrix( np.array(np.empty((num_fac, batch_size)), dtype=np.float32, order='F')) - t4 = cmt.CUDAMatrix( np.array(np.empty((1,batch_size)), dtype=np.float32, order='F')) - t5 = cmt.CUDAMatrix( np.array(np.empty((1,1)), dtype=np.float32, order='F')) - t6 = cmt.CUDAMatrix( np.array(np.empty((num_vis, batch_size)), dtype=np.float32, order='F')) - t7 = cmt.CUDAMatrix( np.array(np.empty((num_vis, batch_size)), dtype=np.float32, order='F')) - t8 = cmt.CUDAMatrix( np.array(np.empty((num_vis, num_fac)), dtype=np.float32, order='F')) - t9 = cmt.CUDAMatrix( np.array(np.zeros((num_fac, num_hid_cov)), dtype=np.float32, order='F')) - t10 = cmt.CUDAMatrix( np.array(np.empty((1,num_fac)), dtype=np.float32, order='F')) - t11 = cmt.CUDAMatrix( np.array(np.empty((1,num_hid_cov)), dtype=np.float32, order='F')) - - # start training - for epoch in range(num_epochs): - - print "Epoch " + str(epoch + 1) - - # anneal learning rates - epsilonVFc = epsilonVF/max(1,epoch/20) - epsilonFHc = epsilonFH/max(1,epoch/20) - epsilonbc = epsilonb/max(1,epoch/20) - epsilonw_meanc = epsilonw_mean/max(1,epoch/20) - epsilonb_meanc = epsilonb_mean/max(1,epoch/20) - weightcost = weightcost_final - - if epoch <= startFH: - epsilonFHc = 0 - if epoch <= startwd: - weightcost = 0 - - for batch in range(num_batches): - - # get current minibatch - data = dev_dat.slice(batch*batch_size,(batch + 1)*batch_size) # DxP (nr dims x nr samples) +#TODO: put the learning loop as a global function or class, so that someone could load and *TRAIN* an mcRBM!!! 
- # normalize input data - data.mult(data, target = t6) # DxP - t6.sum(axis = 0, target = lengthsq) # 1xP - lengthsq.mult(1./num_vis) # normalize by number of components (like std) - lengthsq.add(small) # small avoids division by 0 - cmt.sqrt(lengthsq, target = length) - length.reciprocal(target = normcoeff) # 1xP - data.mult_by_row(normcoeff, target = normdata) # normalized data - ## compute positive sample derivatives - # covariance part - cmt.dot(VF.T, normdata, target = feat) # HxP (nr facs x nr samples) - feat.mult(feat, target = featsq) # HxP - cmt.dot(FH.T,featsq, target = t1) # OxP (nr cov hiddens x nr samples) - t1.mult(-0.5) - t1.add_col_vec(bias_cov) # OxP - t1.apply_sigmoid(target = t2) # OxP - cmt.dot(featsq, t2.T, target = FHinc) # HxO - cmt.dot(FH,t2, target = t3) # HxP - t3.mult(feat) - cmt.dot(normdata, t3.T, target = VFinc) # VxH - t2.sum(axis = 1, target = bias_covinc) - bias_covinc.mult(-1) - # visible bias - data.sum(axis = 1, target = bias_visinc) - bias_visinc.mult(-1) - # mean part - cmt.dot(w_mean.T, data, target = feat_mean) # HxP (nr mean hiddens x nr samples) - feat_mean.add_col_vec(bias_mean) # HxP - feat_mean.apply_sigmoid() # HxP - feat_mean.mult(-1) - cmt.dot(data, feat_mean.T, target = w_meaninc) - feat_mean.sum(axis = 1, target = bias_meaninc) - - # HMC sampling: draw an approximate sample from the model - if doPCD == 0: # CD-1 (set negative data to current training samples) - hmc_step, hmc_ave_rej = draw_HMC_samples(data,negdata,normdata,vel,gradient,normgradient,new_energy,old_energy,VF,FH,bias_cov,bias_vis,w_mean,bias_mean,hmc_step,hmc_step_nr,hmc_ave_rej,hmc_target_ave_rej,t1,t2,t3,t4,t5,t6,t7,thresh,feat,featsq,batch_size,feat_mean,length,lengthsq,normcoeff,small,num_vis) - else: # PCD-1 (use previous negative data as starting point for chain) - negdataini.assign(negdata) - hmc_step, hmc_ave_rej = draw_HMC_samples(negdataini,negdata,normdata,vel,gradient,normgradient,new_energy,old_energy,VF,FH,bias_cov,bias_vis,w_mean,bias_mean,hmc_step,hmc_step_nr,hmc_ave_rej,hmc_target_ave_rej,t1,t2,t3,t4,t5,t6,t7,thresh,feat,featsq,batch_size,feat_mean,length,lengthsq,normcoeff,small,num_vis) - - # compute derivatives at the negative samples - # normalize input data - negdata.mult(negdata, target = t6) # DxP - t6.sum(axis = 0, target = lengthsq) # 1xP - lengthsq.mult(1./num_vis) # normalize by number of components (like std) - lengthsq.add(small) - cmt.sqrt(lengthsq, target = length) - length.reciprocal(target = normcoeff) # 1xP - negdata.mult_by_row(normcoeff, target = normdata) # normalized data - # covariance part - cmt.dot(VF.T, normdata, target = feat) # HxP - feat.mult(feat, target = featsq) # HxP - cmt.dot(FH.T,featsq, target = t1) # OxP - t1.mult(-0.5) - t1.add_col_vec(bias_cov) # OxP - t1.apply_sigmoid(target = t2) # OxP - FHinc.subtract_dot(featsq, t2.T) # HxO - FHinc.mult(0.5) - cmt.dot(FH,t2, target = t3) # HxP - t3.mult(feat) - VFinc.subtract_dot(normdata, t3.T) # VxH - bias_covinc.add_sums(t2, axis = 1) - # visible bias - bias_visinc.add_sums(negdata, axis = 1) - # mean part - cmt.dot(w_mean.T, negdata, target = feat_mean) # HxP - feat_mean.add_col_vec(bias_mean) # HxP - feat_mean.apply_sigmoid() # HxP - w_meaninc.add_dot(negdata, feat_mean.T) - bias_meaninc.add_sums(feat_mean, axis = 1) - - # update parameters - VFinc.add_mult(VF.sign(), weightcost) # L1 regularization - VF.add_mult(VFinc, -epsilonVFc/batch_size) - # normalize columns of VF: normalize by running average of their norm - VF.mult(VF, target = t8) - t8.sum(axis = 0, target = t10) - 
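# The surrounding lines renormalize the columns of VF after the gradient step.
# In NumPy terms (a reference sketch, assuming VF were copied back to the host):
#
#   col_norms = np.sqrt((VF ** 2).sum(axis=0))                     # L2 norm per column
#   normVF    = .95 * normVF + (.05 / num_fac) * col_norms.sum()   # running average of the mean norm
#   VF        = (VF / col_norms) * normVF                          # equal-length columns, common scale
#
# The same trick reappears in numpy form at the end of test_mcRBM.py below.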
cmt.sqrt(t10) - t10.sum(axis=1,target = t5) - t5.copy_to_host() - normVF = .95*normVF + (.05/num_fac) * t5.numpy_array[0,0] # estimate norm - t10.reciprocal() - VF.mult_by_row(t10) - VF.mult(normVF) - bias_cov.add_mult(bias_covinc, -epsilonbc/batch_size) - bias_vis.add_mult(bias_visinc, -epsilonbc/batch_size) - - if epoch > startFH: - FHinc.add_mult(FH.sign(), weightcost) # L1 regularization - FH.add_mult(FHinc, -epsilonFHc/batch_size) # update - # set to 0 negative entries in FH - FH.greater_than(0, target = t9) - FH.mult(t9) - if apply_mask==1: - FH.mult(mask) - # normalize columns of FH: L1 norm set to 1 in each column - FH.sum(axis = 0, target = t11) - t11.reciprocal() - FH.mult_by_row(t11) - w_meaninc.add_mult(w_mean.sign(),weightcost) - w_mean.add_mult(w_meaninc, -epsilonw_meanc/batch_size) - bias_mean.add_mult(bias_meaninc, -epsilonb_meanc/batch_size) - - if verbose == 1: - print "VF: " + '%3.2e' % VF.euclid_norm() + ", DVF: " + '%3.2e' % (VFinc.euclid_norm()*(epsilonVFc/batch_size)) + ", FH: " + '%3.2e' % FH.euclid_norm() + ", DFH: " + '%3.2e' % (FHinc.euclid_norm()*(epsilonFHc/batch_size)) + ", bias_cov: " + '%3.2e' % bias_cov.euclid_norm() + ", Dbias_cov: " + '%3.2e' % (bias_covinc.euclid_norm()*(epsilonbc/batch_size)) + ", bias_vis: " + '%3.2e' % bias_vis.euclid_norm() + ", Dbias_vis: " + '%3.2e' % (bias_visinc.euclid_norm()*(epsilonbc/batch_size)) + ", wm: " + '%3.2e' % w_mean.euclid_norm() + ", Dwm: " + '%3.2e' % (w_meaninc.euclid_norm()*(epsilonw_meanc/batch_size)) + ", bm: " + '%3.2e' % bias_mean.euclid_norm() + ", Dbm: " + '%3.2e' % (bias_meaninc.euclid_norm()*(epsilonb_meanc/batch_size)) + ", step: " + '%3.2e' % hmc_step + ", rej: " + '%3.2e' % hmc_ave_rej - sys.stdout.flush() - # back-up every once in a while - if np.mod(epoch,10) == 0: - VF.copy_to_host() - FH.copy_to_host() - bias_cov.copy_to_host() - w_mean.copy_to_host() - bias_mean.copy_to_host() - bias_vis.copy_to_host() - savemat("ws_temp", {'VF':VF.numpy_array,'FH':FH.numpy_array,'bias_cov': bias_cov.numpy_array, 'bias_vis': bias_vis.numpy_array,'w_mean': w_mean.numpy_array, 'bias_mean': bias_mean.numpy_array, 'epoch':epoch}) - # final back-up - VF.copy_to_host() - FH.copy_to_host() - bias_cov.copy_to_host() - bias_vis.copy_to_host() - w_mean.copy_to_host() - bias_mean.copy_to_host() - savemat("ws_fac" + str(num_fac) + "_cov" + str(num_hid_cov) + "_mean" + str(num_hid_mean), {'VF':VF.numpy_array,'FH':FH.numpy_array,'bias_cov': bias_cov.numpy_array, 'bias_vis': bias_vis.numpy_array, 'w_mean': w_mean.numpy_array, 'bias_mean': bias_mean.numpy_array, 'epoch':epoch}) +if __name__ == '__main__': + import pylearn.algorithms.tests.test_mcRBM + pylearn.algorithms.tests.test_mcRBM.test_reproduce_ranzato_hinton_2010(as_unittest=True) diff -r d19e3cb809c1 -r 3977ecd49431 pylearn/algorithms/tests/__init__.py diff -r d19e3cb809c1 -r 3977ecd49431 pylearn/algorithms/tests/test_mcRBM.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/algorithms/tests/test_mcRBM.py Wed Sep 01 17:59:11 2010 -0400 @@ -0,0 +1,169 @@ + + +from pylearn.algorithms.mcRBM import * + + +def test_reproduce_ranzato_hinton_2010(dataset='MAR', as_unittest=True): + dataset='MAR' + if dataset == 'MAR': + n_vis=105 + n_patches=10240 + else: + R,C= 16,16 # the size of image patches + n_vis=R*C + n_patches=100000 + + n_train_iters=5000 + + n_burnin_steps=10000 + + l1_penalty=1e-3 + no_l1_epochs = 10 + effective_l1_penalty=0.0 + + epoch_size=n_patches + batchsize = 128 + lr = 0.075 / batchsize + s_lr = TT.scalar() + s_l1_penalty=TT.scalar() + n_K=256 + 
n_J=100 + + rbm = MeanCovRBM.new_from_dims(n_I=n_vis, n_K=n_K, n_J=n_J) + + smplr = sampler(rbm, n_particles=batchsize) + + def l2(X): + return numpy.sqrt((X**2).sum()) + if dataset == 'MAR': + tile = pylearn.dataset_ops.image_patches.save_filters_of_ranzato_hinton_2010 + else: + def tile(X, fname): + _img = image_tiling.tile_raster_images(X, + img_shape=(R,C), + min_dynamic_range=1e-2) + image_tiling.save_tiled_raster_images(_img, fname) + + batch_idx = TT.iscalar() + + if dataset == 'MAR': + train_batch = pylearn.dataset_ops.image_patches.ranzato_hinton_2010_op(batch_idx * batchsize + np.arange(batchsize)) + else: + train_batch = pylearn.dataset_ops.image_patches.image_patches( + s_idx = (batch_idx * batchsize + np.arange(batchsize)), + dims = (n_patches,R,C), + center=True, + unitvar=True, + dtype=floatX, + rasterized=True) + + if not as_unittest: + imgs_fn = function([batch_idx], outputs=train_batch) + + grads = contrastive_grad( + free_energy_fn=lambda v: free_energy_given_v(rbm, v), + pos_v=train_batch, + neg_v=smplr.positions[0], + params=list(rbm), + other_cost=(l1(rbm.U)+l1(rbm.W)) * s_l1_penalty) + sgd_ups = sgd_updates( + rbm.params, + grads, + stepsizes=[2*s_lr, .2*s_lr, .02*s_lr, .1*s_lr, .02*s_lr ]) + learn_fn = function([batch_idx, s_lr, s_l1_penalty], + outputs=[ + grads[0].norm(2), + (sgd_ups[0][1] - sgd_ups[0][0]).norm(2), + (sgd_ups[1][1] - sgd_ups[1][0]).norm(2), + ], + updates = sgd_ups) + + print "Learning..." + normVF=1 + last_epoch = -1 + for jj in xrange(n_train_iters): + epoch = jj*batchsize / epoch_size + + print_jj = epoch != last_epoch + last_epoch = epoch + + if epoch > 10: + break + + if as_unittest and epoch == 5: + U = rbm.U.value + W = rbm.W.value + def allclose(a,b): + return numpy.allclose(a,b,rtol=1.01,atol=1e-3) + print "" + print "--------------" + print "assert allclose(l2(U), %f)"%l2(U) + print "assert allclose(l2(W), %f)"%l2(W) + print "assert allclose(U.min(), %f)"%U.min() + print "assert allclose(U.max(), %f)"%U.max() + print "assert allclose(W.min(),%f)"%W.min() + print "assert allclose(W.max(), %f)"%W.max() + print "--------------" + + assert allclose(l2(U), 21.351664) + assert allclose(l2(W), 6.275828) + assert allclose(U.min(), -1.176703) + assert allclose(U.max(), 0.859802) + assert allclose(W.min(),-0.223128) + assert allclose(W.max(), 0.227558 ) + + break + + if print_jj: + if not as_unittest: + tile(imgs_fn(jj), "imgs_%06i.png"%jj) + tile(smplr.positions[0].value, "sample_%06i.png"%jj) + tile(rbm.U.value.T, "U_%06i.png"%jj) + tile(rbm.W.value.T, "W_%06i.png"%jj) + + print 'saving samples', jj, 'epoch', jj/(epoch_size/batchsize) + + print 'l2(U)', l2(rbm.U.value), + print 'l2(W)', l2(rbm.W.value) + + print 'U min max', rbm.U.value.min(), rbm.U.value.max(), + print 'W min max', rbm.W.value.min(), rbm.W.value.max(), + print 'a min max', rbm.a.value.min(), rbm.a.value.max(), + print 'b min max', rbm.b.value.min(), rbm.b.value.max(), + print 'c min max', rbm.c.value.min(), rbm.c.value.max() + + print 'parts min', smplr.positions[0].value.min(), + print 'max',smplr.positions[0].value.max(), + print 'HMC step', smplr.stepsize, + print 'arate', smplr.avg_acceptance_rate + + # Continue HMC chain + smplr.simulate() + + # Do CD update + l2_of_Ugrad = learn_fn(jj, + lr/max(1, jj/(20*epoch_size/batchsize)), + effective_l1_penalty) + + if print_jj: + print 'l2(U_grad)', float(l2_of_Ugrad[0]), + print 'l2(U_inc)', float(l2_of_Ugrad[1]), + print 'l2(W_inc)', float(l2_of_Ugrad[2]), + #print 'FE+', float(l2_of_Ugrad[2]), + #print 'FE+[0]', 
float(l2_of_Ugrad[3]), + #print 'FE+[1]', float(l2_of_Ugrad[4]), + #print 'FE+[2]', float(l2_of_Ugrad[5]), + #print 'FE+[3]', float(l2_of_Ugrad[6]) + + if jj == no_l1_epochs * epoch_size/batchsize: + print "Activating L1 weight decay" + effective_l1_penalty = 1e-3 + + # weird normalization technique... + # It constrains all the columns of the matrix to have the same length + # But the matrix itself is re-scaled to have an arbitrary abslute size. + U = rbm.U.value + U_norms = np.sqrt((U*U).sum(axis=0)) + assert len(U_norms) == n_K + normVF = .95 * normVF + .05 * np.mean(U_norms) + rbm.U.value = rbm.U.value * normVF/U_norms diff -r d19e3cb809c1 -r 3977ecd49431 pylearn/dataset_ops/glviewer.py --- a/pylearn/dataset_ops/glviewer.py Wed Sep 01 17:58:43 2010 -0400 +++ b/pylearn/dataset_ops/glviewer.py Wed Sep 01 17:59:11 2010 -0400 @@ -155,6 +155,7 @@ # dimensions global window + self.print_key_help() glutInit(sys.argv) # Select type of Display mode: @@ -366,6 +367,22 @@ elif args[0] == ESCAPE or args[0]=='q': sys.exit() + def print_key_help(self): + print "Program controls:" + print " q: quit" + print "" + print "Example controls:" + print " 0: reset to example 0" + print " j: next" + print " k: prev" + print "" + print "Frame controls (for movies)" + print " ): reset to frame 0" + print " J: forward" + print " K: backward" + print "Hint: Hold keys down for continuous play" + print "" + if __name__ == '__main__': diff -r d19e3cb809c1 -r 3977ecd49431 pylearn/dataset_ops/image_patches.py --- a/pylearn/dataset_ops/image_patches.py Wed Sep 01 17:58:43 2010 -0400 +++ b/pylearn/dataset_ops/image_patches.py Wed Sep 01 17:59:11 2010 -0400 @@ -2,12 +2,16 @@ import theano from pylearn.datasets.image_patches import ( + data_root, olshausen_field_1996_whitened_images, extract_random_patches) from .protocol import TensorFnDataset # protocol.py __init__.py from .memo import memo +import scipy.io +from pylearn.io import image_tiling + @memo def get_dataset(N,R,C,dtype,center,unitvar): seed=98234 @@ -48,3 +52,60 @@ return x + + +@memo +def ranzato_hinton_2010(path=None): + if path is None: + path = os.path.join(data_root(), 'image_patches', 'mcRBM', + 'training_colorpatches_16x16_demo.mat') + dct = scipy.io.loadmat(path) + return dct +def ranzato_hinton_2010_whitened_patches(path=None): + """Return the pca of the data, which is 10240 x 105 + """ + dct = ranzato_hinton_2010(path) + return dct['whitendata'].astype('float32') + +def undo_pca_filters_of_ranzato_hinton_2010(X, path=None): + """Return tuple (R,G,B,None) of matrices for matrix `X` of filters (one per row) + + Return value can be passed to `image_tiling.tile_raster_images`. 
+ """ + dct = ranzato_hinton_2010(path) + X = numpy.dot(X, dct['invpcatransf'].T) + return (X[:,:256], X[:,256:512], X[:,512:], None) + +def save_filters_of_ranzato_hinton_2010(X, fname, min_dynamic_range=1e-3, data_path=None): + _img = image_tiling.tile_raster_images( + undo_pca_filters_of_ranzato_hinton_2010(X, path=data_path), + img_shape=(16,16), + min_dynamic_range=min_dynamic_range) + image_tiling.save_tiled_raster_images(_img, fname) + +def ranzato_hinton_2010_op(s_idx, + split='train', + dtype=theano.config.floatX, rasterized=True, + center=True, + unitvar=True): + N = 10240 + + if split != 'train': + raise NotImplementedError('train/test/valid splits for randomly sampled image patches?') + + if not rasterized: + # the data is provided as PCA-sphered, so rasterizing does not make sense + # TODO: add a param to enable/disable 'PCA', and if disabled, then consider + # rasterizing or not + raise NotImplementedError('only pca data is provided') + + if dtype != 'float32': + raise NotImplementedError('dtype not float32') + + op = TensorFnDataset(dtype, + bcast=(False,), + fn=ranzato_hinton_2010_whitened_patches, + single_shape=(105,)) + x = op(s_idx%N) + return x + diff -r d19e3cb809c1 -r 3977ecd49431 pylearn/datasets/image_patches.py --- a/pylearn/datasets/image_patches.py Wed Sep 01 17:58:43 2010 -0400 +++ b/pylearn/datasets/image_patches.py Wed Sep 01 17:59:11 2010 -0400 @@ -111,3 +111,4 @@ return rval + diff -r d19e3cb809c1 -r 3977ecd49431 pylearn/gd/sgd.py --- a/pylearn/gd/sgd.py Wed Sep 01 17:58:43 2010 -0400 +++ b/pylearn/gd/sgd.py Wed Sep 01 17:59:11 2010 -0400 @@ -1,8 +1,27 @@ -"""A stochastic gradient descent minimizer. (Possibly the simplest minimizer.) +"""A stochastic gradient descent minimizer. """ import theano +def sgd_updates(params, grads, stepsizes): + """Return a list of (pairs) that can be used as updates in theano.function to implement + stochastic gradient descent. 
+ + :param params: variables to adjust in order to minimize some cost + :type params: a list of variables (theano.function will require shared variables) + :param grads: the gradient on each param (with respect to some cost) + :type grads: list of theano expressions + :param stepsizes: step by this amount times the negative gradient on each iteration + :type stepsizes: [symbolic] scalar or list of one [symbolic] scalar per param + """ + try: + iter(stepsizes) + except: + stepsizes = [stepsizes for p in params] + updates = [(p, p - step * gp) for (step, p, gp) in zip(stepsizes, params, grads)] + return updates + + class StochasticGradientDescent(theano.Module): """Fixed stepsize gradient descent diff -r d19e3cb809c1 -r 3977ecd49431 pylearn/io/image_tiling.py --- a/pylearn/io/image_tiling.py Wed Sep 01 17:58:43 2010 -0400 +++ b/pylearn/io/image_tiling.py Wed Sep 01 17:59:11 2010 -0400 @@ -87,9 +87,16 @@ for tile_col in xrange(tile_shape[1]): if tile_row * tile_shape[1] + tile_col < X.shape[0]: if scale_rows_to_unit_interval: - this_img = scale_to_unit_interval( - X[tile_row * tile_shape[1] + tile_col].reshape(img_shape), - eps=min_dynamic_range) + try: + this_img = scale_to_unit_interval( + X[tile_row * tile_shape[1] + tile_col].reshape(img_shape), + eps=min_dynamic_range) + except ValueError: + raise ValueError('Failed to reshape array of shape %s to shape %s' + % ( + X[tile_row*tile_shape[1] + tile_col].shape + , img_shape + )) else: this_img = X[tile_row * tile_shape[1] + tile_col].reshape(img_shape) out_array[ diff -r d19e3cb809c1 -r 3977ecd49431 pylearn/sandbox/train_mcRBM.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/sandbox/train_mcRBM.py Wed Sep 01 17:59:11 2010 -0400 @@ -0,0 +1,472 @@ +""" +This is a copy of mcRBM training that James modified to print out more information, visualize +filters, etc. Once mcRBM is stable, it can be deleted. +""" +# mcRBM training +# Refer to Ranzato and Hinton CVPR 2010 "Modeling Pixel Means and Covariances Using Factorized Third-Order BMs" +# +# Marc'Aurelio Ranzato +# 28 July 2010 + +import sys +import numpy as np +import cudamat as cmt +from scipy.io import loadmat, savemat +#import gpu_lock # put here you locking system package, if any +from ConfigParser import * + +demodata = None + +from pylearn.io import image_tiling +def tile(X, fname): + X = np.dot(X, demodata['invpcatransf'].T) + R=16 + C=16 + X = (X[:,:256], X[:,256:512], X[:,512:], None) + #X = (X[:,0::3], X[:,1::3], X[:,2::3], None) + _img = image_tiling.tile_raster_images(X, + img_shape=(R,C), + min_dynamic_range=1e-2) + image_tiling.save_tiled_raster_images(_img, fname) + +def save_imshow(X, fname): + image_tiling.Image.fromarray( + (image_tiling.scale_to_unit_interval(X)*255).astype('uint8'), + 'L').save(fname) + +###################################################################### +# compute the value of the free energy at a given input +# F = - sum log(1+exp(- .5 FH (VF data/norm(data))^2 + bias_cov)) +... +# - sum log(1+exp(w_mean data + bias_mean)) + ... 
+# - bias_vis data + 0.5 data^2 +# NOTE: FH is constrained to be positive +# (in the paper the sign is negative but the sign in front of it is also flipped) +def compute_energy_mcRBM(data,normdata,vel,energy,VF,FH,bias_cov,bias_vis,w_mean,bias_mean,t1,t2,t6,feat,featsq,feat_mean,length,lengthsq,normcoeff,small,num_vis): + # normalize input data vectors + data.mult(data, target = t6) # DxP (nr input dims x nr samples) + t6.sum(axis = 0, target = lengthsq) # 1xP + lengthsq.mult(0.5, target = energy) # energy of quadratic regularization term + lengthsq.mult(1./num_vis) # normalize by number of components (like std) + lengthsq.add(small) # small prevents division by 0 + cmt.sqrt(lengthsq, target = length) + length.reciprocal(target = normcoeff) # 1xP + data.mult_by_row(normcoeff, target = normdata) # normalized data + ## potential + # covariance contribution + cmt.dot(VF.T, normdata, target = feat) # HxP (nr factors x nr samples) + feat.mult(feat, target = featsq) # HxP + cmt.dot(FH.T,featsq, target = t1) # OxP (nr cov hiddens x nr samples) + t1.mult(-0.5) + t1.add_col_vec(bias_cov) # OxP + cmt.exp(t1) # OxP + t1.add(1, target = t2) # OxP + cmt.log(t2) + t2.mult(-1) + energy.add_sums(t2, axis=0) + # mean contribution + cmt.dot(w_mean.T, data, target = feat_mean) # HxP (nr mean hiddens x nr samples) + feat_mean.add_col_vec(bias_mean) # HxP + cmt.exp(feat_mean) + feat_mean.add(1) + cmt.log(feat_mean) + feat_mean.mult(-1) + energy.add_sums(feat_mean, axis=0) + # visible bias term + data.mult_by_col(bias_vis, target = t6) + t6.mult(-1) # DxP + energy.add_sums(t6, axis=0) # 1xP + # kinetic + vel.mult(vel, target = t6) + energy.add_sums(t6, axis = 0, mult = .5) + +################################################################# +# compute the derivative if the free energy at a given input +def compute_gradient_mcRBM(data,normdata,VF,FH,bias_cov,bias_vis,w_mean,bias_mean,t1,t2,t3,t4,t6,feat,featsq,feat_mean,gradient,normgradient,length,lengthsq,normcoeff,small,num_vis): + # normalize input data + data.mult(data, target = t6) # DxP + t6.sum(axis = 0, target = lengthsq) # 1xP + lengthsq.mult(1./num_vis) # normalize by number of components (like std) + lengthsq.add(small) + cmt.sqrt(lengthsq, target = length) + length.reciprocal(target = normcoeff) # 1xP + data.mult_by_row(normcoeff, target = normdata) # normalized data + cmt.dot(VF.T, normdata, target = feat) # HxP + feat.mult(feat, target = featsq) # HxP + cmt.dot(FH.T,featsq, target = t1) # OxP + t1.mult(-.5) + t1.add_col_vec(bias_cov) # OxP + t1.apply_sigmoid(target = t2) # OxP + cmt.dot(FH,t2, target = t3) # HxP + t3.mult(feat) + cmt.dot(VF, t3, target = normgradient) # VxP + # final bprop through normalization + length.mult(lengthsq, target = normcoeff) + normcoeff.reciprocal() # 1xP + normgradient.mult(data, target = gradient) # VxP + gradient.sum(axis = 0, target = t4) # 1xP + t4.mult(-1./num_vis) + data.mult_by_row(t4, target = gradient) + normgradient.mult_by_row(lengthsq, target = t6) + gradient.add(t6) + gradient.mult_by_row(normcoeff) + # add quadratic term gradient + gradient.add(data) + # add visible bias term + gradient.add_col_mult(bias_vis, -1) + # add MEAN contribution to gradient + cmt.dot(w_mean.T, data, target = feat_mean) # HxP + feat_mean.add_col_vec(bias_mean) # HxP + feat_mean.apply_sigmoid() # HxP + gradient.subtract_dot(w_mean,feat_mean) # VxP + +############################################################3 +# Hybrid Monte Carlo sampler +def 
draw_HMC_samples(data,negdata,normdata,vel,gradient,normgradient,new_energy,old_energy,VF,FH,bias_cov,bias_vis,w_mean,bias_mean,hmc_step,hmc_step_nr,hmc_ave_rej,hmc_target_ave_rej,t1,t2,t3,t4,t5,t6,t7,thresh,feat,featsq,batch_size,feat_mean,length,lengthsq,normcoeff,small,num_vis): + vel.fill_with_randn() + negdata.assign(data) + compute_energy_mcRBM(negdata,normdata,vel,old_energy,VF,FH,bias_cov,bias_vis,w_mean,bias_mean,t1,t2,t6,feat,featsq,feat_mean,length,lengthsq,normcoeff,small,num_vis) + compute_gradient_mcRBM(negdata,normdata,VF,FH,bias_cov,bias_vis,w_mean,bias_mean,t1,t2,t3,t4,t6,feat,featsq,feat_mean,gradient,normgradient,length,lengthsq,normcoeff,small,num_vis) + # half step + vel.add_mult(gradient, -0.5*hmc_step) + negdata.add_mult(vel,hmc_step) + # full leap-frog steps + for ss in range(hmc_step_nr - 1): + ## re-evaluate the gradient + compute_gradient_mcRBM(negdata,normdata,VF,FH,bias_cov,bias_vis,w_mean,bias_mean,t1,t2,t3,t4,t6,feat,featsq,feat_mean,gradient,normgradient,length,lengthsq,normcoeff,small,num_vis) + # update variables + vel.add_mult(gradient, -hmc_step) + negdata.add_mult(vel,hmc_step) + # final half-step + compute_gradient_mcRBM(negdata,normdata,VF,FH,bias_cov,bias_vis,w_mean,bias_mean,t1,t2,t3,t4,t6,feat,featsq,feat_mean,gradient,normgradient,length,lengthsq,normcoeff,small,num_vis) + vel.add_mult(gradient, -0.5*hmc_step) + # compute new energy + compute_energy_mcRBM(negdata,normdata,vel,new_energy,VF,FH,bias_cov,bias_vis,w_mean,bias_mean,t1,t2,t6,feat,featsq,feat_mean,length,lengthsq,normcoeff,small,num_vis) + # rejecton + old_energy.subtract(new_energy, target = thresh) + cmt.exp(thresh) + t4.fill_with_rand() + t4.less_than(thresh) + # update negdata and rejection rate + t4.mult(-1) + t4.add(1) # now 1's detect rejections + t4.sum(axis = 1, target = t5) + t5.copy_to_host() + rej = t5.numpy_array[0,0]/batch_size + data.mult_by_row(t4, target = t6) + negdata.mult_by_row(t4, target = t7) + negdata.subtract(t7) + negdata.add(t6) + hmc_ave_rej = 0.9*hmc_ave_rej + 0.1*rej + if hmc_ave_rej < hmc_target_ave_rej: + hmc_step = min(hmc_step*1.01,0.25) + else: + hmc_step = max(hmc_step*0.99,.001) + return hmc_step, hmc_ave_rej + + +###################################################### +# mcRBM trainer: sweeps over the training set. +# For each batch of samples compute derivatives to update the parameters +# at the training samples and at the negative samples drawn calling HMC sampler. 
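# In outline, train_mcRBM below does the following (summary only, no new behaviour):
#
#   for epoch in range(num_epochs):
#       anneal the learning rates; keep FH frozen until startFH and weight decay
#       off until startwd
#       for batch in range(num_batches):
#           1. normalize the minibatch and accumulate positive-phase statistics
#              (VF, FH, bias_cov for the covariance units; w_mean, bias_mean for
#              the mean units; bias_vis for the visibles)
#           2. draw negative samples with HMC (CD-1 if doPCD == 0, else PCD-1)
#           3. subtract the corresponding negative-phase statistics
#           4. SGD step with L1 weight decay; project FH to be non-negative with
#              unit-L1 columns; renormalize the columns of VF
#       every epoch, dump the weights with savemat and save filter images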
+def train_mcRBM(): + + config = ConfigParser() + config.read('input_configuration') + + verbose = config.getint('VERBOSITY','verbose') + + num_epochs = config.getint('MAIN_PARAMETER_SETTING','num_epochs') + batch_size = config.getint('MAIN_PARAMETER_SETTING','batch_size') + startFH = config.getint('MAIN_PARAMETER_SETTING','startFH') + startwd = config.getint('MAIN_PARAMETER_SETTING','startwd') + doPCD = config.getint('MAIN_PARAMETER_SETTING','doPCD') + + # model parameters + num_fac = config.getint('MODEL_PARAMETER_SETTING','num_fac') + num_hid_cov = config.getint('MODEL_PARAMETER_SETTING','num_hid_cov') + num_hid_mean = config.getint('MODEL_PARAMETER_SETTING','num_hid_mean') + apply_mask = config.getint('MODEL_PARAMETER_SETTING','apply_mask') + + # load data + data_file_name = config.get('DATA','data_file_name') + d = loadmat(data_file_name) # input in the format PxD (P vectorized samples with D dimensions) + global demodata + demodata = d + totnumcases = d["whitendata"].shape[0] + d = d["whitendata"][0:np.floor(totnumcases/batch_size)*batch_size,:].copy() + totnumcases = d.shape[0] + num_vis = d.shape[1] + num_batches = int(totnumcases/batch_size) + dev_dat = cmt.CUDAMatrix(d.T) # VxP + + tile(d[:100], "100_whitened_data.png") + + # training parameters + epsilon = config.getfloat('OPTIMIZER_PARAMETERS','epsilon') + epsilonVF = 2*epsilon + epsilonFH = 0.02*epsilon + epsilonb = 0.02*epsilon + epsilonw_mean = 0.2*epsilon + epsilonb_mean = 0.1*epsilon + weightcost_final = config.getfloat('OPTIMIZER_PARAMETERS','weightcost_final') + + # HMC setting + hmc_step_nr = config.getint('HMC_PARAMETERS','hmc_step_nr') + hmc_step = 0.01 + hmc_target_ave_rej = config.getfloat('HMC_PARAMETERS','hmc_target_ave_rej') + hmc_ave_rej = hmc_target_ave_rej + + # initialize weights + VF = cmt.CUDAMatrix(np.array(0.02 * np.random.randn(num_vis, num_fac), dtype=np.float32, order='F')) # VxH + if apply_mask == 0: + FH = cmt.CUDAMatrix( np.array( np.eye(num_fac,num_hid_cov), dtype=np.float32, order='F') ) # HxO + else: + dd = loadmat('your_FHinit_mask_file.mat') # see CVPR2010paper_material/topo2D_3x3_stride2_576filt.mat for an example + FH = cmt.CUDAMatrix( np.array( dd["FH"], dtype=np.float32, order='F') ) + bias_cov = cmt.CUDAMatrix( np.array(2.0*np.ones((num_hid_cov, 1)), dtype=np.float32, order='F') ) + bias_vis = cmt.CUDAMatrix( np.array(np.zeros((num_vis, 1)), dtype=np.float32, order='F') ) + w_mean = cmt.CUDAMatrix( np.array( 0.05 * np.random.randn(num_vis, num_hid_mean), dtype=np.float32, order='F') ) # VxH + bias_mean = cmt.CUDAMatrix( np.array( -2.0*np.ones((num_hid_mean,1)), dtype=np.float32, order='F') ) + + # initialize variables to store derivatives + VFinc = cmt.CUDAMatrix( np.array(np.zeros((num_vis, num_fac)), dtype=np.float32, order='F')) + FHinc = cmt.CUDAMatrix( np.array(np.zeros((num_fac, num_hid_cov)), dtype=np.float32, order='F')) + bias_covinc = cmt.CUDAMatrix( np.array(np.zeros((num_hid_cov, 1)), dtype=np.float32, order='F')) + bias_visinc = cmt.CUDAMatrix( np.array(np.zeros((num_vis, 1)), dtype=np.float32, order='F')) + w_meaninc = cmt.CUDAMatrix( np.array(np.zeros((num_vis, num_hid_mean)), dtype=np.float32, order='F')) + bias_meaninc = cmt.CUDAMatrix( np.array(np.zeros((num_hid_mean, 1)), dtype=np.float32, order='F')) + + # initialize temporary storage + data = cmt.CUDAMatrix( np.array(np.empty((num_vis, batch_size)), dtype=np.float32, order='F')) # VxP + normdata = cmt.CUDAMatrix( np.array(np.empty((num_vis, batch_size)), dtype=np.float32, order='F')) # VxP + negdataini = 
cmt.CUDAMatrix( np.array(np.empty((num_vis, batch_size)), dtype=np.float32, order='F')) # VxP + feat = cmt.CUDAMatrix( np.array(np.empty((num_fac, batch_size)), dtype=np.float32, order='F')) + featsq = cmt.CUDAMatrix( np.array(np.empty((num_fac, batch_size)), dtype=np.float32, order='F')) + negdata = cmt.CUDAMatrix( np.array(np.random.randn(num_vis, batch_size), dtype=np.float32, order='F')) + old_energy = cmt.CUDAMatrix( np.array(np.zeros((1, batch_size)), dtype=np.float32, order='F')) + new_energy = cmt.CUDAMatrix( np.array(np.zeros((1, batch_size)), dtype=np.float32, order='F')) + gradient = cmt.CUDAMatrix( np.array(np.empty((num_vis, batch_size)), dtype=np.float32, order='F')) # VxP + normgradient = cmt.CUDAMatrix( np.array(np.empty((num_vis, batch_size)), dtype=np.float32, order='F')) # VxP + thresh = cmt.CUDAMatrix( np.array(np.zeros((1, batch_size)), dtype=np.float32, order='F')) + feat_mean = cmt.CUDAMatrix( np.array(np.empty((num_hid_mean, batch_size)), dtype=np.float32, order='F')) + vel = cmt.CUDAMatrix( np.array(np.random.randn(num_vis, batch_size), dtype=np.float32, order='F')) + length = cmt.CUDAMatrix( np.array(np.zeros((1, batch_size)), dtype=np.float32, order='F')) # 1xP + lengthsq = cmt.CUDAMatrix( np.array(np.zeros((1, batch_size)), dtype=np.float32, order='F')) # 1xP + normcoeff = cmt.CUDAMatrix( np.array(np.zeros((1, batch_size)), dtype=np.float32, order='F')) # 1xP + if apply_mask==1: # this used to constrain very large FH matrices only allowing to change values in a neighborhood + dd = loadmat('your_FHinit_mask_file.mat') + mask = cmt.CUDAMatrix( np.array(dd["mask"], dtype=np.float32, order='F')) + normVF = 1 + small = 0.5 + + # other temporary vars + t1 = cmt.CUDAMatrix( np.array(np.empty((num_hid_cov, batch_size)), dtype=np.float32, order='F')) + t2 = cmt.CUDAMatrix( np.array(np.empty((num_hid_cov, batch_size)), dtype=np.float32, order='F')) + t3 = cmt.CUDAMatrix( np.array(np.empty((num_fac, batch_size)), dtype=np.float32, order='F')) + t4 = cmt.CUDAMatrix( np.array(np.empty((1,batch_size)), dtype=np.float32, order='F')) + t5 = cmt.CUDAMatrix( np.array(np.empty((1,1)), dtype=np.float32, order='F')) + t6 = cmt.CUDAMatrix( np.array(np.empty((num_vis, batch_size)), dtype=np.float32, order='F')) + t7 = cmt.CUDAMatrix( np.array(np.empty((num_vis, batch_size)), dtype=np.float32, order='F')) + t8 = cmt.CUDAMatrix( np.array(np.empty((num_vis, num_fac)), dtype=np.float32, order='F')) + t9 = cmt.CUDAMatrix( np.array(np.zeros((num_fac, num_hid_cov)), dtype=np.float32, order='F')) + t10 = cmt.CUDAMatrix( np.array(np.empty((1,num_fac)), dtype=np.float32, order='F')) + t11 = cmt.CUDAMatrix( np.array(np.empty((1,num_hid_cov)), dtype=np.float32, order='F')) + # start training + for epoch in range(num_epochs): + + def print_stuff(): + print "VF: " + '%3.2e' % VF.euclid_norm() \ + + ", DVF: " + '%3.2e' % (VFinc.euclid_norm()*(epsilonVFc/batch_size))\ + + ", VF_inc: " + '%3.2e' % (VFinc.euclid_norm())\ + + ", FH: " + '%3.2e' % FH.euclid_norm() \ + + ", DFH: " + '%3.2e' % (FHinc.euclid_norm()*(epsilonFHc/batch_size)) \ + + ", bias_cov: " + '%3.2e' % bias_cov.euclid_norm() \ + + ", Dbias_cov: " + '%3.2e' % (bias_covinc.euclid_norm()*(epsilonbc/batch_size)) \ + + ", bias_vis: " + '%3.2e' % bias_vis.euclid_norm() \ + + ", Dbias_vis: " + '%3.2e' % (bias_visinc.euclid_norm()*(epsilonbc/batch_size)) \ + + ", wm: " + '%3.2e' % w_mean.euclid_norm() \ + + ", Dwm: " + '%3.2e' % (w_meaninc.euclid_norm()*(epsilonw_meanc/batch_size)) \ + + ", bm: " + '%3.2e' % bias_mean.euclid_norm() \ + + ", Dbm: 
" + '%3.2e' % (bias_meaninc.euclid_norm()*(epsilonb_meanc/batch_size)) \ + + ", step: " + '%3.2e' % hmc_step \ + + ", rej: " + '%3.2e' % hmc_ave_rej + sys.stdout.flush() + + def save_stuff(): + VF.copy_to_host() + FH.copy_to_host() + bias_cov.copy_to_host() + w_mean.copy_to_host() + bias_mean.copy_to_host() + bias_vis.copy_to_host() + savemat("ws_temp", { + 'VF':VF.numpy_array, + 'FH':FH.numpy_array, + 'bias_cov': bias_cov.numpy_array, + 'bias_vis': bias_vis.numpy_array, + 'w_mean': w_mean.numpy_array, + 'bias_mean': bias_mean.numpy_array, + 'epoch':epoch}) + + tile(VF.numpy_array.T, 'VF_%000i.png'%epoch) + tile(w_mean.numpy_array.T, 'w_mean_%000i.png'%epoch) + save_imshow(FH.numpy_array, 'FH_%000i.png'%epoch) + + # anneal learning rates + epsilonVFc = epsilonVF/max(1,epoch/20) + epsilonFHc = epsilonFH/max(1,epoch/20) + epsilonbc = epsilonb/max(1,epoch/20) + epsilonw_meanc = epsilonw_mean/max(1,epoch/20) + epsilonb_meanc = epsilonb_mean/max(1,epoch/20) + weightcost = weightcost_final + + if epoch <= startFH: + epsilonFHc = 0 + if epoch <= startwd: + weightcost = 0 + + print "Epoch " + str(epoch + 1), 'num_batches', num_batches + if epoch == 0: + print_stuff() + + for batch in range(num_batches): + + # get current minibatch + data = dev_dat.slice(batch*batch_size,(batch + 1)*batch_size) # DxP (nr dims x nr samples) + + # normalize input data + data.mult(data, target = t6) # DxP + t6.sum(axis = 0, target = lengthsq) # 1xP + lengthsq.mult(1./num_vis) # normalize by number of components (like std) + lengthsq.add(small) # small avoids division by 0 + cmt.sqrt(lengthsq, target = length) + length.reciprocal(target = normcoeff) # 1xP + data.mult_by_row(normcoeff, target = normdata) # normalized data + ## compute positive sample derivatives + # covariance part + cmt.dot(VF.T, normdata, target = feat) # HxP (nr facs x nr samples) + feat.mult(feat, target = featsq) # HxP + cmt.dot(FH.T,featsq, target = t1) # OxP (nr cov hiddens x nr samples) + t1.mult(-0.5) + t1.add_col_vec(bias_cov) # OxP + t1.apply_sigmoid(target = t2) # OxP + cmt.dot(featsq, t2.T, target = FHinc) # HxO + cmt.dot(FH,t2, target = t3) # HxP + t3.mult(feat) + cmt.dot(normdata, t3.T, target = VFinc) # VxH + t2.sum(axis = 1, target = bias_covinc) + bias_covinc.mult(-1) + # visible bias + data.sum(axis = 1, target = bias_visinc) + bias_visinc.mult(-1) + # mean part + cmt.dot(w_mean.T, data, target = feat_mean) # HxP (nr mean hiddens x nr samples) + feat_mean.add_col_vec(bias_mean) # HxP + feat_mean.apply_sigmoid() # HxP + feat_mean.mult(-1) + cmt.dot(data, feat_mean.T, target = w_meaninc) + feat_mean.sum(axis = 1, target = bias_meaninc) + + # HMC sampling: draw an approximate sample from the model + if doPCD == 0: # CD-1 (set negative data to current training samples) + hmc_step, hmc_ave_rej = draw_HMC_samples(data,negdata,normdata,vel,gradient,normgradient,new_energy,old_energy,VF,FH,bias_cov,bias_vis,w_mean,bias_mean,hmc_step,hmc_step_nr,hmc_ave_rej,hmc_target_ave_rej,t1,t2,t3,t4,t5,t6,t7,thresh,feat,featsq,batch_size,feat_mean,length,lengthsq,normcoeff,small,num_vis) + else: # PCD-1 (use previous negative data as starting point for chain) + negdataini.assign(negdata) + hmc_step, hmc_ave_rej = draw_HMC_samples(negdataini,negdata,normdata,vel,gradient,normgradient,new_energy,old_energy,VF,FH,bias_cov,bias_vis,w_mean,bias_mean,hmc_step,hmc_step_nr,hmc_ave_rej,hmc_target_ave_rej,t1,t2,t3,t4,t5,t6,t7,thresh,feat,featsq,batch_size,feat_mean,length,lengthsq,normcoeff,small,num_vis) + + # compute derivatives at the negative samples + # 
normalize input data + negdata.mult(negdata, target = t6) # DxP + t6.sum(axis = 0, target = lengthsq) # 1xP + lengthsq.mult(1./num_vis) # normalize by number of components (like std) + lengthsq.add(small) + cmt.sqrt(lengthsq, target = length) + length.reciprocal(target = normcoeff) # 1xP + negdata.mult_by_row(normcoeff, target = normdata) # normalized data + # covariance part + cmt.dot(VF.T, normdata, target = feat) # HxP + feat.mult(feat, target = featsq) # HxP + cmt.dot(FH.T,featsq, target = t1) # OxP + t1.mult(-0.5) + t1.add_col_vec(bias_cov) # OxP + t1.apply_sigmoid(target = t2) # OxP + FHinc.subtract_dot(featsq, t2.T) # HxO + FHinc.mult(0.5) + cmt.dot(FH,t2, target = t3) # HxP + t3.mult(feat) + VFinc.subtract_dot(normdata, t3.T) # VxH + bias_covinc.add_sums(t2, axis = 1) + # visible bias + bias_visinc.add_sums(negdata, axis = 1) + # mean part + cmt.dot(w_mean.T, negdata, target = feat_mean) # HxP + feat_mean.add_col_vec(bias_mean) # HxP + feat_mean.apply_sigmoid() # HxP + w_meaninc.add_dot(negdata, feat_mean.T) + bias_meaninc.add_sums(feat_mean, axis = 1) + + # update parameters + VFinc.add_mult(VF.sign(), weightcost) # L1 regularization + VF.add_mult(VFinc, -epsilonVFc/batch_size) + # normalize columns of VF: normalize by running average of their norm + VF.mult(VF, target = t8) + t8.sum(axis = 0, target = t10) + cmt.sqrt(t10) + t10.sum(axis=1,target = t5) + t5.copy_to_host() + normVF = .95*normVF + (.05/num_fac) * t5.numpy_array[0,0] # estimate norm + t10.reciprocal() + VF.mult_by_row(t10) + VF.mult(normVF) + bias_cov.add_mult(bias_covinc, -epsilonbc/batch_size) + bias_vis.add_mult(bias_visinc, -epsilonbc/batch_size) + + if epoch > startFH: + FHinc.add_mult(FH.sign(), weightcost) # L1 regularization + FH.add_mult(FHinc, -epsilonFHc/batch_size) # update + # set to 0 negative entries in FH + FH.greater_than(0, target = t9) + FH.mult(t9) + if apply_mask==1: + FH.mult(mask) + # normalize columns of FH: L1 norm set to 1 in each column + FH.sum(axis = 0, target = t11) + t11.reciprocal() + FH.mult_by_row(t11) + w_meaninc.add_mult(w_mean.sign(),weightcost) + w_mean.add_mult(w_meaninc, -epsilonw_meanc/batch_size) + bias_mean.add_mult(bias_meaninc, -epsilonb_meanc/batch_size) + + if verbose == 1: + print_stuff() + # back-up every once in a while + if np.mod(epoch,1) == 0: + save_stuff() + # final back-up + VF.copy_to_host() + FH.copy_to_host() + bias_cov.copy_to_host() + bias_vis.copy_to_host() + w_mean.copy_to_host() + bias_mean.copy_to_host() + savemat("ws_fac" + str(num_fac) + "_cov" + str(num_hid_cov) + "_mean" + str(num_hid_mean), {'VF':VF.numpy_array,'FH':FH.numpy_array,'bias_cov': bias_cov.numpy_array, 'bias_vis': bias_vis.numpy_array, 'w_mean': w_mean.numpy_array, 'bias_mean': bias_mean.numpy_array, 'epoch':epoch}) + + + +###################################33 +# main +if __name__ == "__main__": + # initialize CUDA + #cmt.cuda_set_device(gpu_lock.obtain_lock_id()) # uncomment if you have a locking system or desire to choose the GPU board number + cmt.cublas_init() + cmt.CUDAMatrix.init_random(1) + train_mcRBM() + cmt.cublas_shutdown() + + + + + + + + diff -r d19e3cb809c1 -r 3977ecd49431 pylearn/shared/layers/sgd.py --- a/pylearn/shared/layers/sgd.py Wed Sep 01 17:58:43 2010 -0400 +++ b/pylearn/shared/layers/sgd.py Wed Sep 01 17:59:11 2010 -0400 @@ -137,3 +137,9 @@ or (self.iter < (self.halflife_iter * self.patience_factor)) self.iter += 1 + def __str__(self): + return ("Stopper{iter=%(iter)s," + "promising=%(promising)s,best_iter=%(best_iter)s,best_value=%(best_value)s" + 
",patience=%(patience_factor)s}")%self.__dict__ + +