Mercurial > pylearn
changeset 692:5ca1a8e859db
merge
author | James Bergstra <bergstrj@iro.umontreal.ca> |
---|---|
date | Thu, 14 May 2009 17:00:22 -0400 |
parents | e69249897f89 (diff) 0457dfa6fcad (current diff) |
children | ee7026de9681 |
files | |
diffstat | 6 files changed, 300 insertions(+), 16 deletions(-) [+] |
line wrap: on
line diff
--- a/pylearn/algorithms/exponential_mean.py Tue May 12 15:34:38 2009 -0400 +++ b/pylearn/algorithms/exponential_mean.py Thu May 14 17:00:22 2009 -0400 @@ -141,3 +141,25 @@ obj.mean.initialize() obj.mean_sqr.initialize() +class DynamicNormalizer(theano.Module): + """ + Normalizes `input` using geometric-decaying estimates of the mean and variance. The + `output` should mean near zero, and variance near 1. + """ + def __init__(self, input, input_shape, max_denom=100, eps=1.0e-8): + super(DynamicNormalizer, self).__init__() + self.input = input + self.d_mean = exp_mean(input, input_shape, max_denom=max_denom) + self.d_var = exp_var(input, input_shape, max_denom=max_denom) + self.output = (input - self.d_mean.curval) / theano.tensor.sqrt(self.d_var.curval+eps) + + def updates(self): + rval = {} + rval.update(self.d_mean.updates()) + rval.update(self.d_var.updates()) + return rval + + def _instance_initialize(self, obj): + obj.d_mean.initialize() + obj.d_var.initialize() +
--- a/pylearn/algorithms/tests/test_exponential_mean.py Tue May 12 15:34:38 2009 -0400 +++ b/pylearn/algorithms/tests/test_exponential_mean.py Thu May 14 17:00:22 2009 -0400 @@ -48,3 +48,50 @@ assert not numpy.allclose(dmean, nmean) assert i > rows_to_test + +def test_dynamic_normalizer(): + x = theano.tensor.dvector() + + rows_to_test = 100 + cols=2 + + D = exponential_mean.DynamicNormalizer(x, (cols,), rows_to_test) + + M = theano.Module() + M.dn = D + M.dn_mean = exponential_mean.exp_mean(D.output, (cols,), 50) + M.dn_var = exponential_mean.exp_var(D.output, (cols,), 50) + M.x_mean = exponential_mean.exp_mean(x, (cols,), 10) + + updates = D.updates() + #print len(updates) + updates.update(M.dn_mean.updates()) + #print len(updates) + updates.update(M.dn_var.updates()) + #print len(updates) + updates.update(M.x_mean.updates()) + #print len(updates) + + + + M.f = theano.Method([x], [D.output, M.dn_mean.curval, M.dn_var.curval, M.x_mean.curval] , updates) + + m = M.make() + m.dn.initialize() + m.dn_mean.initialize() + m.dn_var.initialize() + m.x_mean.initialize() + + + rng = numpy.random.RandomState(3284) + xval = rng.rand(rows_to_test+100,cols) + + for i, xrow in enumerate(xval): + n_x = m.f(xrow) + + #print n_x + + assert numpy.all(numpy.abs(n_x[1]) < 0.15) # the means should be close to 0 + assert numpy.all(numpy.abs(n_x[2]-1) < 0.07) # the variance should be close to 1.0 + assert i > rows_to_test +
--- a/pylearn/datasets/tagatune.py Tue May 12 15:34:38 2009 -0400 +++ b/pylearn/datasets/tagatune.py Thu May 14 17:00:22 2009 -0400 @@ -9,7 +9,9 @@ import os import numpy -from config import data_root +import theano + +from .config import data_root def read_annotations_final(path): """Return a parsed (column-wise) representation of the tagatune/annotations_final.csv file @@ -35,7 +37,7 @@ #strip the leading and trailing '"' symbol from each token column_values = [tok[1:-1] for tok in line[:-2].split('\t')] assert len(column_values) == 190 - clip_ids.append(column_values[0]) + clip_ids.append(int(column_values[0])) mp3_paths.append(column_values[-1]) # assert we didn't chop off too many chars assert column_values[-1].endswith('.mp3') @@ -43,7 +45,8 @@ # assert that the data is binary assert all(c in '01' for c in attributes_this_line) - attributes.append(attributes_this_line) + attributes.append(numpy.asarray([int(c) for c in attributes_this_line], + dtype='int8')) # assert that we read all the lines of the file assert len(clip_ids) == 25863 @@ -53,10 +56,42 @@ attribute_names = column_names[1:-1] #all but clip_id and mp3_path return clip_ids, attributes, mp3_paths, attribute_names +def cached_read_annotations_final(path): + if not hasattr(cached_read_annotations_final, 'rval'): + cached_read_annotations_final.rval = {} + if not path in cached_read_annotations_final.rval: + cached_read_annotations_final.rval[path] = read_annotations_final(path) + return cached_read_annotations_final.rval[path] + def test_read_annotations_final(): - return read_annotations_final(data_root() +'/tagatune/annotations_final.csv') + return read_annotations_final(data_root() + '/tagatune/annotations_final.csv') -if __name__ == '__main__': - print 'starting' - test_read_annotations_final() - print 'done' +class TagatuneExample(theano.Op): + """ + input - index into tagatune database (not clip_id) + output - clip_id, attributes, path to clip's mp3 file + """ + def __init__(self, music_dbs='/data/gamme/data/music_dbs'): + self.music_dbs = music_dbs + annotations_path = music_dbs + '/tagatune/annotations_final.csv' + self.clip_ids, self.attributes, self.mp3_paths, self.attribute_names =\ + cached_read_annotations_final(annotations_path) + + n_examples = property(lambda self: len(self.clip_ids)) + + def make_node(self, idx): + _idx = theano.tensor.as_tensor_variable(idx, ndim=0) + return theano.Apply(self, + [_idx], + [theano.tensor.lscalar('clip_id'), + theano.tensor.bvector('clip_attributes'), + theano.generic('clip_path')]) + def perform(self, node, (idx,), out_storage): + out_storage[0][0] = self.clip_ids[idx] + out_storage[1][0] = self.attributes[idx] + out_storage[2][0] = self.music_dbs + '/tagatune/clips/mp3/' + self.mp3_paths[idx] + + def grad(self, inputs, output): + return [None for i in inputs] + +#tagatune_example = TagatuneExample() #requires reading a big data file
--- a/pylearn/datasets/tzanetakis.py Tue May 12 15:34:38 2009 -0400 +++ b/pylearn/datasets/tzanetakis.py Thu May 14 17:00:22 2009 -0400 @@ -88,8 +88,6 @@ assert len(path) == 1000 return path, label - nclasses = 10 - class_idx_dict = dict(blues=numpy.asarray(0), classical=1, country=2, @@ -108,8 +106,9 @@ for i, c in enumerate(classes): self.class_idx_dict[c] = numpy.asarray(i, dtype='int64') - def __len__(self): - return len(self.path) + n_examples = property(lambda self: len(self.path)) + nclasses = property(lambda self: 10) + def make_node(self, idx): idx_ = theano.tensor.as_tensor_variable(idx) @@ -127,5 +126,5 @@ def grad(self, inputs, g_output): return [None for i in inputs] -tzanetakis_example = TzanetakisExample() +#tzanetakis_example = TzanetakisExample() #requires reading a data file
--- a/pylearn/external/wrap_libsvm.py Tue May 12 15:34:38 2009 -0400 +++ b/pylearn/external/wrap_libsvm.py Thu May 14 17:00:22 2009 -0400 @@ -7,16 +7,75 @@ # # This module uses a specific convention for libsvm's installation. # I base this on installing libsvm-2.88. -# To install libsvm's python module, do three things: +# To install libsvm's python module, do the following: # 1. Build libsvm (run make in both the root dir and the python subdir). # 2. touch a '__init__.py' file in the python subdir # 3. add a symbolic link to a PYTHONPATH location that looks like this: # libsvm -> <your root path>/libsvm-2.88/python/ +# 4. modify the svm_model class in python/svm.py to inherit from object # # That is the sort of thing that this module expects from 'import libsvm' import libsvm +class svm_model(libsvm.svm_model): + """ + This class is a picklable drop-in replacement for libsvm.svm_model. + """ + def __getstate__(self): + return PicklableSVM.svm_to_str(self) + + def __setstate__(self, svm_str): + PicklableSVM.str_to_svm(svm_str, self=self) + + @staticmethod + def str_to_svm(s, self=None): + fname = tempfile.mktemp() + f = open(fname,'w') + f.write(s) + f.close() + rval = self + try: + if self: + self.__init__(fname) + else: + rval = libsvm.svm_model(fname) + finally: + os.remove(fname) + return rval + + @staticmethod + def svm_to_str(svm): + fname = tempfile.mktemp() + svm.save(fname) + rval = open(fname, 'r').read() + os.remove(fname) + return rval + + def predict(self, x): + if type(x) != numpy.ndarray: + raise TypeError(x) + if x.ndim != 1: + raise TypeError(x) + return libsvm.svm_model.predict(self, numpy.asarray(x, dtype='float64')) + + def predict_probability(self, x): + if x.ndim != 1: + raise TypeError(x) + return libsvm.svm_model.predict_probability(self, numpy.asarray(x, dtype='float64')) + +svm_problem = libsvm.svm_problem +svm_parameter = libsvm.svm_parameter +RBF = libsvm.svm_RBF + + +#################################### +# Extra stuff that is less essential +# +# TODO: Move stuff below to a file +# in algorithms +#################################### + def score_01(x, y, model): assert len(x) == len(y) size = len(x) @@ -42,8 +101,7 @@ for k, v in kwargs: setattr(self, k, type(getattr(self, k))(v)) - -def dbdict_run_svm_experiment(state, channel=lambda *args, **kwargs:None): +def state_run_svm_experiment(state, channel=lambda *args, **kwargs:None): """Parameters are described in state, and returned in state. :param state: object instance to store parameters and return values @@ -98,3 +156,57 @@ state_run_svm_experiment(state=kwargs) return kwargs +def train_rbf_model(train_X, train_Y, C, gamma): + param = libsvm.svm_parameter(C=C, kernel_type=libsvm.RBF, gamma=gamma) + problem = libsvm.svm_problem(train_Y, train_X) + model libsvm.svm_model(problem, param) + + #save_filename = state.save_filename + #model.save(save_filename) + + +def jobman_train_model(state, channel): + """ + + According to the given validation set, + What is the best libsvm parameter setting to train on? + """ + (train_X, train_Y) = jobman.tools.make(state.train_set) + (valid_X, valid_Y) = jobman.tools.make(state.valid_set) + + C_grid = [1,2,3] + gamma_grid = [0.1, 1, 10] + + grid = [dict( + train_set=None, + svm_param=dict(kernel='RBF', C=C, gamma=g), + save_filename='model_RBF_C%f_G%f.libsvm') + for C in C_grid, + for g in gamma_grid] + + # will return quickly if jobs have already run + # and the rootpath is populated with results + grid = jobman.map( + jobman_train_model_given_all_params, + grid, + path=jobman.rootpath(state)+'/gridmap', + cleanup=False) + + # evaluate all these sub_state models on our validation_set + valid_perf = [] + for sub_state in grid: + # create a file in this state-space called model.tmp + # with the same contents as the + # save_filename file in the sub_state + jobman.link('model.tmp', jobman.rootpath(sub_state)+'/'+sub_state.save_filename) + model = svm.model('model.tmp') + valid_perf.append((score_01(valid_X, valid_Y, model), sub_state)) + jobman.unlink('model.tmp') + + # calculate the return value + valid_perf.sort() #lowest first + state.lowest_valid_err = valid_perf[0][0] + state.lowest_valid_svm_param = valid_perf[0][1].svm_param + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/io/audio.py Thu May 14 17:00:22 2009 -0400 @@ -0,0 +1,69 @@ + +import numpy +import theano + +from wavread import WavRead, wav_read_int16, wav_read_double + +try: #define audioread and company only if pygmy.audio can be imported + import pygmy.audio + + class AudioRead(theano.Op): + #TODO: add the samplerate as an output + """Read a wave file or mp3 + + input - filename + output - the contents of the audiofile in pcm format, and the samplerate + + """ + + #arguments to pygmy.audio.audioread + _audioread_attrs=('mono', 'tlast', 'fs_target', 'stripzeros', 'stats_only', 'decoder') + + mono = False + tlast=-1 + fs_target=-1 + stripzeros='none' + stats_only=False + decoder = 'madplay' + + def __init__(self, **kwargs): + for kw in kwargs: + if not kw in self._audioread_attrs: + raise TypeError('unrecognized keyword argument', kw) + setattr(self, kw, kwargs[kw]) + def __eq__(self, other): + return (type(self) == type(other)) and \ + all(getattr(self, a) == getattr(other,a) for a in self._audioread_attrs) + def __hash__(self): + return reduce( + lambda a,b: a^b, + [getattr(self, a) for a in self._audioread_attrs], + initial=hash(type(self))) + def make_node(self, path): + out_type = theano.tensor.dvector if self.mono else theano.tensor.dmatrix + return theano.Apply(self, [path], [out_type(), theano.tensor.dscalar()]) + def perform(self, node, (path,), (data_storage, sr_storage)): + data, sr, dz = pygmy.audio.audioread(path, + mono=self.mono, + tlast=self.tlast, + fs_target=self.fs_target, + stripzeros=self.stripzeros, + stats_only=self.stats_only, + decoder=self.decoder) + + assert isinstance(data, numpy.ndarray) + assert data.ndim == (1 if self.mono else 2) + assert data.dtype == numpy.float64 + data_storage[0] = data + + sr_storage[0] = numpy.asarray(sr,dtype='float64') + assert sr_storage[0].ndim==0 + + def grad(self, inputs, g_output): + return [None for i in inputs] + + audioread = AudioRead() + audioread_mono = AudioRead(mono=True) +except ImportError: + pass +