Mercurial > pylearn
annotate pylearn/datasets/tzanetakis.py @ 1492:e7c4d031d333
Fix for Windows paths
author | Olivier Delalleau <delallea@iro> |
---|---|
date | Tue, 16 Aug 2011 15:44:01 -0400 |
parents | 651eb6506d91 |
children |
rev | line source |
---|---|
605
20953adfdef8
initial tzanetakis dataset
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
1 """ |
20953adfdef8
initial tzanetakis dataset
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
2 Load Tzanetakis' genre-classification dataset. |
20953adfdef8
initial tzanetakis dataset
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
3 |
20953adfdef8
initial tzanetakis dataset
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
4 """ |
20953adfdef8
initial tzanetakis dataset
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
5 from __future__ import absolute_import |
20953adfdef8
initial tzanetakis dataset
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
6 |
20953adfdef8
initial tzanetakis dataset
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
7 import os |
20953adfdef8
initial tzanetakis dataset
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
8 import numpy |
20953adfdef8
initial tzanetakis dataset
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
9 |
20953adfdef8
initial tzanetakis dataset
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
10 from ..io.amat import AMat |
20953adfdef8
initial tzanetakis dataset
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
11 from .config import data_root |
20953adfdef8
initial tzanetakis dataset
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
12 from .dataset import dataset_factory, Dataset |
20953adfdef8
initial tzanetakis dataset
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
13 |
640
af14b1f32882
revised tzanetakis, added data centering
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
605
diff
changeset
|
def centre_data(x, inplace=False, eps=1.0e-6):
    """Standardise the columns of `x` to zero mean and (nearly) unit variance.

    Statistics are computed along axis 0, so for a 2-D array each column is
    centred and scaled independently.

    :param x: numpy array to standardise.
    :param inplace: if True, `x` itself is modified and returned; otherwise a
        copy is made and `x` is left untouched.
    :param eps: small constant added to the standard deviation so that
        constant columns do not cause a division by zero.  Because of it the
        resulting variance is slightly below 1.

    :returns: the standardised array (`x` itself when `inplace` is True).

    :raises TypeError: if `inplace` is True and `x` does not have a
        floating-point (or complex) dtype, since the result could not be
        stored in it.
    """
    if inplace:
        rval = x
        if not numpy.issubdtype(rval.dtype, numpy.inexact):
            # In-place arithmetic on an integer array would either fail with
            # a casting error or silently truncate, depending on the numpy
            # version; refuse explicitly instead.
            raise TypeError(
                'centre_data(inplace=True) requires a floating-point array, '
                'got dtype %s' % (rval.dtype,))
    else:
        rval = x.copy()
        if not numpy.issubdtype(rval.dtype, numpy.inexact):
            # We own the copy, so promote it to float rather than failing.
            rval = rval.astype(numpy.float64)

    # zero-mean
    rval -= numpy.mean(rval, axis=0)
    # unit-variance (up to eps)
    rval *= 1.0 / (eps + numpy.std(rval, axis=0))
    return rval
20953adfdef8
initial tzanetakis dataset
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
21 |
20953adfdef8
initial tzanetakis dataset
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
def mfcc16(segments_per_song=1, include_covariance=True, random_split=0,
           ntrain=700, nvalid=100, ntest=200,
           normalize=True):
    """Load the pre-computed MFCC features and split them into train/valid/test.

    The features are read from
    ``<data_root>/tzanetakis/feat_mfcc16_540_1.stat.amat``, which must contain
    ``1000 * segments_per_song`` rows of 152 columns.  Targets are the integers
    0..9, assigned in blocks of ``100 * segments_per_song`` consecutive rows.

    :param segments_per_song: only 1 is supported.
    :param include_covariance: if False, only the first 16 feature columns are
        kept (presumably the MFCC means, the rest being covariance terms --
        TODO confirm against the feature extraction code).
    :param random_split: selects the shuffle; the random seed used is
        ``random_split + 1``.
    :param ntrain: number of examples in the training split.
    :param nvalid: number of examples in the validation split.
    :param ntest: number of examples in the test split.
    :param normalize: if True, each split is standardised independently with
        `centre_data` (i.e. using that split's own mean and std).

    :returns: a `Dataset` with `train`, `valid` and `test` attributes (each
        holding `x` and `y`) and `n_classes` set to 10.

    :raises NotImplementedError: if `segments_per_song` is not 1.
    :raises ValueError: if the split sizes are negative or exceed the number
        of examples, or if the feature file does not have the expected shape.
    """
    if segments_per_song != 1:
        raise NotImplementedError()

    n_examples = 1000 * segments_per_song

    # Validate before touching the disk: slicing would otherwise silently
    # hand back splits that are shorter than requested.
    if min(ntrain, nvalid, ntest) < 0:
        raise ValueError('split sizes must be non-negative, got %r'
                         % ((ntrain, nvalid, ntest),))
    if ntrain + nvalid + ntest > n_examples:
        raise ValueError('ntrain + nvalid + ntest = %i exceeds the %i '
                         'available examples'
                         % (ntrain + nvalid + ntest, n_examples))

    path = os.path.join(data_root(), 'tzanetakis',
                        'feat_mfcc16_540_1.stat.amat')
    all_input = AMat(path=path).input
    if all_input.shape != (n_examples, 152):
        raise ValueError('unexpected feature matrix shape %r in %s'
                         % (all_input.shape, path))

    # One label per row: 0 repeated for the first block, then 1, and so on.
    all_targ = numpy.repeat(numpy.arange(10), 100 * segments_per_song)

    if not include_covariance:
        all_input = all_input[:, 0:16]

    # Shuffle the data according to the random split.  A single permutation
    # is applied to both arrays so rows and labels stay aligned by
    # construction; fancy indexing also yields fresh copies, so the loaded
    # matrix is not modified by the shuffle or the in-place normalisation.
    seed = random_split + 1
    order = numpy.random.RandomState(seed).permutation(n_examples)
    all_input = all_input[order]
    all_targ = all_targ[order]

    def prepx(x):
        return centre_data(x, inplace=True) if normalize else x

    train_end = ntrain
    valid_end = train_end + nvalid
    test_end = valid_end + ntest

    # construct a dataset to return
    rval = Dataset()
    rval.train = Dataset.Obj(x=prepx(all_input[0:train_end]),
                             y=all_targ[0:train_end])
    rval.valid = Dataset.Obj(x=prepx(all_input[train_end:valid_end]),
                             y=all_targ[train_end:valid_end])
    rval.test = Dataset.Obj(x=prepx(all_input[valid_end:test_end]),
                            y=all_targ[valid_end:test_end])

    rval.n_classes = 10

    return rval
20953adfdef8
initial tzanetakis dataset
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
60 |
671
9e62fd6b6677
adding wavread and tzanetakis dataset
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
640
diff
changeset
|
61 import theano |
605
20953adfdef8
initial tzanetakis dataset
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
62 |
671
9e62fd6b6677
adding wavread and tzanetakis dataset
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
640
diff
changeset
|
class TzanetakisExample(theano.Op):
    """Return the i'th file, label pair from the Tzanetakis dataset.

    The Op takes an integer index and produces two outputs: the path of the
    corresponding audio file (a generic Theano variable) and its genre label
    as an int64 scalar.  The track list is read from disk when the Op is
    instantiated, not when the module is imported.
    """

    # Genre names, in label order (index in this tuple == class label).
    _classes = ('blues', 'classical', 'country', 'disco', 'hiphop',
                'jazz', 'metal', 'pop', 'reggae', 'rock')

    # Maps genre name -> 0-d int64 array, the form `perform` stores into the
    # lscalar output.  Every entry has the same type.
    class_idx_dict = dict((c, numpy.asarray(i, dtype='int64'))
                          for i, c in enumerate(_classes))

    @staticmethod
    def read_tracklist(alt_path_root=None):
        """Read the tzanetakis dataset file

        Each line of ``<data_root>/tzanetakis/tracklist.txt`` is expected to
        hold a file path followed by a genre label, separated by whitespace.

        :param alt_path_root: if given, only the file name of each listed
            path is kept and it is re-rooted under this directory.

        :rtype: (list, list)
        :returns: paths, labels

        :raises IndexError: if a line has fewer than two tokens (the
            offending line is printed first).
        :raises ValueError: if the list does not contain exactly 1000 tracks.
        """
        tracklist_path = os.path.join(data_root(), 'tzanetakis',
                                      'tracklist.txt')
        path = []
        label = []
        # `with` guarantees the file is closed even if a line is malformed.
        with open(tracklist_path) as tracklist:
            for line in tracklist:
                toks = line.split()
                try:
                    line_path, line_label = toks[0], toks[1]
                except IndexError:
                    print('BAD LINE IN TZANETAKIS TRACKLIST')
                    print('%s %s' % (line, toks))
                    raise
                if alt_path_root is not None:
                    # The track list uses '/' as separator regardless of
                    # the platform this code runs on.
                    file_name = line_path.split('/')[-1]
                    line_path = os.path.join(alt_path_root, file_name)
                path.append(line_path)
                label.append(line_label)
        if len(path) != 1000:
            raise ValueError('expected 1000 tracks in %s, found %i'
                             % (tracklist_path, len(path)))
        return path, label

    def __init__(self, alt_path_root=None):
        """Load the track list.

        :param alt_path_root: forwarded to `read_tracklist`.
        """
        self.path, self.label = self.read_tracklist(alt_path_root)
        # Per-instance copy so that changes made through an instance do not
        # leak into the class-level mapping.
        self.class_idx_dict = dict(type(self).class_idx_dict)

    n_examples = property(lambda self: len(self.path))
    nclasses = property(lambda self: 10)

    def make_node(self, idx):
        """Build the Apply node for an integer index `idx`.

        :raises TypeError: if `idx` is not of one of Theano's integer types.
        """
        idx_ = theano.tensor.as_tensor_variable(idx)
        if idx_.type not in theano.tensor.int_types:
            raise TypeError(idx)
        return theano.Apply(self,
                            [idx_],
                            [theano.generic('tzanetakis_path'),
                             theano.tensor.lscalar('tzanetakis_label')])

    def perform(self, node, inputs, output_storage):
        """Store the path and the label of track `idx` in the output cells.

        `inputs` holds the single index value; `output_storage` holds the
        two one-element output containers (path, label).
        """
        # Unpacked here rather than in the signature: tuple parameters are
        # a syntax error on Python 3.
        (idx,) = inputs
        path, label = output_storage
        path[0] = self.path[idx]
        label[0] = self.class_idx_dict[self.label[idx]]

    def grad(self, inputs, g_output):
        """Return one None per input: no gradient is defined for this Op."""
        return [None] * len(inputs)
674
f3b7d6956209
changes to tzanetakis and wavread
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
671
diff
changeset
|
128 |
689
651eb6506d91
do not read data file on import
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
682
diff
changeset
|
129 #tzanetakis_example = TzanetakisExample() #requires reading a data file |
671
9e62fd6b6677
adding wavread and tzanetakis dataset
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
640
diff
changeset
|
130 |