annotate pylearn/datasets/tagatune.py @ 1492:e7c4d031d333

Fix for Windows paths
author Olivier Delalleau <delallea@iro>
date Tue, 16 Aug 2011 15:44:01 -0400
parents 7d8bb6d087bc
children
rev   line source
683
0929be7f9e43 adding tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff changeset
1 """
0929be7f9e43 adding tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff changeset
2 Access the TagATune dataset from
0929be7f9e43 adding tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff changeset
3
0929be7f9e43 adding tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff changeset
4 http://tagatune.org/Datasets.html
0929be7f9e43 adding tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff changeset
5 """
0929be7f9e43 adding tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff changeset
6
0929be7f9e43 adding tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff changeset
7 from __future__ import absolute_import
0929be7f9e43 adding tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff changeset
8
0929be7f9e43 adding tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff changeset
9 import os
0929be7f9e43 adding tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff changeset
10 import numpy
0929be7f9e43 adding tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff changeset
11
690
7d8bb6d087bc additions to datasets/tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 683
diff changeset
12 import theano
7d8bb6d087bc additions to datasets/tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 683
diff changeset
13
7d8bb6d087bc additions to datasets/tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 683
diff changeset
14 from .config import data_root
683
0929be7f9e43 adding tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff changeset
15
0929be7f9e43 adding tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff changeset
def read_annotations_final(path, n_columns=190, n_clips=25863):
    """Return a parsed (column-wise) representation of the tagatune/annotations_final.csv file

    The file is tab-separated and every token is wrapped in double quotes.
    The first line is a header holding the column names; each following line
    holds a clip id, one binary flag per attribute and finally the mp3 path.

    :param path: an openable string to locate the file tagatune/annotations_final.csv
    :param n_columns: number of columns every line must have (defaults to 190,
        the value this reader has always required); None accepts whatever
        width the header line has
    :param n_clips: number of data lines the file must contain (defaults to
        25863, the value this reader has always required); None disables the
        check

    :returns: 4-tuple (list of clip_ids, list of attribute lists, list of mp3 paths, list of
        attribute names).  Each entry of the attribute list is a numpy int8
        vector of 0/1 flags, in the same order as the attribute names.

    :raises AssertionError: if the file does not have the expected layout
    :raises ValueError: if a clip id is not an integer, or a header token is
        not a valid literal
    """
    # Local import: only the header parsing below needs it.
    from ast import literal_eval

    column_names = None
    width = n_columns
    attributes = []
    mp3_paths = []
    clip_ids = []

    # The context manager closes the file even when a sanity check fails.
    with open(path) as f:
        for line_idx, line in enumerate(f):
            # Strip only the line terminator.  Blindly slicing off two
            # characters assumes '\r\n' and eats a real character whenever
            # the terminator is (or has been translated to) a bare '\n'.
            tokens = line.rstrip('\r\n').split('\t')
            if line_idx == 0:
                # This is the header line, it contains all the column names.
                # NOTE(security): this used to be eval(), which executes
                # whatever the file contains; literal_eval only accepts
                # literals such as the quoted names found here.
                column_names = [literal_eval(tok) for tok in tokens]
                if width is None:
                    width = len(column_names)
                assert len(column_names) == width, (
                    'expected %i columns, header has %i'
                    % (width, len(column_names)))
                assert column_names[0] == 'clip_id'
                assert column_names[-1] == 'mp3_path'
            else:
                # strip the leading and trailing '"' symbol from each token
                column_values = [tok[1:-1] for tok in tokens]
                assert len(column_values) == width, (
                    'expected %i columns, line %i has %i'
                    % (width, line_idx + 1, len(column_values)))
                clip_ids.append(int(column_values[0]))
                mp3_paths.append(column_values[-1])
                # assert we didn't chop off too many chars
                assert column_values[-1].endswith('.mp3')
                attributes_this_line = column_values[1:-1]

                # Assert that the data is binary.  Membership is tested
                # against a tuple: a substring test against '01' would also
                # accept '' and '01'.
                assert all(c in ('0', '1') for c in attributes_this_line)
                attributes.append(
                    numpy.asarray([int(c) for c in attributes_this_line],
                                  dtype='int8'))

    # An empty file has no header, hence no attribute names to return.
    assert column_names is not None, 'no header line found in %s' % path

    # assert that we read all the lines of the file
    if n_clips is not None:
        assert len(clip_ids) == n_clips
        assert len(attributes) == n_clips
        assert len(mp3_paths) == n_clips

    attribute_names = column_names[1:-1]  # all but clip_id and mp3_path
    return clip_ids, attributes, mp3_paths, attribute_names
0929be7f9e43 adding tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff changeset
58
690
7d8bb6d087bc additions to datasets/tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 683
diff changeset
def cached_read_annotations_final(path):
    """Memoized front-end to read_annotations_final.

    Parsed results are kept in a dict stored on this function object (the
    `rval` attribute), keyed by `path`, so each distinct path is parsed at
    most once.

    :param path: an openable string to locate the annotations file

    :returns: whatever read_annotations_final(path) returned the first time
        it was called with this path (the very same object on every call)
    """
    this = cached_read_annotations_final
    # Create the cache lazily, the first time the function runs.
    if hasattr(this, 'rval'):
        cache = this.rval
    else:
        cache = this.rval = {}
    if path in cache:
        return cache[path]
    parsed = read_annotations_final(path)
    cache[path] = parsed
    return parsed
7d8bb6d087bc additions to datasets/tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 683
diff changeset
65
683
0929be7f9e43 adding tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff changeset
def test_read_annotations_final():
    """Smoke test: parse the annotations file located under data_root().

    :returns: the 4-tuple produced by read_annotations_final
    """
    annotations_csv = data_root() + '/tagatune/annotations_final.csv'
    parsed = read_annotations_final(annotations_csv)
    return parsed
683
0929be7f9e43 adding tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff changeset
68
690
7d8bb6d087bc additions to datasets/tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 683
diff changeset
class TagatuneExample(theano.Op):
    """
    input - index into tagatune database (not clip_id)
    output - clip_id, attributes, path to clip's mp3 file
    """
    def __init__(self, music_dbs='/data/gamme/data/music_dbs'):
        """Load (or fetch from the cache) the tagatune annotations.

        :param music_dbs: directory that contains the `tagatune` folder
        """
        self.music_dbs = music_dbs
        annotations_path = music_dbs + '/tagatune/annotations_final.csv'
        # The cached reader is used so that several instances pointing at the
        # same file parse it only once.
        self.clip_ids, self.attributes, self.mp3_paths, self.attribute_names =\
                cached_read_annotations_final(annotations_path)

    # Number of clips listed in the annotations file.
    n_examples = property(lambda self: len(self.clip_ids))

    def make_node(self, idx):
        """Build the Apply node mapping a scalar index to the three outputs.

        :param idx: index into the database; converted to a 0-dimensional
            tensor variable
        """
        _idx = theano.tensor.as_tensor_variable(idx, ndim=0)
        return theano.Apply(self,
                [_idx],
                [theano.tensor.lscalar('clip_id'),
                    theano.tensor.bvector('clip_attributes'),
                    theano.generic('clip_path')])

    def perform(self, node, inputs, out_storage):
        """Look up example `idx` and store clip id, attributes and mp3 path.

        :param node: the Apply node being evaluated (unused)
        :param inputs: one-element sequence holding the index
        :param out_storage: one storage cell per output, in the order
            declared by make_node
        """
        # Unpacked here rather than in the signature: tuple parameters such
        # as `(idx,)` are a syntax error on Python 3.  Positional callers are
        # unaffected.
        (idx,) = inputs
        out_storage[0][0] = self.clip_ids[idx]
        out_storage[1][0] = self.attributes[idx]
        out_storage[2][0] = self.music_dbs + '/tagatune/clips/mp3/' + self.mp3_paths[idx]

    def grad(self, inputs, output):
        """Return None for every input: no gradient is defined for the index."""
        return [None for i in inputs]
7d8bb6d087bc additions to datasets/tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 683
diff changeset
96
7d8bb6d087bc additions to datasets/tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 683
diff changeset
97 #tagatune_example = TagatuneExample() #requires reading a big data file