annotate pylearn/datasets/tagatune.py @ 683:0929be7f9e43

adding tagatune
author James Bergstra <bergstrj@iro.umontreal.ca>
date Mon, 20 Apr 2009 16:53:03 -0400
parents
children 7d8bb6d087bc
rev   line source
683
0929be7f9e43 adding tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff changeset
1 """
0929be7f9e43 adding tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff changeset
2 Access the TagATune dataset from
0929be7f9e43 adding tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff changeset
3
0929be7f9e43 adding tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff changeset
4 http://tagatune.org/Datasets.html
0929be7f9e43 adding tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff changeset
5 """
0929be7f9e43 adding tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff changeset
6
0929be7f9e43 adding tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff changeset
7 from __future__ import absolute_import
0929be7f9e43 adding tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff changeset
8
0929be7f9e43 adding tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff changeset
9 import os
0929be7f9e43 adding tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff changeset
10 import numpy
0929be7f9e43 adding tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff changeset
11
0929be7f9e43 adding tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff changeset
12 from config import data_root
0929be7f9e43 adding tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff changeset
13
0929be7f9e43 adding tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff changeset
def read_annotations_final(path, n_columns=190, n_clips=25863):
    """Return a parsed (column-wise) representation of the tagatune/annotations_final.csv file

    The file is read as tab-separated text in which every token is wrapped in double quotes.
    The first line is a header whose first column must be 'clip_id' and whose last column
    must be 'mp3_path'; every column in between is treated as a binary ('0' or '1')
    attribute.  Every following line describes one clip.

    :param path: an openable string to locate the file tagatune/annotations_final.csv

    :param n_columns: number of columns every line (header included) must have, or None to
        skip the check.  The default, 190, is the width this reader has always required.

    :param n_clips: number of data lines (header excluded) the file must contain, or None to
        skip the check.  The default, 25863, is the count this reader has always required.

    :returns: 4-tuple (list of clip_ids, list of attribute lists, list of mp3 paths, list of
        attribute names).  Every value is returned as a string, exactly as found in the file.

    :raises AssertionError: if the header is missing or malformed, a line has the wrong number
        of columns, an mp3 path does not end with '.mp3', an attribute is not '0' or '1', or
        the number of data lines differs from `n_clips`.

    """
    column_names = None
    attributes = []
    mp3_paths = []
    clip_ids = []

    f = open(path)
    try:
        for line_idx, line in enumerate(f):
            # Strip only the line terminator, whatever it is.  Chopping a fixed two characters
            # eats real data when the line ends with '\n' alone or has no terminator at all.
            tokens = [_unquote(tok) for tok in line.rstrip('\r\n').split('\t')]

            if n_columns is not None and len(tokens) != n_columns:
                raise AssertionError('%s, line %d: expected %d columns, found %d'
                        % (path, line_idx + 1, n_columns, len(tokens)))

            if line_idx == 0:
                # this is the header line, it contains all the column names.
                # The names are unquoted as plain text: file contents are never eval'd.
                column_names = tokens
                if column_names[0] != 'clip_id':
                    raise AssertionError("%s: first column must be 'clip_id', found %r"
                            % (path, column_names[0]))
                if column_names[-1] != 'mp3_path':
                    raise AssertionError("%s: last column must be 'mp3_path', found %r"
                            % (path, column_names[-1]))
            else:
                # make sure we didn't chop off too many chars
                if not tokens[-1].endswith('.mp3'):
                    raise AssertionError("%s, line %d: mp3 path %r does not end with '.mp3'"
                            % (path, line_idx + 1, tokens[-1]))

                attributes_this_line = tokens[1:-1]
                # make sure the data is binary.  Compare whole tokens: a substring test
                # against '01' would also accept '' and '01'.
                for value in attributes_this_line:
                    if value not in ('0', '1'):
                        raise AssertionError('%s, line %d: non-binary attribute value %r'
                                % (path, line_idx + 1, value))

                clip_ids.append(tokens[0])
                mp3_paths.append(tokens[-1])
                attributes.append(attributes_this_line)
    finally:
        f.close()

    if column_names is None:
        raise AssertionError('%s is empty: the header line is missing' % path)

    # make sure we read all the lines of the file
    if n_clips is not None and len(clip_ids) != n_clips:
        raise AssertionError('%s: expected %d clips, found %d'
                % (path, n_clips, len(clip_ids)))

    attribute_names = column_names[1:-1] #all but clip_id and mp3_path
    return clip_ids, attributes, mp3_paths, attribute_names


def _unquote(tok):
    """Return `tok` without its surrounding pair of double quotes, if it has one."""
    if len(tok) >= 2 and tok[0] == '"' and tok[-1] == '"':
        return tok[1:-1]
    return tok
0929be7f9e43 adding tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff changeset
55
0929be7f9e43 adding tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff changeset
def test_read_annotations_final():
    """Parse the annotations file located under ``data_root()`` and return the result.

    The path is built by appending '/tagatune/annotations_final.csv' to whatever
    ``data_root()`` returns; any failure raised by ``read_annotations_final`` propagates.

    :returns: the value returned by ``read_annotations_final`` for that file
    """
    annotations_path = data_root() + '/tagatune/annotations_final.csv'
    return read_annotations_final(annotations_path)
0929be7f9e43 adding tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff changeset
58
0929be7f9e43 adding tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff changeset
if __name__ == '__main__':
    # Smoke test when run as a script: parse the annotations file once and report progress.
    # The parenthesised single-argument print works as a Python 2 statement and as a
    # Python 3 function call, and prints the same text in both.
    print('starting')
    test_read_annotations_final()
    print('done')