683
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
1 """
|
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
2 Access the TagATune dataset from
|
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
3
|
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
4 http://tagatune.org/Datasets.html
|
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
5 """
|
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
6
|
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
7 from __future__ import absolute_import
|
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
8
|
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
9 import os
|
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
10 import numpy
|
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
11
|
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
12 from config import data_root
|
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
13
|
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
def read_annotations_final(path, n_columns=190, n_clips=25863):
    """Return a parsed (column-wise) representation of the tagatune/annotations_final.csv file

    The file is expected to be tab-separated, with every token wrapped in
    double quotes.  The first line is a header whose first column is
    ``clip_id`` and whose last column is ``mp3_path``; every column in
    between is a binary ('0' / '1') attribute.

    :param path: an openable string to locate the file tagatune/annotations_final.csv
    :param n_columns: number of columns the header must have (190 by
        default).  Pass None to accept any width; data rows must always be as
        wide as the header.
    :param n_clips: number of data rows the file must contain (25863 by
        default).  Pass None to accept any number of rows.

    :returns: 4-tuple (list of clip_ids, list of attribute lists, list of mp3 paths, list of
        attribute names).  All values are returned as strings, attributes
        included.

    :raises AssertionError: if the file does not have the expected layout.
        NOTE: these checks are ``assert`` statements (kept so that callers
        see the same exception type as before), so they are skipped when
        Python runs with -O.
    """
    attributes = []
    mp3_paths = []
    clip_ids = []
    column_names = None

    # `with` guarantees the handle is closed even when a check below fails.
    with open(path) as f:
        for line_idx, line in enumerate(f):
            # Remove only the line terminator.  Blindly dropping two
            # characters is wrong for '\n'-terminated files and for a final
            # line that has no terminator at all.
            tokens = line.rstrip('\r\n').split('\t')
            # strip the leading and trailing '"' symbol from each token
            # (the header used to go through eval(), which executes
            # arbitrary expressions found in the file)
            values = [tok[1:-1] for tok in tokens]

            if line_idx == 0:
                # this is the header line, it contains all the column names
                column_names = values
                if n_columns is not None:
                    assert len(column_names) == n_columns, (
                        'expected %i columns, header has %i'
                        % (n_columns, len(column_names)))
                assert column_names[0] == 'clip_id', column_names[0]
                assert column_names[-1] == 'mp3_path', column_names[-1]
                continue

            assert len(values) == len(column_names), (
                'line %i has %i columns, header has %i'
                % (line_idx + 1, len(values), len(column_names)))
            # assert we didn't chop off too many chars
            assert values[-1].endswith('.mp3'), values[-1]

            attributes_this_line = values[1:-1]
            # assert that the data is binary.  Compare against a tuple: the
            # test `c in '01'` is a substring test and would let '' and '01'
            # through.
            assert all(c in ('0', '1') for c in attributes_this_line), (
                'non-binary attribute on line %i' % (line_idx + 1))

            clip_ids.append(values[0])
            mp3_paths.append(values[-1])
            attributes.append(attributes_this_line)

    assert column_names is not None, 'file is empty: %s' % path

    # assert that we read all the lines of the file
    if n_clips is not None:
        assert len(clip_ids) == n_clips, (
            'expected %i clips, read %i' % (n_clips, len(clip_ids)))
        assert len(attributes) == n_clips
        assert len(mp3_paths) == n_clips

    attribute_names = column_names[1:-1]  # all but clip_id and mp3_path
    return clip_ids, attributes, mp3_paths, attribute_names
|
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
55
|
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
def test_read_annotations_final():
    """Parse the annotations_final.csv file located under the configured data root.

    :returns: the 4-tuple produced by read_annotations_final for that file
    """
    csv_path = data_root() +'/tagatune/annotations_final.csv'
    parsed = read_annotations_final(csv_path)
    return parsed
|
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
58
|
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
if __name__ == '__main__':
    # Run the reader against the real dataset when executed as a script.
    # print(...) with a single string argument produces the same output on
    # Python 2 and Python 3; the bare `print 'x'` statement form is a syntax
    # error on Python 3.
    print('starting')
    test_read_annotations_final()
    print('done')
|