changeset 683:0929be7f9e43

adding tagatune
author James Bergstra <bergstrj@iro.umontreal.ca>
date Mon, 20 Apr 2009 16:53:03 -0400
parents be6639fccecc
children e7c990d0433b 4b5e0b5a11e1
files pylearn/datasets/tagatune.py
diffstat 1 files changed, 62 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/datasets/tagatune.py	Mon Apr 20 16:53:03 2009 -0400
@@ -0,0 +1,62 @@
+"""
+Access the TagATune dataset from
+
+http://tagatune.org/Datasets.html
+"""
+
+from __future__ import absolute_import
+
+import os
+import numpy
+
+from config import data_root
+
+def read_annotations_final(path):
+    """Parse tagatune/annotations_final.csv into column-wise lists.
+
+    The file is tab-separated with every token wrapped in quote characters: one header
+    row of 190 column names, then 25863 clip rows whose middle columns are 0/1 flags.
+
+    :param path: an openable string to locate the file tagatune/annotations_final.csv
+    :returns: 4-tuple (clip_ids, per-clip attribute lists, mp3 paths, attribute names)
+    :raises AssertionError: if the file does not have the expected layout
+    """
+    attributes, mp3_paths, clip_ids = [], [], []
+    f = open(path)
+    try:  # try/finally so the handle is closed even when an assert below fires
+        for line_idx, line in enumerate(f):
+            # drop the 2-char line terminator, split on tabs, slice off the quotes (no eval)
+            tokens = [tok[1:-1] for tok in line[:-2].split('\t')]
+            assert len(tokens) == 190
+            if line_idx == 0:
+                # this is the header line, it contains all the column names
+                column_names = tokens
+                assert column_names[0] == 'clip_id'
+                assert column_names[-1] == 'mp3_path'
+            else:
+                clip_ids.append(tokens[0])
+                mp3_paths.append(tokens[-1])
+                # assert we didn't chop off too many chars
+                assert tokens[-1].endswith('.mp3')
+                attributes_this_line = tokens[1:-1]
+                # binary check: tuple membership, since `c in '01'` also accepts '' and '01'
+                assert all(c in ('0', '1') for c in attributes_this_line)
+                attributes.append(attributes_this_line)
+    finally:
+        f.close()
+
+    # assert that we read all the lines of the file
+    assert len(clip_ids) == 25863
+    assert len(attributes) == 25863
+    assert len(mp3_paths) == 25863
+
+    attribute_names = column_names[1:-1]  # all but clip_id and mp3_path
+    return clip_ids, attributes, mp3_paths, attribute_names
+
+def test_read_annotations_final():  # smoke test: parses the file found under data_root()
+    return read_annotations_final(data_root() +'/tagatune/annotations_final.csv')
+
+if __name__ == '__main__':
+    print('starting')  # call form: same output on Python 2, valid syntax on Python 3
+    test_read_annotations_final()
+    print('done')