# HG changeset patch # User James Bergstra # Date 1240260783 14400 # Node ID 0929be7f9e430faf200cf5af5d21699b5825ae05 # Parent be6639fccecc7eec5f0f020f5ca8729dc6239af7 adding tagatune diff -r be6639fccecc -r 0929be7f9e43 pylearn/datasets/tagatune.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/datasets/tagatune.py Mon Apr 20 16:53:03 2009 -0400 @@ -0,0 +1,62 @@ +""" +Access the TagATune dataset from + +http://tagatune.org/Datasets.html +""" + +from __future__ import absolute_import + +import os +import numpy + +from config import data_root + +def read_annotations_final(path): + """Return a parsed (column-wise) representation of the tagatune/annotations_final.csv file + + :param path: an openable string to locate the file tagatune/annotations_final.csv + + :returns: 4-tuple (list of clip_ids, list of attribute lists, list of mp3 paths, list of + attribute names) + + """ + f = open(path) + attributes = [] + mp3_paths = [] + clip_ids = [] + for line_idx, line in enumerate(f): + if line_idx == 0: + #this is the header line, it contains all the column names + column_names = [eval(tok) for tok in line[:-2].split('\t')] + assert len(column_names) == 190 + assert column_names[0] == 'clip_id' + assert column_names[-1] == 'mp3_path' + else: + #strip the leading and trailing '"' symbol from each token + column_values = [tok[1:-1] for tok in line[:-2].split('\t')] + assert len(column_values) == 190 + clip_ids.append(column_values[0]) + mp3_paths.append(column_values[-1]) + # assert we didn't chop off too many chars + assert column_values[-1].endswith('.mp3') + attributes_this_line = column_values[1:-1] + + # assert that the data is binary + assert all(c in '01' for c in attributes_this_line) + attributes.append(attributes_this_line) + + # assert that we read all the lines of the file + assert len(clip_ids) == 25863 + assert len(attributes) == 25863 + assert len(mp3_paths) == 25863 + + attribute_names = column_names[1:-1] #all but clip_id and mp3_path + return clip_ids, attributes, mp3_paths, attribute_names + +def test_read_annotations_final(): + return read_annotations_final(data_root() +'/tagatune/annotations_final.csv') + +if __name__ == '__main__': + print 'starting' + test_read_annotations_final() + print 'done'