Mercurial > pylearn
changeset 683:0929be7f9e43
adding tagatune
author | James Bergstra <bergstrj@iro.umontreal.ca> |
---|---|
date | Mon, 20 Apr 2009 16:53:03 -0400 |
parents | be6639fccecc |
children | e7c990d0433b 4b5e0b5a11e1 |
files | pylearn/datasets/tagatune.py |
diffstat | 1 files changed, 62 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/datasets/tagatune.py Mon Apr 20 16:53:03 2009 -0400 @@ -0,0 +1,62 @@ +""" +Access the TagATune dataset from + +http://tagatune.org/Datasets.html +""" + +from __future__ import absolute_import + +import os +import numpy + +from config import data_root + +def read_annotations_final(path): + """Return a parsed (column-wise) representation of the tagatune/annotations_final.csv file + + :param path: an openable string to locate the file tagatune/annotations_final.csv + + :returns: 4-tuple (list of clip_ids, list of attribute lists, list of mp3 paths, list of + attribute names) + + """ + f = open(path) + attributes = [] + mp3_paths = [] + clip_ids = [] + for line_idx, line in enumerate(f): + if line_idx == 0: + #this is the header line, it contains all the column names + column_names = [eval(tok) for tok in line[:-2].split('\t')] + assert len(column_names) == 190 + assert column_names[0] == 'clip_id' + assert column_names[-1] == 'mp3_path' + else: + #strip the leading and trailing '"' symbol from each token + column_values = [tok[1:-1] for tok in line[:-2].split('\t')] + assert len(column_values) == 190 + clip_ids.append(column_values[0]) + mp3_paths.append(column_values[-1]) + # assert we didn't chop off too many chars + assert column_values[-1].endswith('.mp3') + attributes_this_line = column_values[1:-1] + + # assert that the data is binary + assert all(c in '01' for c in attributes_this_line) + attributes.append(attributes_this_line) + + # assert that we read all the lines of the file + assert len(clip_ids) == 25863 + assert len(attributes) == 25863 + assert len(mp3_paths) == 25863 + + attribute_names = column_names[1:-1] #all but clip_id and mp3_path + return clip_ids, attributes, mp3_paths, attribute_names + +def test_read_annotations_final(): + return read_annotations_final(data_root() +'/tagatune/annotations_final.csv') + +if __name__ == '__main__': + print 'starting' + test_read_annotations_final() + print 'done'