Mercurial > pylearn
annotate pylearn/datasets/tagatune.py @ 1492:e7c4d031d333
Fix for Windows paths
author | Olivier Delalleau <delallea@iro> |
---|---|
date | Tue, 16 Aug 2011 15:44:01 -0400 |
parents | 7d8bb6d087bc |
children |
rev | line source |
---|---|
683 | 1 """ |
2 Access the TagATune dataset from | |
3 | |
4 http://tagatune.org/Datasets.html | |
5 """ | |
6 | |
7 from __future__ import absolute_import | |
8 | |
9 import os | |
10 import numpy | |
11 | |
690
7d8bb6d087bc
additions to datasets/tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
683
diff
changeset
|
12 import theano |
7d8bb6d087bc
additions to datasets/tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
683
diff
changeset
|
13 |
7d8bb6d087bc
additions to datasets/tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
683
diff
changeset
|
14 from .config import data_root |
683 | 15 |
def read_annotations_final(path, n_clips=25863):
    """Return a parsed (column-wise) representation of the tagatune/annotations_final.csv file

    The file is tab-separated, every token is wrapped in double quotes, the
    first line is a header, and every line has 190 columns: ``clip_id``,
    188 binary attributes and ``mp3_path``.

    :param path: an openable string to locate the file tagatune/annotations_final.csv

    :param n_clips: number of data rows the file is required to contain.  The
        default (25863) is the row count this reader has always required.  Pass
        another value to read a subset of the file, or None to skip the check.

    :returns: 4-tuple (list of clip_ids, list of attribute lists, list of mp3 paths, list of
        attribute names).  clip_ids are ints, each attribute list is an int8 numpy
        array of 0/1 values, and attribute names exclude 'clip_id' and 'mp3_path'.

    :raises AssertionError: if the header, a data row, or the row count does not
        match the expected layout.

    """
    # Local import: only needed here, to decode the quoted header tokens.
    import ast

    n_columns = 190

    clip_ids = []
    attributes = []
    mp3_paths = []
    column_names = None

    # `with` guarantees the handle is closed even when an assertion fires.
    with open(path) as f:
        for line_idx, line in enumerate(f):
            # Strip only the line terminator (CRLF or LF) rather than blindly
            # chopping two characters, which corrupts LF-terminated input.
            tokens = line.rstrip('\r\n').split('\t')

            if line_idx == 0:
                # this is the header line, it contains all the column names.
                # literal_eval decodes the quoted string literals without
                # executing arbitrary code from the file (the original used eval).
                column_names = [ast.literal_eval(tok) for tok in tokens]
                assert len(column_names) == n_columns
                assert column_names[0] == 'clip_id'
                assert column_names[-1] == 'mp3_path'
                continue

            # strip the leading and trailing '"' symbol from each token
            column_values = [tok[1:-1] for tok in tokens]
            assert len(column_values) == n_columns

            clip_ids.append(int(column_values[0]))

            # sanity check that the last column was parsed intact
            assert column_values[-1].endswith('.mp3')
            mp3_paths.append(column_values[-1])

            attributes_this_line = column_values[1:-1]
            # assert that the data is binary; compare against a tuple because a
            # substring test (c in '01') would also accept '' and '01'
            assert all(c in ('0', '1') for c in attributes_this_line)
            attributes.append(numpy.asarray([int(c) for c in attributes_this_line],
                                            dtype='int8'))

    # an empty file has no header at all
    assert column_names is not None

    # assert that we read all the lines of the file
    if n_clips is not None:
        assert len(clip_ids) == n_clips
        assert len(attributes) == n_clips
        assert len(mp3_paths) == n_clips

    attribute_names = column_names[1:-1]  # all but clip_id and mp3_path
    return clip_ids, attributes, mp3_paths, attribute_names
58 | |
690
7d8bb6d087bc
additions to datasets/tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
683
diff
changeset
|
def cached_read_annotations_final(path):
    """Memoized front-end to read_annotations_final.

    Parsed results are kept in a dict stored on this function as the
    attribute ``rval`` and keyed by `path`, so each distinct path is parsed
    at most once per process.

    :param path: forwarded unchanged to read_annotations_final on a cache miss

    :returns: whatever read_annotations_final returned for `path`
    """
    this = cached_read_annotations_final
    try:
        memo = this.rval
    except AttributeError:
        # first call ever: create the per-path cache
        memo = this.rval = {}

    if path not in memo:
        memo[path] = read_annotations_final(path)
    return memo[path]
7d8bb6d087bc
additions to datasets/tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
683
diff
changeset
|
65 |
def test_read_annotations_final():
    """Parse the annotations file located under data_root() and return the result.

    The parse itself is the test: read_annotations_final asserts the file
    layout as it reads.
    """
    annotations_csv = data_root() + '/tagatune/annotations_final.csv'
    return read_annotations_final(annotations_csv)
683 | 68 |
690
7d8bb6d087bc
additions to datasets/tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
683
diff
changeset
|
class TagatuneExample(theano.Op):
    """
    input - index into tagatune database (not clip_id)
    output - clip_id, attributes, path to clip's mp3 file

    The annotation table is loaded once in the constructor (through
    cached_read_annotations_final) and kept on the instance as parallel
    lists: clip_ids, attributes, mp3_paths, plus attribute_names.
    """
    def __init__(self, music_dbs='/data/gamme/data/music_dbs'):
        """
        :param music_dbs: root directory under which the 'tagatune' folder
            (annotations_final.csv and clips/mp3/) is looked up
        """
        self.music_dbs = music_dbs
        annotations_path = music_dbs + '/tagatune/annotations_final.csv'
        (self.clip_ids,
         self.attributes,
         self.mp3_paths,
         self.attribute_names) = cached_read_annotations_final(annotations_path)

    @property
    def n_examples(self):
        """Number of clips described by the loaded annotation table."""
        return len(self.clip_ids)

    def make_node(self, idx):
        """Build the Apply node: one scalar index in, three outputs out.

        Outputs are, in order, an lscalar 'clip_id', a bvector
        'clip_attributes' and a generic 'clip_path'.
        """
        _idx = theano.tensor.as_tensor_variable(idx, ndim=0)
        return theano.Apply(self,
                [_idx],
                [theano.tensor.lscalar('clip_id'),
                    theano.tensor.bvector('clip_attributes'),
                    theano.generic('clip_path')])

    def perform(self, node, inputs, out_storage):
        """Look up row `idx` and store clip id, attributes and mp3 path.

        `inputs` holds exactly one element, the index.  It is unpacked here
        instead of in the signature because tuple parameters are a syntax
        error on Python 3 (PEP 3113); positional callers are unaffected.
        """
        idx, = inputs
        out_storage[0][0] = self.clip_ids[idx]
        out_storage[1][0] = self.attributes[idx]
        out_storage[2][0] = self.music_dbs + '/tagatune/clips/mp3/' + self.mp3_paths[idx]

    def grad(self, inputs, output):
        """Return None for every input (no gradient is provided for the index)."""
        return [None for i in inputs]
7d8bb6d087bc
additions to datasets/tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
683
diff
changeset
|
96 |
7d8bb6d087bc
additions to datasets/tagatune
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
683
diff
changeset
|
97 #tagatune_example = TagatuneExample() #requires reading a big data file |