changeset 920:a5c33f01c9a4

removed bad tracks from MajorMiner dataset
author James Bergstra <bergstrj@iro.umontreal.ca>
date Sat, 20 Mar 2010 15:19:43 -0400
parents 3901d06e2d96
children 3476044d7326
files pylearn/datasets/majorminer.py
diffstat 1 files changed, 81 insertions(+), 2 deletions(-) [+]
--- a/pylearn/datasets/majorminer.py	Sat Mar 20 15:18:54 2010 -0400
+++ b/pylearn/datasets/majorminer.py	Sat Mar 20 15:19:43 2010 -0400
@@ -2,7 +2,7 @@
 Load the MajorMiner dataset
 """
 
-import logging, os
+import logging, os, sys
 from .config import data_root
 _logger = logging.getLogger('pylearn.datasets.majorminer')
 
@@ -37,6 +37,82 @@
 
     return tag_count_track
 
+try:
+    import mad   # optional dependency (pymad), used to measure track length
+except ImportError:
+    pass
+
+def remove_bad_tracks(three_col, min_seconds=8):
+    """Heuristically filter the three_col (tag, count, track) list, dropping
+    tracks that are silent, missing on disk, or shorter than min_seconds."""
+    bad_tracks = set()
+    all_tracks = set()
+
+    silent_tracks = []
+    missing_in_action = []
+    too_short = []
+
+    try:
+        _file = mad.MadFile   # use pymad to open tracks and measure length
+        test_len = True
+    except NameError:         # mad was not imported above
+        _file = file          # fall back to plain open: existence check only
+        test_len = False
+
+
+    for tag, count, track in three_col:
+        if track in all_tracks:
+            continue
+        all_tracks.add(track)
+        if tag in ('silence', 'end', 'nothing'):
+            bad_tracks.add(track)
+            silent_tracks.append(track)
+            _logger.debug("silent file: %s" % track)
+            continue
+
+        try:
+            t = _file(track)
+        except IOError:
+            bad_tracks.add(track)
+            missing_in_action.append(track)
+            _logger.debug("missing file: %s"% track)
+            # it is normal to have 2
+            #if len(missing_in_action) > 5:
+                #raise Exception('Too many missing files:', missing_in_action)
+            continue
+
+        if test_len and t.total_time() < min_seconds*1000:
+            # MadFile.total_time() is in milliseconds: the track is too short
+            bad_tracks.add(track)
+            _logger.debug("short file: %f %s" % (t.total_time(), track))
+            too_short.append((track, t.total_time()))
+            # a handful (maybe ten) of short files is normal
+            #if len(too_short) > 40:
+                #raise Exception('Too many short files:', too_short)
+            continue
+
+    if silent_tracks:
+        _logger.warning("removed %i silent files"% len(silent_tracks))
+
+    if missing_in_action:
+        _logger.warning("missing %i files"% len(missing_in_action))
+
+    if too_short:
+        _logger.warning("discarded %i files less than %f seconds long"%(
+            len(too_short), min_seconds))
+
+    _logger.info("kept %i of %i tracks" % (len(all_tracks) - len(bad_tracks),
+        len(all_tracks)))
+
+    # return a cleaned three_column list
+    rval = []
+    for tag, count, track in three_col:
+        if track not in bad_tracks:
+            rval.append((tag, count, track))
+    return rval
+
+
+
 def list_tracks(three_col):
     tracks = list(set(tup[2] for tup in three_col))
     tracks.sort()
@@ -64,8 +140,11 @@
 
 
 class Meta(object):
-    def __init__(self, tagfile=None, trackroot=None, expected_tagfile_len=51556):
+    def __init__(self, tagfile=None, trackroot=None, expected_tagfile_len=51556,
+            filter_broken=True):
         self.three_column = three_column(tagfile, trackroot, expected_tagfile_len)
+        if filter_broken:
+            self.three_column = remove_bad_tracks(self.three_column)
         self.tracks = list_tracks(self.three_column)
         self.tags = list_tags(self.three_column)
         self.track_tags = track_tags(self.three_column, self.tracks, self.tags)
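
For reference, a minimal usage sketch of the new filtering behaviour (assuming
pylearn's data_root points at an installed MajorMiner dataset, so that Meta()
with no arguments can find the tag file and tracks):

    from pylearn.datasets.majorminer import Meta, remove_bad_tracks

    meta = Meta()                     # filter_broken=True by default: silent,
                                      # missing, and too-short tracks are dropped
    raw = Meta(filter_broken=False)   # keep every track listed in the tag file

    # The filter can also be applied to (tag, count, track) rows directly:
    cleaned = remove_bad_tracks(raw.three_column, min_seconds=8)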